CSV,全称Comma-Separated Values。CSV文件是每一行都是以逗号分隔的纯文本文件。
既然是纯文本,那么Linux下所有文本处理工具都可以大展拳脚,比如sed、grep等,但其实还有一些更为傻瓜、更为高效的工具,比如csvkit工具包。
csvkit是用Python写成的一个专门的CSV处理程序,既提供了命令行工具集,也能当做Python Library使用。安装方式很简单,直接pip install csvkit
就可以了。
usage: sql2csv [-h] [-v] [-l] [--db CONNECTION_STRING] [-q QUERY] [-H] [FILE]
[root@CentOS ~]# sql2csv --db "mysql://root:ligewei@127.0.0.1:3306/hlw" --query "select ftime, count(*) from z_reg_tmp group by ftime" > sixin_count.csv
[root@CentOS ~]# in2csv data.xls > data.csv
[root@CentOS ~]# in2csv data.json > data.csv
[root@CentOS ~]# csvcut -n sixin_count.csv
1: ftime
2: count(*)
[root@CentOS ~]# head -n 3 sixin_count.csv
ftime,count(*)
20160415,390
20160416,456
[root@CentOS ~]# csvcut -c ftime ./sixin_count.csv > ftime.csv
[root@CentOS ~]# head -n 3 ftime.csv
ftime
20160415
20160416
[root@CentOS ~]# head -n 3 sixin_count.csv
ftime,count(*)
20160415,390
20160416,456
[root@CentOS ~]# csvcut -c "count(*),ftime" ./sixin_count.csv | head -n 3
count(*),ftime
390,20160415
456,20160416
-c
指定列,如 -c 1
指定第一列 -r
支持正则表达式[root@CentOS ~]# csvgrep -c 1 -r "2016060*" ./sixin_count.csv
ftime,count(*)
20160601,336
20160602,305
20160603,69
20160604,51
20160605,53
-c
指定列,如 -c 1
按照第一列来排序[root@CentOS ~]# csvsort -c 1 -r ./sixin_count.csv | head -n 3
ftime,count(*)
20160605,53
20160604,51
-g
可以指定分组[root@CentOS ~]# cat 1.csv
ftime,count(*)
20160415,390
20160416,456
[root@CentOS ~]# cat 2.csv
ftime,count(*)
20160603,69
20160604,51
[root@CentOS ~]# csvstack -g head,tail 1.csv 2.csv
group,ftime,count(*)
head,20160415,390
head,20160416,456
tail,20160603,69
tail,20160604,51
-c
指定join on的列名[root@CentOS ~]# cat 1.csv
ftime,count
20160415,390
20160416,456
[root@CentOS ~]# cat 2.csv
ftime,sum
20160415,1000
20160416,2000
20160417,3000
20160418,4000
[root@CentOS ~]# csvjoin -c "ftime" 1.csv 2.csv
ftime,count,ftime,sum
20160415,390,20160415,1000
20160416,456,20160416,2000
[root@CentOS ~]# csvlook ./sixin_count.csv
|-----------+-----------|
| ftime | count(*) |
|-----------+-----------|
| 20160415 | 390 |
| 20160416 | 456 |
| 20160417 | 473 |
[root@CentOS ~]# csvstat ./sixin_count.csv
1. ftime
<type 'int'>
Nulls: False
Min: 20160415
Max: 20160605
Sum: 1048345771
Mean: 20160495.5962
Median: 20160510.5
Standard Deviation: 55.2794507707
Unique values: 52
2. count(*)
<type 'int'>
Nulls: False
Min: 51
Max: 1260
Sum: 29206
Mean: 561.653846154
Median: 517.0
Standard Deviation: 236.189044608
Unique values: 50
5 most frequent values:
454: 2
509: 2
450: 1
451: 1
665: 1
Row count: 52
--insert
,只创建table[root@CentOS ~]# mysqlshow -h 127.0.0.1 -P 3306 hlw -p
Enter password:
Database: hlw
+--------------+
| Tables |
+--------------+
| t_dw_dc00294 |
| uid_mobile |
| z_reg_tmp |
+--------------+
[root@CentOS ~]# csvsql --db "mysql://root:ligewei@127.0.0.1:3306/hlw" --insert ./sixin_count.csv
[root@CentOS ~]# mysqlshow -h 127.0.0.1 -P 3306 hlw -p
Enter password:
Database: hlw
+--------------+
| Tables |
+--------------+
| sixin_count |
| t_dw_dc00294 |
| uid_mobile |
| z_reg_tmp |
+--------------+
[root@CentOS ~]# csvsql --query "select * from sixin_count where ftime > 20160600;" sixin_count.csv | csvlook
|-----------+-----------|
| ftime | count(*) |
|-----------+-----------|
| 20160601 | 336 |
| 20160602 | 305 |
| 20160603 | 69 |
| 20160604 | 51 |
| 20160605 | 53 |
|-----------+-----------|
csvkit提供了一整套成熟的处理CSV工具集,包括格式转换、搜索、合并等,而且使用起来简单方便,是一套很精致的工具。
参考资料:
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。