在翻阅calamine作者的quick-csv存储库时,无意中看到一个10年前的csv读取比赛(csv-game),于是把参赛选手的源程序下载下来测试,看看到底有多快。
bash
git clone https://bitbucket.org/ewanhiggs/csv-game.git
这些源程序只包含比赛程序本身,依赖的文件有的在项目主页上,有的在makefile中写出。冠军是一个叫csvmonkey的程序,如下所示:
cpp
#include <iostream>
#include <iterator>
#include "csvmonkey.hpp"
using namespace csvmonkey;
int main(int argc, char* argv[])
{
if(argc < 2) {
return 1;
}
MappedFileCursor cursor;
if(! cursor.open(argv[1])) {
// new behaviour for csvmonkey returns 0 for empty files.
std::cout << 0 << std::endl;
return 0;
}
CsvReader reader(cursor);
CsvCursor &row = reader.row();
int sum = 0;
while(reader.read_row()) {
sum += row.count;
}
std::cout << sum << std::endl;
}
它的主要实现都在头文件csvmonkey.hpp里,找来编译执行了,结果不太快:500万行的文件读取正确,用时23秒;100万行41列的文件读取失败(输出了一个错误的计数,随后发生段错误)。没有文档,不知怎么改。
bash
wget https://raw.githubusercontent.com/dw/csvmonkey/csv-game/csvmonkey.hpp
g++ csv.cc -w -pedantic -std=c++11 -g -O3 -fpic -msse4.2 -march=native -DNDEBUG -o csvmonkey -I .
root@6ae32a5ffcde:/par# time ./csvmonkey "5m Sales Records.csv"
70000000
real 0m23.437s
user 0m0.388s
sys 0m1.325s
root@6ae32a5ffcde:/par# time ./csvmonkey "NYC_311_SR_2010-2020-sample-1M.csv"
1060693607
Segmentation fault (core dumped)
real 0m19.954s
user 0m0.194s
sys 0m1.147s
再拿基于libcsv的程序来测,它在列表中排第8名:
c
#include <csv.h>
#include <stdio.h>
#include <string.h>
/*
 * Field callback: adds one to the int counter that `data` points to.
 *
 * str / str_len  the field contents and length; not used here.
 * data           must point to an int tally owned by the caller.
 */
void field_count(void* str, size_t str_len, void* data) {
    (void)str;      /* field contents are irrelevant for counting */
    (void)str_len;
    ++*(int*)data;
}
const int READ_SZ = 1024 * 1024;
int main (int argc, char* argv[]) {
struct csv_parser parser = {0};
csv_init(&parser, CSV_APPEND_NULL);
FILE* f = fopen(argv[1], "r");
char *buf = (char*)malloc(READ_SZ);
size_t buflen = READ_SZ;
int count = 0;
while((buflen = fread(buf, 1, READ_SZ, f)) > 0){
csv_parse(&parser, buf, buflen, field_count, 0, &count);
}
printf("%d\n", count);
fclose(f);
free(buf);
csv_free(&parser);
return EXIT_SUCCESS;
}
效果好了很多,用时都在3秒左右,但读出的列数有点差距,应该是标题行影响,但第一个难以理解,14列怎么读成13列了(一种可能:该文件最后一行没有以换行符结尾,而libcsv要等到调用csv_fini()时才会报告末尾尚未终止的那个字段)。
bash
gcc csv.c libcsv.c -Werror -Wall -pedantic -std=c11 -g -O3 -fpic -march=native -o libcsv -I .
root@6ae32a5ffcde:/par# time ./libcsv "5m Sales Records.csv"
70000013
real 0m3.608s
user 0m1.301s
sys 0m0.100s
root@6ae32a5ffcde:/par# time ./libcsv "NYC_311_SR_2010-2020-sample-1M.csv"
41000041
real 0m3.078s
user 0m1.083s
sys 0m0.109s
再拿我常用的DuckDB来读,效率高了很多。读取并统计行数都在2秒左右,写入内存表也不过多用了约0.3秒。对读入的数据用summarize做汇总,内容也都正确。
sql
D .timer on
D SET enable_progress_bar =false;
D select count(*) from '5m Sales Records.csv';
┌────────────────┐
│ count_star() │
│ int64 │
├────────────────┤
│ 5000000 │
│ (5.00 million) │
└────────────────┘
Run Time (s): real 2.329 user 0.998917 sys 0.234929
D select count(*) from 'NYC_311_SR_2010-2020-sample-1M.csv';
┌────────────────┐
│ count_star() │
│ int64 │
├────────────────┤
│ 1000000 │
│ (1.00 million) │
└────────────────┘
Run Time (s): real 1.981 user 0.649826 sys 0.196659
D create table t as from '5m Sales Records.csv';
Run Time (s): real 2.585 user 3.853148 sys 0.752251
D summarize t;
┌────────────────┬─────────────┬─────────────┬────────────────────┬───┬────────────────────┬─────────┬─────────────────┐
│ column_name │ column_type │ min │ max │ ... │ q75 │ count │ null_percentage │
│ varchar │ varchar │ varchar │ varchar │ │ varchar │ int64 │ decimal(9,2) │
├────────────────┼─────────────┼─────────────┼────────────────────┼───┼────────────────────┼─────────┼─────────────────┤
│ Region │ VARCHAR │ Asia │ Sub-Saharan Africa │ ... │ NULL │ 5000000 │ 0.00 │
│ Country │ VARCHAR │ Afghanistan │ Zimbabwe │ ... │ NULL │ 5000000 │ 0.00 │
│ Item Type │ VARCHAR │ Baby Food │ Vegetables │ ... │ NULL │ 5000000 │ 0.00 │
│ Sales Channel │ VARCHAR │ Offline │ Online │ ... │ NULL │ 5000000 │ 0.00 │
│ Order Priority │ VARCHAR │ C │ M │ ... │ NULL │ 5000000 │ 0.00 │
│ Order Date │ DATE │ 2010-01-01 │ 2020-09-10 │ ... │ 2018-01-08 │ 5000000 │ 0.00 │
│ Order ID │ BIGINT │ 100000321 │ 999999892 │ ... │ 775399388 │ 5000000 │ 0.00 │
│ Ship Date │ DATE │ 2010-01-01 │ 2020-10-30 │ ... │ 2018-02-01 │ 5000000 │ 0.00 │
│ Units Sold │ BIGINT │ 1 │ 10000 │ ... │ 7498 │ 5000000 │ 0.00 │
│ Unit Price │ DOUBLE │ 9.33 │ 668.27 │ ... │ 428.7937475664888 │ 5000000 │ 0.00 │
│ Unit Cost │ DOUBLE │ 6.92 │ 524.96 │ ... │ 307.03048573417 │ 5000000 │ 0.00 │
│ Total Revenue │ DOUBLE │ 9.33 │ 6682700.0 │ ... │ 1823813.9831833337 │ 5000000 │ 0.00 │
│ Total Cost │ DOUBLE │ 6.92 │ 5249600.0 │ ... │ 1203364.1504520269 │ 5000000 │ 0.00 │
│ Total Profit │ DOUBLE │ 2.41 │ 1738700.0 │ ... │ 566819.0278750015 │ 5000000 │ 0.00 │
├────────────────┴─────────────┴─────────────┴────────────────────┴───┴────────────────────┴─────────┴─────────────────┤
│ 14 rows 12 columns (7 shown) │
└──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
Run Time (s): real 0.792 user 10.106701 sys 0.037936
D create table t2 as from 'NYC_311_SR_2010-2020-sample-1M.csv';
Run Time (s): real 2.285 user 2.901837 sys 0.686295
D summarize t2;
┌──────────────────────┬─────────────┬──────────────────────┬───┬──────────────────────┬─────────┬─────────────────┐
│ column_name │ column_type │ min │ ... │ q75 │ count │ null_percentage │
│ varchar │ varchar │ varchar │ │ varchar │ int64 │ decimal(9,2) │
├──────────────────────┼─────────────┼──────────────────────┼───┼──────────────────────┼─────────┼─────────────────┤
│ Unique Key │ BIGINT │ 11465364 │ ... │ 40208216 │ 1000000 │ 0.00 │
│ Created Date │ TIMESTAMP │ 2010-01-01 00:00:00 │ ... │ 2018-07-30 12:48:4... │ 1000000 │ 0.00 │
│ Closed Date │ TIMESTAMP │ 1900-01-01 00:00:00 │ ... │ 2018-08-04 11:42:5... │ 1000000 │ 2.86 │
│ Agency │ VARCHAR │ 3-1-1 │ ... │ NULL │ 1000000 │ 0.00 │
│ Agency Name │ VARCHAR │ 3-1-1 │ ... │ NULL │ 1000000 │ 0.00 │
│ Complaint Type │ VARCHAR │ ../../WEB-INF/web.... │ ... │ NULL │ 1000000 │ 0.00 │
│ Descriptor │ VARCHAR │ 1 Missed Collection │ ... │ NULL │ 1000000 │ 0.30 │
│ Location Type │ VARCHAR │ 1-, 2- and 3- Fami... │ ... │ NULL │ 1000000 │ 23.91 │
│ Incident Zip │ VARCHAR │ * │ ... │ NULL │ 1000000 │ 5.50 │
│ Incident Address │ VARCHAR │ * * │ ... │ NULL │ 1000000 │ 17.47 │
│ Street Name │ VARCHAR │ * │ ... │ NULL │ 1000000 │ 17.47 │
│ Cross Street 1 │ VARCHAR │ 1 AVE │ ... │ NULL │ 1000000 │ 32.04 │
│ Cross Street 2 │ VARCHAR │ 1 AVE │ ... │ NULL │ 1000000 │ 32.36 │
│ Intersection Stree... │ VARCHAR │ 1 AVE │ ... │ NULL │ 1000000 │ 76.74 │
│ Intersection Stree... │ VARCHAR │ 1 AVE │ ... │ NULL │ 1000000 │ 76.77 │
│ Address Type │ VARCHAR │ ADDRESS │ ... │ NULL │ 1000000 │ 12.58 │
│ City │ VARCHAR │ * │ ... │ NULL │ 1000000 │ 6.20 │
│ Landmark │ VARCHAR │ 1 AVENUE │ ... │ NULL │ 1000000 │ 91.28 │
│ Facility Type │ VARCHAR │ DSNY Garage │ ... │ NULL │ 1000000 │ 14.55 │
│ Status │ VARCHAR │ Assigned │ ... │ NULL │ 1000000 │ 0.00 │
│ Due Date │ TIMESTAMP │ 1900-01-02 00:00:00 │ ... │ 2017-09-22 19:44:2... │ 1000000 │ 64.78 │
│ Resolution Descrip... │ VARCHAR │ A DOB violation wa... │ ... │ NULL │ 1000000 │ 2.05 │
│ Resolution Action ... │ TIMESTAMP │ 2009-12-31 01:35:00 │ ... │ 2018-08-10 13:25:1... │ 1000000 │ 1.51 │
│ Community Board │ VARCHAR │ 0 Unspecified │ ... │ NULL │ 1000000 │ 0.00 │
│ BBL │ VARCHAR │ 0000000000 │ ... │ NULL │ 1000000 │ 24.30 │
│ Borough │ VARCHAR │ BRONX │ ... │ NULL │ 1000000 │ 0.00 │
│ X Coordinate (Stat... │ BIGINT │ 913281 │ ... │ 1018255 │ 1000000 │ 8.53 │
│ Y Coordinate (Stat... │ BIGINT │ 121152 │ ... │ 234907 │ 1000000 │ 8.53 │
│ Open Data Channel ... │ VARCHAR │ MOBILE │ ... │ NULL │ 1000000 │ 0.00 │
│ Park Facility Name │ VARCHAR │ "Uncle" Vito F. Ma... │ ... │ NULL │ 1000000 │ 0.00 │
│ Park Borough │ VARCHAR │ BRONX │ ... │ NULL │ 1000000 │ 0.00 │
│ Vehicle Type │ VARCHAR │ Ambulette / Paratr... │ ... │ NULL │ 1000000 │ 99.97 │
│ Taxi Company Borough │ VARCHAR │ BRONX │ ... │ NULL │ 1000000 │ 99.92 │
│ Taxi Pick Up Locat... │ VARCHAR │ 1 5 AVENUE MANHATTAN │ ... │ NULL │ 1000000 │ 99.21 │
│ Bridge Highway Name │ VARCHAR │ 145th St. Br - Len... │ ... │ NULL │ 1000000 │ 99.77 │
│ Bridge Highway Dir... │ VARCHAR │ Bronx Bound │ ... │ NULL │ 1000000 │ 99.77 │
│ Road Ramp │ VARCHAR │ N/A │ ... │ NULL │ 1000000 │ 99.77 │
│ Bridge Highway Seg... │ VARCHAR │ 1-1-1265963747 │ ... │ NULL │ 1000000 │ 99.76 │
│ Latitude │ DOUBLE │ 40.1123853 │ ... │ 40.80288457482914 │ 1000000 │ 25.47 │
│ Longitude │ DOUBLE │ -77.5195844 │ ... │ -73.87602468658496 │ 1000000 │ 25.47 │
│ Location │ VARCHAR │ (40.1123853, -77.5... │ ... │ NULL │ 1000000 │ 25.47 │
├──────────────────────┴─────────────┴──────────────────────┴───┴──────────────────────┴─────────┴─────────────────┤
│ 41 rows 12 columns (6 shown) │
└──────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
Run Time (s): real 0.332 user 2.741898 sys 0.049018