一、问题解析
原文链接:
The One Billion Row Challenge - Gunnar Morling: https://www.morling.dev/blog/one-billion-row-challenge/
一个文件中有10亿行数据,文件大小约为13GB。
文件内容如下:
Hamburg;12.0
Bulawayo;8.9
Palembang;38.8
St. John's;15.2
Cracow;12.6
分号(;)前面是城市名,后面是该城市的测量值,取值范围是 -99.9 到 99.9,保留一位小数。
要求输出每个城市的名称及其测量值的最小值、平均值、最大值(顺序为 最小值/平均值/最大值)
类似这样
Abha=5.0/18.0/27.4
Abidjan=15.7/26.0/34.1
二、数据准备
- 在机器上安装 JDK 21
- 运行 `./create_measurements.sh 1000000000` 生成测试数据
三、代码实现
1. 文件读取
使用两个协程读取文件:先找到文件中点之后第一个行首的偏移量,然后一个协程读取从文件开头到该偏移量的部分,另一个协程读取从该偏移量到文件末尾的部分。
// readFile streams up to size bytes of "measurements.txt", starting at byte
// offset, into bytesChan as line-aligned chunks.
//
// The file is read chunkSize bytes at a time. Every chunk sent on bytesChan
// is a freshly allocated slice that ends with '\n'; bytes after the last
// newline of a read are carried over and prepended to the next chunk. If the
// region (or the file) ends with a line that has no trailing newline, that
// remainder is sent as one final chunk. Empty chunks are never sent, and the
// channel is not closed here.
//
// NOTE(review): presumably the caller picks offset and size so that the
// region starts and ends on line boundaries — confirm against the caller.
//
// Panics if chunkSize is not positive or if opening, seeking or reading the
// file fails. The accumulated time spent in Read is printed (in
// milliseconds) when the function returns.
func readFile(offset, size, chunkSize int64, bytesChan chan []byte) {
	var readSpend time.Duration
	defer func() {
		fmt.Println("read file spend time" + strconv.FormatInt(readSpend.Milliseconds(), 10))
	}()

	if chunkSize <= 0 {
		// A zero-length buffer would make Read return (0, nil) forever.
		panic("readFile: chunkSize must be positive")
	}

	file, err := os.Open("measurements.txt")
	if err != nil {
		panic(err)
	}
	// The file is opened read-only, so a Close error carries no data-loss risk.
	defer file.Close()

	if _, err = file.Seek(offset, io.SeekStart); err != nil {
		panic(err)
	}

	buf := make([]byte, chunkSize)
	var leftover []byte // bytes after the last newline seen so far
	var readTotal int64

	for readTotal < size {
		readStart := time.Now()
		n, err := file.Read(buf)
		readSpend += time.Since(readStart)

		if n > 0 {
			// Slice a view instead of shrinking buf itself, so the next
			// Read still gets the full chunkSize buffer.
			data := buf[:n]
			readTotal += int64(n)
			if readTotal > size {
				// Drop the bytes that lie beyond the requested region.
				data = data[:int64(n)-(readTotal-size)]
			}

			if nl := bytes.LastIndexByte(data, '\n'); nl >= 0 {
				chunk := make([]byte, 0, len(leftover)+nl+1)
				chunk = append(chunk, leftover...)
				chunk = append(chunk, data[:nl+1]...)
				bytesChan <- chunk
				leftover = append(leftover[:0], data[nl+1:]...)
			} else {
				// No complete line yet: keep accumulating rather than
				// emitting a partial line.
				leftover = append(leftover, data...)
			}
		}

		if err != nil {
			if errors.Is(err, io.EOF) {
				break
			}
			panic(err)
		}
	}

	// Flush a final line that has no trailing newline.
	if len(leftover) > 0 {
		bytesChan <- leftover
	}
}
2. 数据处理
使用 unsafe 实现 []byte 到 string 的转换,避免为城市名额外分配内存。
将测量值乘以 10 后以 int64 存储(例如 12.3 存为 123),避免浮点数解析和运算,加快执行速度。
// process parses one chunk of "city;value" lines and sends the per-city
// aggregates of that chunk to dataChan as a single map.
//
// Values are accumulated as fixed-point tenths in an int64 ("12.3" becomes
// 123, "-0.5" becomes -5); the parser assumes at most one digit after the
// decimal point. For every city the map entry tracks the minimum, maximum,
// sum and number of values seen in this chunk.
//
// Lines are terminated by '\n'; the last line of the chunk may omit the
// terminator. Lines with no ';' or with an empty city name are skipped.
//
// NOTE(review): city keys are built by unsafeString from a pointer into
// readBytes — presumably without copying, in which case readBytes must not
// be modified while the returned map is in use; confirm against the helper.
func process(readBytes []byte, dataChan chan map[string]*measurement) {
	m := make(map[string]*measurement)
	start := 0
	last := len(readBytes) - 1
	var city string

	for idx, v := range readBytes {
		if v == ';' {
			city = unsafeString(unsafe.Pointer(&readBytes[start]), start, idx)
			start = idx + 1
		}
		if v != '\n' && idx != last {
			continue
		}

		// End of a line. When the chunk ends without a newline, the byte at
		// idx is the last character of the value and must be parsed too.
		end := idx
		if v != '\n' {
			end = idx + 1
		}

		if city != "" {
			var measure int64
			negative, fraction := false, false
			for _, c := range readBytes[start:end] {
				switch {
				case c == '-':
					negative = true
				case c == '.':
					fraction = true
				case fraction:
					// The single fractional digit is the tenths place.
					measure += int64(c - '0')
				default:
					// Integer digits are scaled by 10 to leave room for the tenths.
					measure = measure*10 + int64(c-'0')*10
				}
			}
			if negative {
				measure = -measure
			}

			if exist, ok := m[city]; !ok {
				m[city] = &measurement{
					min: measure,
					max: measure,
					sum: measure,
					cnt: 1,
				}
			} else {
				if measure < exist.min {
					exist.min = measure
				} else if measure > exist.max {
					exist.max = measure
				}
				exist.sum += measure
				exist.cnt++
			}
			city = ""
		}
		start = idx + 1
	}
	dataChan <- m
}