go
复制代码
package main
import (
"bufio"
"bytes"
"fmt"
"golang.org/x/net/html/charset"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"
"io"
"os"
"strings"
)
// detectBOM inspects the leading bytes of b for a Unicode byte order mark.
//
// It returns the name of the encoding the BOM announces ("UTF-8 BOM",
// "UTF-16LE", "UTF-16BE", "UTF-32LE" or "UTF-32BE") together with the number
// of bytes the BOM occupies. When b does not start with a known BOM it
// returns "" and 0.
func detectBOM(b []byte) (enc string, bomLen int) {
	switch {
	case bytes.HasPrefix(b, []byte{0xEF, 0xBB, 0xBF}):
		return "UTF-8 BOM", 3

	// The UTF-32 marks must be tested before the UTF-16 ones: the UTF-32LE
	// BOM (FF FE 00 00) begins with the UTF-16LE BOM (FF FE), so checking
	// UTF-16 first would make the UTF-32LE case unreachable. The sequence is
	// inherently ambiguous (it could be UTF-16LE text starting with U+0000);
	// it is reported here as UTF-32LE.
	case bytes.HasPrefix(b, []byte{0xFF, 0xFE, 0x00, 0x00}):
		return "UTF-32LE", 4
	case bytes.HasPrefix(b, []byte{0x00, 0x00, 0xFE, 0xFF}):
		return "UTF-32BE", 4

	case bytes.HasPrefix(b, []byte{0xFF, 0xFE}):
		return "UTF-16LE", 2
	case bytes.HasPrefix(b, []byte{0xFE, 0xFF}):
		return "UTF-16BE", 2
	}
	return "", 0
}
// ReadTextFile reads the file at filename and returns its lines, decoded to
// UTF-8 and without line terminators.
//
// The decoder is selected as follows:
//   - If the file starts with a BOM recognized by detectBOM, the BOM bytes
//     are dropped and the matching decoder is used. Only the UTF-8 and UTF-16
//     BOMs are supported; any other reported encoding produces an
//     "unsupported encoding" error.
//   - Otherwise up to the first 1024 bytes are handed to
//     charset.DetermineEncoding and the encoding it reports is used, with
//     UTF-8 as the fallback should it report none.
//
// Errors from opening, reading or scanning the file are returned unchanged,
// so callers can keep using helpers such as os.IsNotExist on them.
func ReadTextFile(filename string) ([]string, error) {
	// Upper bound for a single line. bufio.Scanner defaults to 64 KiB and
	// fails with bufio.ErrTooLong beyond that, which is easy to hit with
	// exported data files.
	const maxLineBytes = 16 << 20

	file, err := os.Open(filename)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	// Read up to 4 bytes for BOM detection. A single Read call is allowed to
	// return fewer bytes than requested, which could hide a BOM, so
	// io.ReadFull is used; it reports io.EOF / io.ErrUnexpectedEOF when the
	// file is shorter than the buffer, which is not an error here.
	bomBuf := make([]byte, 4)
	n, err := io.ReadFull(file, bomBuf)
	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
		return nil, err
	}
	bomBuf = bomBuf[:n]

	enc, bomLen := detectBOM(bomBuf)

	var source io.Reader
	var decoder transform.Transformer
	if enc != "" {
		// BOM present: pick the decoder it announces.
		switch enc {
		case "UTF-8 BOM":
			decoder = unicode.UTF8.NewDecoder()
		case "UTF-16LE":
			decoder = unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM).NewDecoder()
		case "UTF-16BE":
			decoder = unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM).NewDecoder()
		default:
			return nil, fmt.Errorf("unsupported encoding: %s", enc)
		}
		// Re-attach the bytes read past the BOM in front of the rest of the file.
		source = io.MultiReader(bytes.NewReader(bomBuf[bomLen:]), file)
	} else {
		// No BOM: put the probe bytes back and sniff the encoding from the
		// beginning of the content without consuming it.
		br := bufio.NewReader(io.MultiReader(bytes.NewReader(bomBuf), file))
		detectBuf, peekErr := br.Peek(1024)
		// Peek reports io.EOF when the file holds fewer than 1024 bytes; the
		// bytes that are available are still returned.
		if peekErr != nil && peekErr != io.EOF {
			return nil, peekErr
		}
		e, _, _ := charset.DetermineEncoding(detectBuf, "")
		if e == nil {
			e = unicode.UTF8 // default to UTF-8
		}
		decoder = e.NewDecoder()
		source = br
	}

	// Decode to UTF-8 on the fly and split into lines.
	scanner := bufio.NewScanner(transform.NewReader(source, decoder))
	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxLineBytes)

	var lines []string
	for scanner.Scan() {
		lines = append(lines, scanner.Text())
	}
	if err := scanner.Err(); err != nil {
		return nil, err
	}
	return lines, nil
}
// Filter returns a new slice in which every entry of lists has been stripped
// of surrounding whitespace and, when the trimmed entry is wrapped in a pair
// of double quotes, of those quotes and any whitespace directly inside them.
// Entries shorter than two bytes after trimming are kept as trimmed.
func Filter(lists []string) []string {
	out := make([]string, len(lists))
	for i, raw := range lists {
		field := strings.TrimSpace(raw)
		quoted := len(field) >= 2 &&
			strings.HasPrefix(field, `"`) &&
			strings.HasSuffix(field, `"`)
		if quoted {
			// Drop the enclosing quotes, then trim what was inside them.
			field = strings.TrimSpace(field[1 : len(field)-1])
		}
		out[i] = field
	}
	return out
}
// main loads ./csv/export.csv through ReadTextFile, splits every line on tab
// characters, cleans the fields with Filter and prints each original line
// followed by its cleaned, comma-joined fields. A read failure is printed and
// the program returns without processing anything.
func main() {
	const path = "./csv/export.csv"

	rows, err := ReadTextFile(path)
	if err != nil {
		fmt.Println("读取错误:" + err.Error())
		return
	}

	for i := range rows {
		fields := strings.Split(rows[i], "\t")
		cleaned := Filter(fields)
		fmt.Println(rows[i] + " ==> " + strings.Join(cleaned, ","))
	}
}