Key point: ES is not good at storing very large raw documents or huge volumes of blobs, and HDFS is not good at low-latency random small reads. In practice this is usually handled as "large HDFS files + index-based lookup" or "HDFS (Parquet/ORC) + a fetch-back service with a cache".
The design is well suited to appending and querying data, and poorly suited to deleting or modifying it. If a record has to change, write a new copy of the data and update the path stored in ES to point at the new location.
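Concretely, a "modify" is just a re-point: write the new version to HDFS first, then patch the ES document so that hdfs_path (and checksum) reference the new location. Below is a minimal sketch using the official Go client's partial-update API; the index name my_index and the field names are the ones used later in this post, and the imports and client setup are the same as in the full programs further down.

```go
// updateDocPointer re-points an existing ES document at a new HDFS object
// after the new version has already been written to HDFS.
// Uses a partial update ({"doc": {...}}) via the _update API.
func updateDocPointer(es *elasticsearch.Client, docID, newHDFSPath, newChecksum string) error {
	body, err := json.Marshal(map[string]any{
		"doc": map[string]any{
			"hdfs_path": newHDFSPath,
			"checksum":  newChecksum,
		},
	})
	if err != nil {
		return err
	}
	req := esapi.UpdateRequest{
		Index:      "my_index", // assumed index name, matches the mapping below
		DocumentID: docID,
		Body:       bytes.NewReader(body),
	}
	resp, err := req.Do(context.Background(), es)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.IsError() {
		raw, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("es update error: %s", string(raw))
	}
	return nil
}
```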
Overall architecture and data model
1) Write path (Ingest)
- Generate a global docID (UUID/Snowflake)
- Write the raw data to HDFS
  - Suggested layout: /data/app/yyyy/mm/dd/docID.json (small scale)
  - Or append to a daily rolling large file (a better fit for HDFS): /data/app/yyyy/mm/dd/part-0000, recording the offset/length
- Index a document into ES that contains only:
  - The searchable fields (title, tags, time, userId, ...)
  - The HDFS pointer: hdfs_path, plus optionally offset, length, checksum, version
2) Read path (Query)
- Search in ES (full text / filters / aggregations)
- For each hit, fetch the raw data from HDFS by hdfs_path (or path + offset)
- Return: ES highlights/summary + the raw document (or a fragment of it)
Key design choices
How to store in HDFS: small files vs. large files
- Small files (one file per record): simplest to implement, but it puts pressure on the NameNode and breaks down as volume grows
- Large files (append per batch/day, or write Parquet): the recommended option
  - You additionally have to record offset/length, or use a columnar format and fetch rows back by primary key
  - Random reads at an offset into a large HDFS file work fine from Go, but the write path must produce trackable offsets and the reader must support Seek
The code below covers two approaches:
- Approach A: one file per record (most intuitive, easiest to follow)
- Approach B: daily rolling large files (closer to production, slightly more complex)
Approach A
ES mapping (the index stores only the searchable fields + the HDFS pointer)
Example (you can create it via Kibana or a script):
```json
PUT my_index
{
"mappings": {
"properties": {
"doc_id": { "type": "keyword" },
"title": { "type": "text", "analyzer": "standard" },
"tags": { "type": "keyword" },
"ts": { "type": "date" },
"hdfs_path": { "type": "keyword" },
"offset": { "type": "long" },
"length": { "type": "integer" },
"checksum": { "type": "keyword" }
}
}
}
```
- ES: the official Go client, github.com/elastic/go-elasticsearch/v8
- HDFS: the commonly used github.com/colinmarc/hdfs/v2 (talks to the NameNode directly over RPC, no dependence on the hadoop shell)
```bash
go get github.com/elastic/go-elasticsearch/v8
go get github.com/colinmarc/hdfs/v2
```
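If you would rather create the index from Go instead of Kibana, here is a minimal, self-contained sketch using the official client. The address http://es1:9200 matches the examples below; add auth/TLS as your cluster requires.

```go
// Sketch: create my_index with the mapping above, from Go.
package main

import (
	"log"
	"strings"

	"github.com/elastic/go-elasticsearch/v8"
)

const mapping = `{
  "mappings": {
    "properties": {
      "doc_id":    { "type": "keyword" },
      "title":     { "type": "text", "analyzer": "standard" },
      "tags":      { "type": "keyword" },
      "ts":        { "type": "date" },
      "hdfs_path": { "type": "keyword" },
      "offset":    { "type": "long" },
      "length":    { "type": "integer" },
      "checksum":  { "type": "keyword" }
    }
  }
}`

func main() {
	es, err := elasticsearch.NewClient(elasticsearch.Config{
		Addresses: []string{"http://es1:9200"}, // same address as the examples below
	})
	if err != nil {
		log.Fatal(err)
	}
	resp, err := es.Indices.Create("my_index", es.Indices.Create.WithBody(strings.NewReader(mapping)))
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	if resp.IsError() {
		log.Fatalf("create index failed: %s", resp.String())
	}
	log.Println("index created")
}
```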
Write to HDFS + index into ES
```go
package main
import (
"bytes"
"context"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"fmt"
"io"
"log"
"path"
"time"
"github.com/colinmarc/hdfs/v2"
"github.com/elastic/go-elasticsearch/v8"
"github.com/elastic/go-elasticsearch/v8/esapi"
)
type RawDoc struct {
DocID string `json:"doc_id"`
Title string `json:"title"`
Tags []string `json:"tags"`
TS time.Time `json:"ts"`
Body any `json:"body"` // put your raw large JSON / original event here
}
type IndexDoc struct {
DocID string `json:"doc_id"`
Title string `json:"title"`
Tags []string `json:"tags"`
TS time.Time `json:"ts"`
HDFSPath string `json:"hdfs_path"`
Checksum string `json:"checksum"`
// Approach A does not need offset/length
}
func sha256Hex(b []byte) string {
sum := sha256.Sum256(b)
return hex.EncodeToString(sum[:])
}
func mustJSON(v any) []byte {
b, err := json.Marshal(v)
if err != nil {
panic(err)
}
return b
}
// writeToHDFS stores the raw document at /data/app/yyyy/mm/dd/<docID>.json
func writeToHDFS(client *hdfs.Client, baseDir string, doc RawDoc) (hdfsPath string, checksum string, err error) {
rawBytes := mustJSON(doc)
checksum = sha256Hex(rawBytes)
dateDir := doc.TS.Format("2006/01/02")
dir := path.Join(baseDir, dateDir)
if err = client.MkdirAll(dir, 0755); err != nil {
return "", "", fmt.Errorf("mkdir %s: %w", dir, err)
}
hdfsPath = path.Join(dir, doc.DocID+".json")
// Create overwrites; for idempotence, Stat first or write to a temp file and Rename
f, err := client.Create(hdfsPath)
if err != nil {
return "", "", fmt.Errorf("create %s: %w", hdfsPath, err)
}
defer f.Close()
if _, err = f.Write(rawBytes); err != nil {
return "", "", fmt.Errorf("write %s: %w", hdfsPath, err)
}
return hdfsPath, checksum, nil
}
func indexToES(es *elasticsearch.Client, index string, idx IndexDoc) error {
body := mustJSON(idx)
req := esapi.IndexRequest{
Index: index,
DocumentID: idx.DocID, // use doc_id as the ES _id for idempotent upserts
Body: bytes.NewReader(body),
Refresh: "false", // 批量写入建议 false 或 "wait_for"
}
resp, err := req.Do(context.Background(), es)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.IsError() {
b, _ := io.ReadAll(resp.Body)
return fmt.Errorf("es index error: %s", string(b))
}
return nil
}
func ingestOne(hdfsClient *hdfs.Client, es *elasticsearch.Client, hdfsBaseDir, esIndex string, doc RawDoc) error {
hdfsPath, checksum, err := writeToHDFS(hdfsClient, hdfsBaseDir, doc)
if err != nil {
return err
}
idx := IndexDoc{
DocID: doc.DocID,
Title: doc.Title,
Tags: doc.Tags,
TS: doc.TS,
HDFSPath: hdfsPath,
Checksum: checksum,
}
// Consistency: write HDFS first, then ES; if the ES write fails, retry or compensate
return indexToES(es, esIndex, idx)
}
func main() {
// 1) HDFS client (configure for your cluster: NameNode address, Kerberos, etc.)
hdfsClient, err := hdfs.New("namenode1:8020") // or load the config from HADOOP_CONF_DIR
if err != nil {
log.Fatal(err)
}
// 2) ES client
es, err := elasticsearch.NewClient(elasticsearch.Config{
Addresses: []string{"http://es1:9200"},
// set Username/Password/TLS as needed
})
if err != nil {
log.Fatal(err)
}
doc := RawDoc{
DocID: "doc-001",
Title: "hello es+hdfs",
Tags: []string{"demo", "hdfs"},
TS: time.Now(),
Body: map[string]any{
"message": "raw big json here",
"n": 123,
},
}
if err := ingestOne(hdfsClient, es, "/data/app", "my_index", doc); err != nil {
log.Fatal(err)
}
log.Println("ok")
}
```
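If you ingest in batches, one IndexRequest per document gets chatty. As a sketch (not part of the example above), the official client's esutil.BulkIndexer can push the same pointer documents in bulk. It reuses IndexDoc and mustJSON from the program above and additionally imports github.com/elastic/go-elasticsearch/v8/esutil.

```go
// bulkIndex is a sketch of indexing many IndexDoc pointers with the official
// bulk helper. It reuses IndexDoc and mustJSON from the program above.
func bulkIndex(es *elasticsearch.Client, index string, docs []IndexDoc) error {
	bi, err := esutil.NewBulkIndexer(esutil.BulkIndexerConfig{
		Client:     es,
		Index:      index,
		NumWorkers: 4,
	})
	if err != nil {
		return err
	}
	for _, d := range docs {
		err := bi.Add(context.Background(), esutil.BulkIndexerItem{
			Action:     "index",
			DocumentID: d.DocID, // same idempotency trick: doc_id as _id
			Body:       bytes.NewReader(mustJSON(d)),
		})
		if err != nil {
			return err
		}
	}
	if err := bi.Close(context.Background()); err != nil {
		return err
	}
	if stats := bi.Stats(); stats.NumFailed > 0 {
		return fmt.Errorf("bulk index: %d documents failed", stats.NumFailed)
	}
	return nil
}
```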
Code: query ES, then fetch the raw document back from HDFS
```go
func readFromHDFS(client *hdfs.Client, hdfsPath string) ([]byte, error) {
f, err := client.Open(hdfsPath)
if err != nil {
return nil, err
}
defer f.Close()
return io.ReadAll(f)
}
```
ES search example (simplified):
```go
type HitSource struct {
DocID string `json:"doc_id"`
Title string `json:"title"`
HDFSPath string `json:"hdfs_path"`
}
type ESSearchResp struct {
Hits struct {
Hits []struct {
Source HitSource `json:"_source"`
} `json:"hits"`
} `json:"hits"`
}
func searchAndFetch(es *elasticsearch.Client, hdfsClient *hdfs.Client, index, q string) error {
query := map[string]any{
"query": map[string]any{
"multi_match": map[string]any{
"query": q,
"fields": []string{"title^2", "tags"},
},
},
"size": 10,
}
var buf bytes.Buffer
_ = json.NewEncoder(&buf).Encode(query)
resp, err := es.Search(
es.Search.WithIndex(index),
es.Search.WithBody(&buf),
)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.IsError() {
b, _ := io.ReadAll(resp.Body)
return fmt.Errorf("es search error: %s", string(b))
}
var r ESSearchResp
if err := json.NewDecoder(resp.Body).Decode(&r); err != nil {
return err
}
for _, h := range r.Hits.Hits {
raw, err := readFromHDFS(hdfsClient, h.Source.HDFSPath)
if err != nil {
return fmt.Errorf("hdfs read %s: %w", h.Source.HDFSPath, err)
}
fmt.Printf("doc_id=%s title=%s raw_len=%d\n", h.Source.DocID, h.Source.Title, len(raw))
}
return nil
}
```
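Each hit costs one HDFS round trip, so fetching results one by one adds latency as the result size grows. The sketch below fans the fetches out with a bounded worker count using golang.org/x/sync/errgroup. This is my addition rather than part of the original flow, and it assumes the HDFS client in your version tolerates concurrent use (if in doubt, give each worker its own client); it reuses readFromHDFS and HitSource from above.

```go
// fetchHitsConcurrently reads the raw payload for each hit with at most
// maxParallel HDFS reads in flight. Results are returned in hit order.
func fetchHitsConcurrently(hdfsClient *hdfs.Client, hits []HitSource, maxParallel int) ([][]byte, error) {
	raws := make([][]byte, len(hits))
	var g errgroup.Group
	g.SetLimit(maxParallel)
	for i, h := range hits {
		i, h := i, h // capture loop variables
		g.Go(func() error {
			raw, err := readFromHDFS(hdfsClient, h.HDFSPath)
			if err != nil {
				return fmt.Errorf("hdfs read %s: %w", h.HDFSPath, err)
			}
			raws[i] = raw
			return nil
		})
	}
	if err := g.Wait(); err != nil {
		return nil, err
	}
	return raws, nil
}
```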
Consistency and reliability (mandatory for production)
Because this is a dual write (HDFS + ES), you inevitably face consistency issues. Common practices:
- Write order: HDFS first, then ES (so an ES pointer never references data that does not exist)
- Idempotence:
  - ES: use doc_id as _id (repeated writes simply overwrite)
  - HDFS: write to a tmp file and Rename it into place atomically, or skip if the target already exists (see the sketch after this list)
- Failure compensation:
  - ES write fails: push docID + hdfsPath onto a retry queue (Kafka/DB)
  - HDFS write fails: do not write to ES
- Observability: record checksum/version numbers; optionally verify them when fetching from HDFS
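For the HDFS half of idempotence, here is a minimal sketch of the tmp file + Rename pattern from the list above, using the same colinmarc/hdfs client as the programs in this post. It is illustrative only: a stale .tmp left behind by a crashed attempt still needs separate cleanup.

```go
// writeAtomically skips the write if the target already exists, otherwise
// writes to <path>.tmp and renames it into place, so readers never observe
// a partially written file.
func writeAtomically(client *hdfs.Client, hdfsPath string, data []byte) error {
	if _, err := client.Stat(hdfsPath); err == nil {
		return nil // already written by a previous attempt
	}
	tmpPath := hdfsPath + ".tmp"
	f, err := client.Create(tmpPath)
	if err != nil {
		return fmt.Errorf("create %s: %w", tmpPath, err)
	}
	if _, err := f.Write(data); err != nil {
		f.Close()
		return fmt.Errorf("write %s: %w", tmpPath, err)
	}
	if err := f.Close(); err != nil {
		return fmt.Errorf("close %s: %w", tmpPath, err)
	}
	return client.Rename(tmpPath, hdfsPath)
}
```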
Drawback of Approach A
- Because it produces a large number of small files, it puts heavy pressure on the cluster and does not hold up well at scale.
Approach B
Append to a daily rolling large file in HDFS; each record is stored as a length prefix (4-byte big-endian) followed by the JSON payload.
```go
package main
import (
"bytes"
"context"
"crypto/sha256"
"encoding/binary"
"encoding/hex"
"encoding/json"
"errors"
"fmt"
"io"
"log"
"path"
"time"
"github.com/colinmarc/hdfs/v2"
"github.com/elastic/go-elasticsearch/v8"
"github.com/elastic/go-elasticsearch/v8/esapi"
)
/*
Record format in HDFS:
[4 bytes big-endian uint32: payloadLen] [payload bytes: JSON]
We store in ES:
hdfs_path, offset (start of record), length (total bytes = 4 + payloadLen)
*/
type RawDoc struct {
DocID string `json:"doc_id"`
Title string `json:"title"`
Tags []string `json:"tags"`
TS time.Time `json:"ts"`
Body any `json:"body"` // the raw large JSON / event
}
type IndexDoc struct {
DocID string `json:"doc_id"`
Title string `json:"title"`
Tags []string `json:"tags"`
TS time.Time `json:"ts"`
HDFSPath string `json:"hdfs_path"`
Offset int64 `json:"offset"`
Length int `json:"length"` // record total length (4 + payloadLen)
Checksum string `json:"checksum"` // sha256(payload)
}
func mustJSON(v any) []byte {
b, err := json.Marshal(v)
if err != nil {
panic(err)
}
return b
}
func sha256Hex(b []byte) string {
sum := sha256.Sum256(b)
return hex.EncodeToString(sum[:])
}
func dailyLogPath(baseDir string, t time.Time) string {
// /data/app/yyyy/mm/dd/part-0000.log
dateDir := t.Format("2006/01/02")
return path.Join(baseDir, dateDir, "part-0000.log")
}
func ensureParentDir(client *hdfs.Client, filePath string) error {
dir := path.Dir(filePath)
return client.MkdirAll(dir, 0755)
}
// appendRecord appends one record to an HDFS file and returns offset/length.
// Important: file must have single writer to avoid race/overlap offsets.
func appendRecord(client *hdfs.Client, filePath string, payload []byte) (offset int64, recLen int, err error) {
if len(payload) > int(^uint32(0)) {
return 0, 0, fmt.Errorf("payload too large: %d", len(payload))
}
if err := ensureParentDir(client, filePath); err != nil {
return 0, 0, fmt.Errorf("ensureParentDir: %w", err)
}
// Open existing file for append, or create if not exists.
var f *hdfs.FileWriter
// Try append first.
f, err = client.Append(filePath)
if err != nil {
// If file doesn't exist, create it.
// Unfortunately different clusters may return different errors; do a stat check.
_, statErr := client.Stat(filePath)
if statErr != nil {
// does not exist (or inaccessible) -> create
f, err = client.Create(filePath)
if err != nil {
return 0, 0, fmt.Errorf("create %s: %w", filePath, err)
}
} else {
// exists but append failed
return 0, 0, fmt.Errorf("append %s: %w", filePath, err)
}
}
defer f.Close()
// Current file size as offset (start position of this record)
info, err := client.Stat(filePath)
if err != nil {
return 0, 0, fmt.Errorf("stat %s: %w", filePath, err)
}
offset = info.Size()
// Build record bytes: len-prefix + payload
recLen = 4 + len(payload)
buf := make([]byte, recLen)
binary.BigEndian.PutUint32(buf[0:4], uint32(len(payload)))
copy(buf[4:], payload)
// Write atomically from writer perspective; HDFS itself is streaming, so ensure full write
n, err := f.Write(buf)
if err != nil {
return 0, 0, fmt.Errorf("write record: %w", err)
}
if n != len(buf) {
return 0, 0, fmt.Errorf("short write: %d/%d", n, len(buf))
}
// Close() will flush pipeline
return offset, recLen, nil
}
// readRecord reads one record from HDFS by offset.
// It validates length-prefix and returns payload bytes.
func readRecord(client *hdfs.Client, filePath string, offset int64) ([]byte, error) {
f, err := client.Open(filePath)
if err != nil {
return nil, fmt.Errorf("open %s: %w", filePath, err)
}
defer f.Close()
if _, err := f.Seek(offset, io.SeekStart); err != nil {
return nil, fmt.Errorf("seek %s offset=%d: %w", filePath, offset, err)
}
var lenBuf [4]byte
if _, err := io.ReadFull(f, lenBuf[:]); err != nil {
return nil, fmt.Errorf("read length-prefix: %w", err)
}
payloadLen := binary.BigEndian.Uint32(lenBuf[:])
if payloadLen == 0 {
return nil, errors.New("invalid payloadLen=0")
}
// you can also cap the length to guard against corrupted or hostile length values
if payloadLen > 64*1024*1024 {
return nil, fmt.Errorf("payloadLen too large: %d", payloadLen)
}
payload := make([]byte, payloadLen)
if _, err := io.ReadFull(f, payload); err != nil {
return nil, fmt.Errorf("read payload: %w", err)
}
return payload, nil
}
func indexToES(es *elasticsearch.Client, index string, doc IndexDoc) error {
body := mustJSON(doc)
req := esapi.IndexRequest{
Index: index,
DocumentID: doc.DocID,
Body: bytes.NewReader(body),
Refresh: "false",
}
resp, err := req.Do(context.Background(), es)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.IsError() {
b, _ := io.ReadAll(resp.Body)
return fmt.Errorf("es index error: %s", string(b))
}
return nil
}
func ingestOne(hdfsClient *hdfs.Client, es *elasticsearch.Client, hdfsBaseDir, esIndex string, doc RawDoc) (IndexDoc, error) {
// payload = the raw document (you could store only Body; storing the whole RawDoc makes replay easier)
payload := mustJSON(doc)
checksum := sha256Hex(payload)
hdfsPath := dailyLogPath(hdfsBaseDir, doc.TS)
offset, recLen, err := appendRecord(hdfsClient, hdfsPath, payload)
if err != nil {
return IndexDoc{}, err
}
idx := IndexDoc{
DocID: doc.DocID,
Title: doc.Title,
Tags: doc.Tags,
TS: doc.TS,
HDFSPath: hdfsPath,
Offset: offset,
Length: recLen,
Checksum: checksum,
}
if err := indexToES(es, esIndex, idx); err != nil {
// consistency: if the ES write fails, push idx onto a retry queue (Kafka/DB) to avoid data that landed in HDFS but never got indexed
return IndexDoc{}, err
}
return idx, nil
}
// ---- ES Search + HDFS fetch ----
type HitSource struct {
DocID string `json:"doc_id"`
Title string `json:"title"`
Tags []string `json:"tags"`
TS time.Time `json:"ts"`
HDFSPath string `json:"hdfs_path"`
Offset int64 `json:"offset"`
Length int `json:"length"`
Checksum string `json:"checksum"`
}
type ESSearchResp struct {
Hits struct {
Hits []struct {
Source HitSource `json:"_source"`
} `json:"hits"`
} `json:"hits"`
}
func search(es *elasticsearch.Client, index string, q string, size int) ([]HitSource, error) {
query := map[string]any{
"query": map[string]any{
"multi_match": map[string]any{
"query": q,
"fields": []string{"title^2", "tags"},
},
},
"size": size,
}
var buf bytes.Buffer
_ = json.NewEncoder(&buf).Encode(query)
resp, err := es.Search(
es.Search.WithIndex(index),
es.Search.WithBody(&buf),
)
if err != nil {
return nil, err
}
defer resp.Body.Close()
if resp.IsError() {
b, _ := io.ReadAll(resp.Body)
return nil, fmt.Errorf("es search error: %s", string(b))
}
var r ESSearchResp
if err := json.NewDecoder(resp.Body).Decode(&r); err != nil {
return nil, err
}
out := make([]HitSource, 0, len(r.Hits.Hits))
for _, h := range r.Hits.Hits {
out = append(out, h.Source)
}
return out, nil
}
func fetchRawByHit(hdfsClient *hdfs.Client, hit HitSource) (RawDoc, []byte, error) {
payload, err := readRecord(hdfsClient, hit.HDFSPath, hit.Offset)
if err != nil {
return RawDoc{}, nil, err
}
if hit.Checksum != "" {
if sha256Hex(payload) != hit.Checksum {
return RawDoc{}, nil, fmt.Errorf("checksum mismatch doc_id=%s", hit.DocID)
}
}
var doc RawDoc
if err := json.Unmarshal(payload, &doc); err != nil {
return RawDoc{}, payload, fmt.Errorf("unmarshal payload: %w", err)
}
return doc, payload, nil
}
func main() {
// HDFS client
hdfsClient, err := hdfs.New("namenode1:8020")
if err != nil {
log.Fatal(err)
}
// ES client
es, err := elasticsearch.NewClient(elasticsearch.Config{
Addresses: []string{"http://es1:9200"},
})
if err != nil {
log.Fatal(err)
}
const (
hdfsBaseDir = "/data/app"
esIndex = "my_index"
)
// --- Ingest demo ---
now := time.Now()
doc := RawDoc{
DocID: fmt.Sprintf("doc-%d", now.UnixNano()),
Title: "方案B:length-prefix + offset 回读",
Tags: []string{"demo", "hdfs", "es"},
TS: now,
Body: map[string]any{
"message": "this is raw data stored in HDFS append log",
"ts": now.Format(time.RFC3339Nano),
},
}
idx, err := ingestOne(hdfsClient, es, hdfsBaseDir, esIndex, doc)
if err != nil {
log.Fatal("ingest error:", err)
}
log.Printf("ingested doc_id=%s hdfs=%s offset=%d length=%d\n", idx.DocID, idx.HDFSPath, idx.Offset, idx.Length)
// --- Search + fetch demo ---
hits, err := search(es, esIndex, "length-prefix", 5)
if err != nil {
log.Fatal("search error:", err)
}
for _, hit := range hits {
raw, _, err := fetchRawByHit(hdfsClient, hit)
if err != nil {
log.Fatal("fetch error:", err)
}
log.Printf("fetched doc_id=%s title=%s body=%v\n", raw.DocID, raw.Title, raw.Body)
}
}
```
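A nice side effect of the length-prefixed log (my observation, not shown in the original code) is that a whole day's file can be replayed sequentially, for example to rebuild the ES index from HDFS. The sketch below reuses the record format and imports of the program above.

```go
// scanRecords walks every length-prefixed record in one append log, from the
// start of the file until EOF, calling fn with each record's offset and payload.
func scanRecords(client *hdfs.Client, filePath string, fn func(offset int64, payload []byte) error) error {
	f, err := client.Open(filePath)
	if err != nil {
		return fmt.Errorf("open %s: %w", filePath, err)
	}
	defer f.Close()

	var offset int64
	var lenBuf [4]byte
	for {
		if _, err := io.ReadFull(f, lenBuf[:]); err != nil {
			if errors.Is(err, io.EOF) {
				return nil // clean end of log
			}
			return fmt.Errorf("read length-prefix at offset %d: %w", offset, err)
		}
		payloadLen := binary.BigEndian.Uint32(lenBuf[:])
		if payloadLen == 0 || payloadLen > 64*1024*1024 {
			return fmt.Errorf("suspicious payloadLen=%d at offset %d", payloadLen, offset)
		}
		payload := make([]byte, payloadLen)
		if _, err := io.ReadFull(f, payload); err != nil {
			return fmt.Errorf("read payload at offset %d: %w", offset, err)
		}
		if err := fn(offset, payload); err != nil {
			return err
		}
		offset += int64(4 + payloadLen)
	}
}
```

For example, call scanRecords(hdfsClient, dailyLogPath("/data/app", t), func(off int64, p []byte) error { ... }) and re-run indexToES inside the callback to reindex one partition.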
Benefits:
- Scales better: avoids NameNode metadata blow-up. In the small-file model every file consumes NameNode memory (inode and block metadata). As data volume grows, the bottleneck is not DataNode disk but NameNode memory and RPC. The large-file model turns millions of files into a few hundred or a few thousand per day, cutting NameNode pressure by orders of magnitude.
- Higher throughput: sequential writes and reads match what HDFS was designed for. HDFS is optimized for streaming IO on large files: pipelined writes, sequential disk access, sequential network transfer. Small files generate a flood of open/create/close and block-allocation RPC round trips, and those metadata operations eat the throughput.
- Lower cost: less management overhead at the same replication factor, plus better compression. Fewer files mean less metadata and less waste from tiny blocks, and columnar formats such as Parquet/ORC usually compress and encode far better than raw JSON.
- Friendlier to downstream compute: plugs naturally into Spark/Flink/Hive. Large files (especially Parquet) are the staple of big-data compute. Small files cause heavy task startup/scheduling/scan overhead, hurting both performance and stability.
- Simpler lifecycle management: archiving, hot/cold tiering, and deletion happen per day or per batch. With partitioned large files (dt=2026-02-19), an expired partition is dropped by deleting its directory; deleting or migrating masses of small files means huge RPC traffic that is slow and hard on the cluster.
- More robust writes: fewer objects means fewer failure points. In the small-file model any hiccup can leave "some files written, some not"; batched large-file writes are much easier to make idempotent and retryable (e.g. per-batch file names, a manifest).