问题现象
执行以下代码,使用search_after从opensearch查历史数据,接着查实时数据,发现每次查询实时数据的时候,就存在重复数据或丢失部分数据。
排序使用@timestamp字段,这个在索引模板里是用的纳秒级的,完全可以用于排序
json
"@timestamp": {
"format": "strict_date_optional_time_nanos||epoch_millis",
"type": "date_nanos"
},
查询代码
go
// Source mirrors the "_source" object of a single search hit.
type Source struct {
	// Timestamp is the document's "@timestamp" value, kept as the raw string.
	Timestamp string `json:"@timestamp"`
	// Log is the decoded "log" object of the document.
	Log LogEntry `json:"log"`
}
// HitItem is one element of the "hits.hits" array in a search response.
type HitItem struct {
	// Index is the name of the index the document came from.
	Index string `json:"_index"`
	// ID is the document id.
	ID string `json:"_id"`
	// Score is the relevance score reported for the hit.
	Score float64 `json:"_score"`
	// Source is the decoded document body.
	Source Source `json:"_source"`
	// Sort holds the hit's sort keys. Because the element type is interface{},
	// encoding/json stores every JSON number here as a float64.
	Sort []interface{} `json:"sort"`
}
// Hits is the outer "hits" object of a search response; only the nested
// "hits" array is decoded.
type Hits struct {
	// Hits lists the documents returned for the page.
	Hits []HitItem `json:"hits"`
}
// Result is the top-level search response; only the "hits" object is decoded.
type Result struct {
	// Hits wraps the list of returned documents.
	Hits Hits `json:"hits"`
}
// fetchFromOpenSearch requests one page of a match_all search on
// openSearchIndex, ordered by "@timestamp" ascending.
//
// size is the maximum number of hits to request. When lastSort is non-nil it
// is sent as "search_after" so the page starts after that sort key; pass nil
// to fetch the first page.
//
// It returns the hits of the page, or an error when the query cannot be
// encoded, the request fails, the server answers with an error status, or the
// response body cannot be decoded.
//
// NOTE: HitItem.Sort is []interface{}, so encoding/json decodes numeric sort
// keys as float64. Nanosecond "@timestamp" sort values are larger than 2^53
// and get rounded, so a Sort slice taken from a hit and passed back as
// lastSort does not carry the exact key of that hit.
func fetchFromOpenSearch(client *opensearch.Client, size int, lastSort []interface{}) ([]HitItem, error) {
	query := map[string]interface{}{
		"query": map[string]interface{}{
			"match_all": map[string]interface{}{},
		},
		"size": size,
		"sort": []map[string]interface{}{
			{"@timestamp": "asc"},
		},
	}
	if lastSort != nil {
		query["search_after"] = lastSort
	}

	var buf bytes.Buffer
	if err := json.NewEncoder(&buf).Encode(query); err != nil {
		// The function has two results; the original returned three here.
		return nil, fmt.Errorf("error encoding query: %w", err)
	}

	res, err := client.Search(
		client.Search.WithContext(context.Background()),
		client.Search.WithIndex(openSearchIndex),
		client.Search.WithBody(&buf),
	)
	if err != nil {
		return nil, fmt.Errorf("error executing search: %w", err)
	}
	defer res.Body.Close()

	if res.IsError() {
		return nil, fmt.Errorf("search API error: %s", res.String())
	}

	var ret Result
	if err := json.NewDecoder(res.Body).Decode(&ret); err != nil {
		return nil, fmt.Errorf("error decoding response: %w", err)
	}
	return ret.Hits.Hits, nil
}
// main pages through the index in an endless loop: each iteration fetches up
// to 10000 hits starting after the sort key of the last hit seen so far. It
// stops on the first query error and waits five seconds whenever a page comes
// back empty.
func main() {
	client := initOpensearch()

	// nil until the first non-empty page has been read.
	var lastSortValue []interface{}
	for {
		batch, err := fetchFromOpenSearch(client, 10000, lastSortValue)
		if err != nil {
			logx.Errorf("error querying OpenSearch: %v", err)
			return
		}

		if len(batch) == 0 {
			logx.Infof("no doc found")
			time.Sleep(time.Second * 5)
			continue
		}

		// Remember where this page ended so the next request resumes after it.
		tail := batch[len(batch)-1]
		lastSortValue = tail.Sort
		logx.Infof("Total: %d queries succeeded, first doc time: %s, last doc time: %s", len(batch), batch[0].Source.Timestamp, tail.Source.Timestamp)
	}
}
排查过程
写了个最简单的shell脚本,查一条doc,然后使用search_after查询它的下一条数据,发现一直打印的是当前的这条数据
bash
#!/bin/bash
# Reproduction script: fetch one document, then ask for the document that
# follows it via search_after, printing every intermediate value on the way.

# First query: fetch the most recent log entry for pod "test".
response=$(curl -k \
-X GET -H "Content-Type: application/json" \
'https://localhost:9200/logs-data/_search?size=1' -d '{
"sort": [
{"@timestamp": {"order": "desc"}}
],
"fields": [
{ "field": "@timestamp", "format": "strict_date_optional_time_nanos" }
],
"query": {
"bool": {
"must": [
{
"term": {
"log.Pod": "test"
}
}
]
}
}
}')
# Raw response body exactly as returned by curl.
echo "response: ${response}"
# The same response after a round trip through jq, printed for comparison
# with the raw body above.
hits=$(echo "$response" | jq -r '.hits' )
echo "response2: ${hits}"
# First sort key of the first hit, extracted by jq and rendered as a string.
sort_value=$(echo "$response" | jq -r '.hits.hits[0].sort[0] | tostring')
echo "sort value: ${sort_value}"
# Build the follow-up search as a command string. $sort_value is expanded
# now, while the string is assembled, and ends up inside search_after.
query="curl -k \
-X GET -H \"Content-Type: application/json\" \
'https://localhost:9200/logs-data/_search?size=1' -d '{
\"sort\": [
{\"@timestamp\": {\"order\": \"asc\"}}
],
\"search_after\": ['$sort_value'],
\"query\": {
\"bool\": {
\"must\": [
{
\"term\": {
\"log.Pod\": \"test\"
}
}
]
}
}
}'"
# Show the exact command that is about to be executed.
echo "$query"
# Pause before running the follow-up query.
sleep 10
# Run the assembled command and print its response.
res=$(eval "$query")
echo "${res}"
将各种中间数据打印出来查看,发现sort的值跟原始返回的值不一样,如下
text
response: {...,"sort":[1752494844446320667]}]}}
response2: {...,"sort": [1752494844446320600]}]}}
最后两位的精度丢失了!!
原因总结
JSON 数值精度限制:JSON 规范本身并不限制数值的精度,但 JavaScript 以及许多 JSON 解析器会把数值解析为 IEEE 754 双精度浮点数,只能精确表示 53 位以内的整数(最大安全整数为 9007199254740991);而纳秒级时间戳通常为 18-19 位十进制数(如 1752494314834613658),超出安全范围后会被舍入为最接近的可表示值,末尾几位因此发生变化。
jq 工具的处理:jq 在解析 JSON 时默认将数值转换为双精度浮点数,导致高精度时间戳末尾几位被修改(如 1752494314834613658 → 1752494314834613800)。
修改方案
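将结构体里的 sort 改为 int64 类型,所有使用到 sort 的地方均使用 int64。原因是:Go 的 encoding/json 在把 JSON 数值解码到 interface{} 时会使用 float64,与 jq 一样会丢失纳秒时间戳的末尾精度;直接解码到 int64 则可以完整保留该值,再作为 search_after 传回时也不会改变。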
代码示例
go
// HitItem is one element of the "hits.hits" array in a search response.
type HitItem struct {
	// Index is the name of the index the document came from.
	Index string `json:"_index"`
	// ID is the document id.
	ID string `json:"_id"`
	// Score is the relevance score reported for the hit.
	Score float64 `json:"_score"`
	// Source is the decoded document body.
	Source Source `json:"_source"`
	// Sort holds the hit's sort keys as int64, so integer keys are decoded
	// without passing through float64.
	Sort []int64 `json:"sort"`
}
// fetchFromOpenSearch, abridged: only the handling of the resume key is
// shown. The "..." lines stand for code omitted from this excerpt.
// NOTE(review): timeStamp is not used in the visible part of this excerpt,
// and the call shown in main passes only three arguments — confirm the
// intended signature.
func fetchFromOpenSearch(client *opensearch.Client, size int, lastSort int64, timeStamp string) ([]HitItem, error) {
...
// A zero lastSort means there is no previous page, so search_after is left
// out; otherwise the key is sent as an int64.
if lastSort != 0 {
query["search_after"] = []int64{lastSort}
}
...
}
// main pages through the index in an endless loop: each iteration fetches up
// to 10000 hits starting after the first sort key of the last hit seen so
// far. It stops on the first query error and waits five seconds whenever a
// page comes back empty.
func main() {
	client := initOpensearch()

	// Starts at zero, i.e. before any page has been read.
	var lastSortValue int64
	for {
		batch, err := fetchFromOpenSearch(client, 10000, lastSortValue)
		if err != nil {
			logx.Errorf("error querying OpenSearch: %v", err)
			return
		}

		if len(batch) == 0 {
			logx.Infof("no doc found")
			time.Sleep(time.Second * 5)
			continue
		}

		// Remember where this page ended so the next request resumes after it.
		tail := batch[len(batch)-1]
		lastSortValue = tail.Sort[0]
		logx.Infof("Total: %d queries succeeded, first doc time: %s, last doc time: %s", len(batch), batch[0].Source.Timestamp, tail.Source.Timestamp)
	}
}