基于上篇的双向流式解析方案,为进一步验证 JSON 洋葱式双向解析器在理论上的可行性,本文用 Ruby 实现了如下演示代码。
一、代码演示
ruby
# frozen_string_literal: true
require 'thread'
require 'json'
# ==========================================
# 正向解析器
# ==========================================
# Forward (left-to-right) JSON parser used for the front part of a document.
#
# Text is accumulated with #parse_chunk. #finalize parses the buffer up to the
# last structural character ({ } [ ] , :) found outside a string and keeps the
# remainder verbatim in #raw_incomplete. The state left behind after parsing
# (#stack, #current_container, #pending_key) is exposed through readers so a
# caller can continue filling the partially built tree.
class ForwardParser
  # Single-character JSON escapes and the characters they stand for.
  SIMPLE_ESCAPES = {
    '"' => '"', '\\' => '\\', '/' => '/',
    'b' => "\b", 'f' => "\f", 'n' => "\n", 'r' => "\r", 't' => "\t"
  }.freeze

  # Matches, in priority order: a \uXXXX surrogate pair, a single \uXXXX
  # escape, or a backslash followed by any one character.
  ESCAPE_PATTERN = /\\u([dD][89abAB]\h{2})\\u([dD][c-fC-F]\h{2})|\\u(\h{4})|\\(.)/m

  attr_reader :root, :raw_incomplete, :pending_key
  attr_reader :stack, :current_container

  def initialize
    @root = nil
    @stack = []
    @current_container = nil
    @pending_key = nil
    @state = :value
    @buffer = ""
    @raw_incomplete = ""
  end

  # Appends +chunk+ to the internal buffer. Nothing is parsed until #finalize.
  def parse_chunk(chunk)
    @buffer += chunk
  end

  # Parses the buffered text up to the last safe boundary and returns the
  # (possibly partial) root value. Text after the boundary is stored in
  # #raw_incomplete; when no boundary exists the whole buffer is stored there
  # and nothing is parsed.
  def finalize
    safe_end = find_safe_boundary(@buffer)
    if safe_end > 0 && safe_end < @buffer.length
      @raw_incomplete = @buffer[safe_end..-1]
      parse_internal(@buffer[0...safe_end])
    elsif safe_end == @buffer.length
      parse_internal(@buffer)
      @raw_incomplete = ""
    else
      @raw_incomplete = @buffer
    end
    @root
  end

  private

  # Returns the index just past the last structural character that is not
  # inside a string literal, or 0 when there is none.
  def find_safe_boundary(data)
    in_str = false
    esc = false
    safe = 0
    data.each_char.with_index do |c, i|
      if esc
        # The character following a backslash is never interpreted.
        esc = false
        next
      end
      esc = true if c == '\\'
      in_str = !in_str if c == '"'
      safe = i + 1 if !in_str && '{}[],:'.include?(c)
    end
    safe
  end

  # Walks +data+ character by character and builds the value tree.
  def parse_internal(data)
    i = 0
    while i < data.length
      c = data[i]
      case c
      when ' ', "\t", "\n", "\r"
        nil
      when '{'
        push_container({}, :key)
      when '['
        push_container([], :value)
      when '}', ']'
        pop_container
      when ':'
        @state = :value
      when ','
        @state = @current_container.is_a?(Hash) ? :key : :value
      when '"'
        if (e = find_str_end(data, i + 1))
          handle_string(unescape(data[(i + 1)...e]))
          i = e
        end
      when '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
        e = i
        e += 1 while e < data.length && data[e] =~ /[\d.eE+\-]/
        attach_value(parse_num(data[i...e]))
        # Step back one because the loop increments i below.
        i = e - 1
      when 't'
        attach_value(true)
        i += 3 if data[i, 4] == 'true'
      when 'f'
        attach_value(false)
        i += 4 if data[i, 5] == 'false'
      when 'n'
        attach_value(nil)
        i += 3 if data[i, 4] == 'null'
      end
      i += 1
    end
  end

  # Attaches +cont+ to the tree, records the container that was current after
  # attaching (for the root this is the root itself) and makes +cont+ current.
  def push_container(cont, state)
    attach_value(cont)
    @stack.push({ container: @current_container, key: @pending_key })
    @current_container = cont
    @pending_key = nil
    @state = state
  end

  # Restores the container recorded by the matching #push_container.
  def pop_container
    @current_container = @stack.pop[:container] if @stack.any?
    @state = :comma
  end

  # A string is an object key when a key is expected, otherwise a value.
  def handle_string(str)
    if @state == :key
      @pending_key = str
      @state = :colon
    else
      attach_value(str)
      @state = :comma
    end
  end

  # Stores +val+ as the root, under the pending key of the current hash, or
  # at the end of the current array. Values with nowhere to go are dropped.
  def attach_value(val)
    if @root.nil?
      @root = val
      @current_container = val if val.is_a?(Hash) || val.is_a?(Array)
    elsif @current_container.is_a?(Hash) && @pending_key
      @current_container[@pending_key] = val
      @pending_key = nil
    elsif @current_container.is_a?(Array)
      @current_container << val
    end
  end

  # Returns the index of the closing quote of a string whose content starts
  # at +start+, or nil when the string is not terminated within +data+.
  def find_str_end(data, start)
    i = start
    while i < data.length
      return i if data[i] == '"'

      # Skip the escaped character together with its backslash.
      i += data[i] == '\\' ? 2 : 1
    end
    nil
  end

  # Float when the text has a fraction or exponent, Integer otherwise.
  def parse_num(s)
    s.to_s.include?('.') || s.to_s.downcase.include?('e') ? s.to_f : s.to_i
  end

  # Decodes JSON string escapes in a single pass: the two-character escapes
  # (\" \\ \/ \b \f \n \r \t) and \uXXXX, including surrogate pairs.
  # Unknown escapes and lone surrogates are left exactly as written.
  def unescape(s)
    s.to_s.gsub(ESCAPE_PATTERN) do
      m = Regexp.last_match
      if m[1]
        high = m[1].hex - 0xD800
        low = m[2].hex - 0xDC00
        [0x10000 + (high << 10) + low].pack('U')
      elsif m[3]
        code = m[3].hex
        (0xD800..0xDFFF).cover?(code) ? m[0] : [code].pack('U')
      else
        SIMPLE_ESCAPES.fetch(m[4], m[0])
      end
    end
  end
end
# ==========================================
# 逆向解析器
# ==========================================
# Reverse (right-to-left) JSON parser used for the back part of a document.
#
# Parsing starts at the last character and walks towards the beginning.
# Values whose object key cannot be found (because the key lies before the
# start of the data) are stored under the ANON placeholder key, and an array
# whose opening bracket is never reached is returned as { ANON => elements }.
class ReverseParser
  ANON = "__ANON__"

  # Single-character JSON escapes and the characters they stand for.
  SIMPLE_ESCAPES = {
    '"' => '"', '\\' => '\\', '/' => '/',
    'b' => "\b", 'f' => "\f", 'n' => "\n", 'r' => "\r", 't' => "\t"
  }.freeze

  # Matches, in priority order: a \uXXXX surrogate pair, a single \uXXXX
  # escape, or a backslash followed by any one character.
  ESCAPE_PATTERN = /\\u([dD][89abAB]\h{2})\\u([dD][c-fC-F]\h{2})|\\u(\h{4})|\\(.)/m

  attr_reader :result, :raw_incomplete

  # @param debug [Boolean] print a trace line for every parsing step
  def initialize(debug: false)
    @debug = debug
    @result = nil
    @raw_incomplete = ""
  end

  # Parses the last value in +data+ and returns it. Whatever text precedes
  # the parsed value is stored in #raw_incomplete.
  def parse(data)
    @result, end_pos = parse_value(data, data.length - 1)
    @raw_incomplete = end_pos >= 0 ? data[0..end_pos] : ""
    @result
  end

  private

  def log(msg)
    puts "[Rev] #{msg}" if @debug
  end

  # Parses the value that ends at +pos+. Returns [value, next_pos], where
  # next_pos is the index just before the consumed text. Unrecognised
  # characters are consumed one at a time and yield nil.
  def parse_value(data, pos)
    pos = skip_ws(data, pos)
    return [nil, pos] if pos < 0

    c = data[pos]
    log "val@#{pos}:'#{c}'"
    case c
    when '}' then parse_object(data, pos)
    when ']' then parse_array(data, pos)
    when '"' then parse_string(data, pos)
    when '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
      parse_number(data, pos)
    when 'e'
      # Both `true` and `false` end in 'e'.
      if literal_ends_at?(data, pos, 'true')
        [true, pos - 4]
      elsif literal_ends_at?(data, pos, 'false')
        [false, pos - 5]
      else
        [nil, pos - 1]
      end
    when 'l'
      literal_ends_at?(data, pos, 'null') ? [nil, pos - 4] : [nil, pos - 1]
    else
      [nil, pos - 1]
    end
  end

  # True when +word+ occupies the characters of +data+ ending at +pos+.
  def literal_ends_at?(data, pos, word)
    start = pos - word.length + 1
    start >= 0 && data[start, word.length] == word
  end

  # Parses an object whose closing brace is at +pos+. Members without a
  # reachable key are stored under ANON. When the opening brace is never
  # found, the members collected so far are returned.
  def parse_object(data, pos)
    result = {}
    pos -= 1
    pos = skip_ws(data, pos)
    while pos >= 0
      c = data[pos]
      log "obj@#{pos}:'#{c}'"
      case c
      when '{' then return [result, pos - 1]
      when ','
        pos -= 1
        pos = skip_ws(data, pos)
      else
        value, pos = parse_value(data, pos)
        pos = skip_ws(data, pos)
        if pos >= 0 && data[pos] == ':'
          pos -= 1
          pos = skip_ws(data, pos)
          if pos >= 0 && data[pos] == '"'
            key, pos = parse_string(data, pos)
            result[key] = value
            log "kv: #{key}=#{value.inspect[0..20]}"
          else
            # A colon was seen but the key is not in this data.
            result[ANON] = value
            log "anon-val: #{value.inspect[0..20]}"
          end
        else
          result[ANON] = value
          log "anon-no-colon: #{value.inspect[0..20]}"
        end
        pos = skip_ws(data, pos)
      end
    end
    [result, pos]
  end

  # Parses an array whose closing bracket is at +pos+. Elements are
  # prepended so they end up in document order.
  def parse_array(data, pos)
    result = []
    pos -= 1
    pos = skip_ws(data, pos)
    while pos >= 0
      c = data[pos]
      log "arr@#{pos}:'#{c}'"
      case c
      when '[' then return [result, pos - 1]
      when ','
        pos -= 1
        pos = skip_ws(data, pos)
      else
        value, pos = parse_value(data, pos)
        result.unshift(value)
        pos = skip_ws(data, pos)
      end
    end
    # Unclosed array: the opening bracket is not in this data.
    [{ANON => result}, pos]
  end

  # Parses a string whose closing quote is at +pos+ by searching backwards
  # for a quote preceded by an even number of backslashes. Returns ["", -1]
  # when no opening quote is found.
  def parse_string(data, pos)
    end_pos = pos
    pos -= 1
    while pos >= 0
      if data[pos] == '"'
        esc = 0
        j = pos - 1
        while j >= 0 && data[j] == '\\'
          esc += 1
          j -= 1
        end
        return [unescape(data[(pos + 1)...end_pos]), pos - 1] if esc.even?
      end
      pos -= 1
    end
    ["", -1]
  end

  # Parses a number whose last digit is at +pos+. The scan already consumes
  # a leading sign because '-' is part of the accepted character class.
  def parse_number(data, pos)
    end_pos = pos
    pos -= 1 while pos >= 0 && data[pos] =~ /[\d.eE+\-]/
    s = data[(pos + 1)..end_pos]
    [s.include?('.') || s.downcase.include?('e') ? s.to_f : s.to_i, pos]
  end

  # Moves +pos+ left past any whitespace; may return -1.
  def skip_ws(data, pos)
    pos -= 1 while pos >= 0 && data[pos] =~ /\s/
    pos
  end

  # Decodes JSON string escapes in a single pass: the two-character escapes
  # (\" \\ \/ \b \f \n \r \t) and \uXXXX, including surrogate pairs.
  # Unknown escapes and lone surrogates are left exactly as written.
  def unescape(s)
    s.to_s.gsub(ESCAPE_PATTERN) do
      m = Regexp.last_match
      if m[1]
        high = m[1].hex - 0xD800
        low = m[2].hex - 0xDC00
        [0x10000 + (high << 10) + low].pack('U')
      elsif m[3]
        code = m[3].hex
        (0xD800..0xDFFF).cover?(code) ? m[0] : [code].pack('U')
      else
        SIMPLE_ESCAPES.fetch(m[4], m[0])
      end
    end
  end
end
# ==========================================
# 洋葱式合并器
# ==========================================
# Merges the partial tree built by a forward parser with the result of a
# reverse parser. Values the reverse parser stored under the ANON placeholder
# key are peeled off as "layers" and placed into the forward parser's open
# containers; everything else is merged key by key.
class OnionMerger
ANON = ReverseParser::ANON
# debug: when true, every merge step is printed with a "[Merge]" prefix.
def initialize(debug: false)
@debug = debug
end
# Merges rev_parser.result into fwd_parser.root and returns the merged value.
# Reads root/stack/current_container/pending_key/raw_incomplete from
# fwd_parser and result/raw_incomplete from rev_parser.
# Returns the forward root unchanged when the reverse result is nil, and the
# reverse result when the forward root is nil.
# The forward tree is modified in place; parts of the reverse result are also
# mutated (elements are shifted/deleted while anonymous values are extracted).
def merge(fwd_parser, rev_parser)
fwd = fwd_parser.root
rev = rev_parser.result
fwd_stack = fwd_parser.stack
fwd_current = fwd_parser.current_container
fwd_pending = fwd_parser.pending_key
fwd_incomplete = fwd_parser.raw_incomplete
rev_incomplete = rev_parser.raw_incomplete
log "===== 洋葱合并 ====="
log "正向: #{fmt(fwd)}"
log "正向栈深: #{fwd_stack.length}"
log "正向当前: #{fwd_current.class} => #{fmt(fwd_current)}"
log "正向待定键: #{fwd_pending.inspect}"
log "逆向: #{fmt(rev)}"
return fwd if rev.nil?
return rev if fwd.nil?
# Text neither parser consumed; it is only searched for quoted key names.
middle = fwd_incomplete.to_s + rev_incomplete.to_s
middle_keys = extract_keys(middle)
log "中间键: #{middle_keys}"
# Extract the anonymous layers
extracted = extract_anon_layers(rev)
anon_layers = extracted[:layers]
normal = extracted[:normal]
log "匿名层: #{anon_layers.map { |l| fmt(l) }}"
log "普通部分: #{fmt(normal)}"
# Merge starting from the innermost layer (the last element of the list)
if anon_layers.any?
innermost = anon_layers.pop
if fwd_pending
# The forward parser is waiting for the value of a key
log "填充待定键: #{fwd_pending} = #{fmt(innermost)}"
if innermost.is_a?(Array) && innermost.first.is_a?(Hash) && innermost.first.key?(ANON)
# The first array element carries an anonymous value; it goes into the current container
first_item = innermost.shift
anon_val = first_item.delete(ANON)
fwd_current[fwd_pending] = anon_val
log " 填入值: #{anon_val}"
# The remaining array elements are appended to the parent array
if innermost.any?
parent_array = find_parent_array(fwd_stack)
if parent_array
innermost.each { |item| parent_array << clean_anon(item) }
log " 追加到父数组: #{innermost.length}项"
end
end
else
fwd_current[fwd_pending] = clean_anon(innermost)
end
elsif fwd_current.is_a?(Array)
# Append to the current array
if innermost.is_a?(Array)
innermost.each { |v| fwd_current << clean_anon(v) }
else
fwd_current << clean_anon(innermost)
end
log "追加到数组"
elsif !middle_keys.empty?
# No pending key and not inside an array: use the first key found in the middle text
key = middle_keys.shift
inject_to_container(fwd_current, key, innermost)
log "注入到键: #{key}"
end
# Handle the remaining layers: the idx-th remaining layer (innermost first)
# is paired with the idx-th stack frame counted from the top of the stack
anon_layers.reverse.each_with_index do |layer, idx|
if idx < fwd_stack.length
target = fwd_stack[-(idx+1)][:container]
if target.is_a?(Array) && layer.is_a?(Array)
layer.each { |v| target << clean_anon(v) }
elsif target.is_a?(Hash) && idx < middle_keys.length
target[middle_keys[idx]] = clean_anon(layer)
end
end
end
end
# Merge the non-anonymous part
merge_normal(fwd, normal)
log "结果: #{fmt(fwd)}"
fwd
end
private
# Prints msg with a "[Merge]" prefix when debug output is enabled.
def log(msg)
puts "[Merge] #{msg}" if @debug
end
# Returns obj.inspect shortened for log output ("nil" for nil).
def fmt(obj)
return "nil" if obj.nil?
s = obj.inspect
s.length > 50 ? s[0..50] + "..." : s
end
# Returns the quoted names that are followed by a colon in middle; when there
# are none, returns every quoted string found instead.
def extract_keys(middle)
keys = []
middle.scan(/"([^"]+)"\s*:/) { |m| keys << m[0] }
keys.empty? ? middle.scan(/"([^"]+)"/).flatten : keys
end
# Splits obj into { layers:, normal: }.
# normal holds the entries whose key is not ANON (nested hashes are cleaned
# of ANON keys). layers collects the values found under ANON keys, descending
# through nested hashes; merge treats the last element as the innermost layer.
# A non-Hash obj is returned as normal with no layers.
def extract_anon_layers(obj)
layers = []
normal = {}
return {layers: [], normal: obj} unless obj.is_a?(Hash)
obj.each do |k, v|
if k == ANON
if v.is_a?(Hash) && v.key?(ANON)
# Nested anonymous hash: take its layers and put its named entries in front
inner = extract_anon_layers(v)
layers = inner[:layers]
layers.unshift(inner[:normal]) unless inner[:normal].empty?
elsif v.is_a?(Array)
layers.unshift(v)
elsif v.is_a?(Hash)
# NOTE(review): v has no direct ANON key here, so extract_anon_layers(v)
# appears to always return empty layers and only the else branch runs - confirm
inner = extract_anon_layers(v)
if inner[:layers].any?
layers = inner[:layers]
layers.unshift(inner[:normal]) unless inner[:normal].empty?
else
layers.unshift(v)
end
else
layers.unshift(v)
end
else
normal[k] = v.is_a?(Hash) ? clean_anon(v) : v
end
end
{layers: layers, normal: normal}
end
# Returns the container of the most recently pushed stack frame that is an
# Array, or nil when no frame holds one.
def find_parent_array(stack)
stack.reverse.each do |frame|
return frame[:container] if frame[:container].is_a?(Array)
end
nil
end
# Stores the cleaned value under key; does nothing unless container is a Hash.
def inject_to_container(container, key, value)
return unless container.is_a?(Hash)
container[key] = clean_anon(value)
end
# Returns a copy of obj with every ANON key removed, recursing into hashes
# and arrays; other values are returned as they are.
def clean_anon(obj)
case obj
when Hash
result = {}
obj.each { |k, v| result[k] = clean_anon(v) unless k == ANON }
result
when Array
obj.map { |v| clean_anon(v) }
else
obj
end
end
# Merges normal into fwd in place. Keys missing from fwd are added; for keys
# present in both, hashes are merged recursively and arrays are appended to.
# Any other existing value in fwd is kept. Does nothing unless both are hashes.
def merge_normal(fwd, normal)
return unless fwd.is_a?(Hash) && normal.is_a?(Hash)
normal.each do |k, v|
if fwd.key?(k)
if fwd[k].is_a?(Hash) && v.is_a?(Hash)
merge_normal(fwd[k], v)
elsif fwd[k].is_a?(Array) && v.is_a?(Array)
v.each { |item| fwd[k] << clean_anon(item) }
end
else
fwd[k] = clean_anon(v)
end
end
end
end
# ==========================================
# 洋葱式双向解析器
# ==========================================
# Parses a JSON file by splitting its text in the middle, parsing the front
# half with ForwardParser and the back half with ReverseParser in two
# threads, and combining both results with OnionMerger.
class OnionBiDirectionalParser
  # @param filepath [String] path of the JSON file to parse
  # @param chunk_size [Integer] files of at most this many bytes are parsed
  #   by the forward parser alone
  # @param debug [Boolean] print progress information
  def initialize(filepath, chunk_size: 8192, debug: false)
    @filepath = filepath
    @file_size = File.size(filepath)
    @chunk_size = chunk_size
    @debug = debug
  end

  # Reads the whole file and returns the parsed value.
  def parse
    started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    log "解析: #{@filepath} (#{@file_size} bytes)"
    # JSON text is UTF-8; read it as such regardless of the process locale.
    content = File.read(@filepath, encoding: 'UTF-8')
    return parse_single(content) if @file_size <= @chunk_size

    # Split on the character midpoint. The byte size cannot be used as a
    # string index: with multibyte text it may lie past the end of the string.
    mid = content.length / 2
    front = content[0...mid]
    back = content[mid..-1]
    fwd = ForwardParser.new
    rev = ReverseParser.new(debug: @debug)
    workers = [
      Thread.new do
        fwd.parse_chunk(front)
        fwd.finalize
      end,
      Thread.new { rev.parse(back) }
    ]
    # join re-raises any exception raised inside a worker thread.
    workers.each(&:join)
    log "正向: #{fmt(fwd.root)}"
    log "逆向: #{fmt(rev.result)}"
    result = OnionMerger.new(debug: @debug).merge(fwd, rev)
    elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
    puts "[Onion] #{elapsed.round(3)}s"
    result
  end

  private

  def log(msg)
    puts "[Onion] #{msg}" if @debug
  end

  # Returns obj.inspect shortened for log output.
  def fmt(obj)
    s = obj.inspect
    s.length > 60 ? s[0..60] + "..." : s
  end

  # Small inputs are parsed in one piece by the forward parser.
  def parse_single(content)
    p = ForwardParser.new
    p.parse_chunk(content)
    p.finalize
  end
end
# ==========================================
# 测试
# ==========================================
# Self-test driver: runs only when this file is executed directly.
if __FILE__ == $PROGRAM_NAME
  # Writes +json+ to a temporary file, parses it with
  # OnionBiDirectionalParser (chunk_size 16, so longer inputs are split) and
  # compares the result with JSON.parse. Prints the outcome and returns true
  # on a match. The temporary file is removed even when parsing raises.
  def test(name, json, debug: false)
    file = "test_#{name}.json"
    File.write(file, json)
    expected = JSON.parse(json)
    puts "\n#{'=' * 60}"
    puts "#{name}: #{json[0..50]}..."
    result = OnionBiDirectionalParser.new(file, chunk_size: 16, debug: debug).parse
    ok = result == expected
    unless ok
      puts "预期: #{expected.inspect[0..80]}"
      puts "结果: #{result.inspect[0..80]}"
      diff(expected, result)
    end
    puts ok ? '✅' : '❌'
    ok
  ensure
    File.delete(file) if file && File.exist?(file)
  end

  # Prints the differences between expected value +e+ and actual value +a+,
  # recursing into hashes and same-length arrays. +p+ is the path so far.
  def diff(e, a, p = "$")
    return if e == a

    if e.nil? || a.nil? || e.class != a.class
      return puts " #{p}: #{e.class}≠#{a.class}"
    end

    case e
    when Hash
      (e.keys | a.keys).each do |k|
        if !e.key?(k)
          # Present in the actual result only.
          puts " #{p}.#{k}: 多余"
        elsif !a.key?(k)
          # Present in the expected result only.
          puts " #{p}.#{k}: 缺失"
        else
          diff(e[k], a[k], "#{p}.#{k}")
        end
      end
    when Array
      if e.length != a.length
        puts " #{p}: 长度#{e.length}≠#{a.length}"
      else
        e.each_with_index { |v, i| diff(v, a[i], "#{p}[#{i}]") }
      end
    else
      puts " #{p}: #{e}≠#{a}"
    end
  end

  r = []
  r << test("t1", '{"a": 1, "b": 2, "c": 3}', debug: true)
  r << test("t2", '{"x": {"y": 1}, "z": 2}', debug: true)
  r << test("t3", '[1, 2, 3, 4, 5]', debug: true)
  r << test("t4", '{"arr": [1, 2, 3], "obj": {"k": "v"}}', debug: true)
  m = {"users" => (1..10).map { |i| {"id" => i} }, "meta" => {"total" => 10}}
  r << test("t5", JSON.generate(m), debug: true)
  puts "\n总结: #{r.count(true)}/#{r.length}"
end
二、架构解析

三、核心设计点
| 特性 | 实现 |
|---|---|
| 无预扫描 | 直接从中间切,不需要找安全点 |
| 匿名键 | 逆向解析时未知的键用 __ANON__ 占位 |
| 层层向外 | 嵌套的匿名键递归展开成层 |
| 栈对应 | 匿名层与正向解析器的容器栈深度对应 |
| 待定键填充 | 正向 pending_key 接收最内层匿名值 |
四、复杂场景处理
# t5: 数组中间切断
正向: {"users" => [{id:1}, {id:2}, ..., {id:5}, {}]} # 最后一个对象不完整
逆向: {__ANON__ => {__ANON__ => [{__ANON__=>6}, {id:7}, ...]}, meta: {...}}
合并:
1. 展开匿名层得到数组 [{__ANON__=>6}, {id:7}, ...]
2. 从第一个元素提取 6,填入待定键 id
3. 剩余元素追加到父数组 users
4. 合并 meta 到根对象
以上就是洋葱式双向解析的设计理念。需要说明的是,本文代码只是逻辑层面的实现,用于理论验证,没有做任何性能优化。