PDD 直播间评论:WSS hex Protobuf 解析流程分析学习

介绍

本文章中所有内容仅供学习交流使用,不用于其他任何目的,不提供完整代码;抓包内容、敏感网址、数据接口等均已做脱敏处理。严禁用于商业用途和非法用途,否则由此产生的一切后果均与作者无关。本文章未经许可禁止转载,禁止任何修改后二次传播;擅自使用本文讲解的技术而导致的任何意外,作者均不负责。若有侵权,请联系作者立即删除!

核心流程

复制代码
完整 WSS ArrayBuffer
  ↓
解析 16 字节固定包头
  ↓
解析 TitanPayload Protobuf
  ↓
如果 compress === GZIP,对 TitanPayload.body 解 gzip
  ↓
解析 MulticastLite Protobuf
  ↓
读取 MulticastLite.payload
  ↓
UTF-8 decode
  ↓
清洗 JSON 字符串
  ↓
JSON.parse
  ↓
得到 checked_show_id / message_type / message_data 等业务对象

js 版本代码

javascript 复制代码
/**
 * Inflate gzip-compressed data with the platform DecompressionStream.
 *
 * @param {BlobPart} bytes - gzip-compressed data.
 * @returns {Promise<Uint8Array>} The decompressed bytes.
 */
async function gunzipBrowser(bytes) {
  const gunzip = new DecompressionStream("gzip");
  const inflated = new Blob([bytes]).stream().pipeThrough(gunzip);

  // Response is used only as a convenient way to drain the stream.
  return new Uint8Array(await new Response(inflated).arrayBuffer());
}

/**
 * Convert a hex string into a Uint8Array.
 *
 * All whitespace in the input is ignored, so multi-line or space-separated
 * dumps can be passed directly.
 *
 * @param {string} hex - Hex digits, optionally separated by whitespace.
 * @returns {Uint8Array} One byte per pair of hex digits.
 * @throws {Error} "Invalid hex length" when the digit count is odd.
 * @throws {Error} "Invalid hex character" when a non-hex character is present.
 */
function hexToUint8Array(hex) {
  const digits = hex.replace(/\s+/g, "");

  if (digits.length % 2 !== 0) {
    throw new Error("Invalid hex length");
  }

  // parseInt would otherwise yield NaN (stored as 0) or silently stop at the
  // first bad digit ("1g" -> 1), producing wrong bytes without any error.
  if (/[^0-9a-fA-F]/.test(digits)) {
    throw new Error("Invalid hex character");
  }

  const arr = new Uint8Array(digits.length / 2);

  for (let i = 0; i < arr.length; i++) {
    arr[i] = Number.parseInt(digits.slice(i * 2, i * 2 + 2), 16);
  }

  return arr;
}

/**
 * Normalize binary input to a Uint8Array.
 *
 * A Uint8Array is returned unchanged; an ArrayBuffer is wrapped in a new
 * Uint8Array over the same buffer.
 *
 * @param {Uint8Array|ArrayBuffer} input
 * @returns {Uint8Array}
 * @throws {Error} "Unsupported input type" for any other value.
 */
function toArrayBufferView(input) {
  const isByteArray = input instanceof Uint8Array;

  if (!isByteArray && !(input instanceof ArrayBuffer)) {
    throw new Error("Unsupported input type");
  }

  return isByteArray ? input : new Uint8Array(input);
}

/**
 * Minimal forward-only reader for the Protocol Buffers wire format.
 *
 * Wraps a byte array and keeps a cursor (`pos`) that every read advances.
 */
class ProtoReader {
  /**
   * @param {Uint8Array} bytes - Encoded message to read from.
   */
  constructor(bytes) {
    this.bytes = bytes;
    this.pos = 0;
    this.len = bytes.length;
  }

  /**
   * @returns {boolean} True once the cursor has reached the end of the buffer.
   */
  eof() {
    return this.pos >= this.len;
  }

  /**
   * Read one varint and return its low 32 bits as an unsigned number.
   *
   * Up to 10 bytes are consumed (the longest legal varint, e.g. a negative
   * int32); bits at position 32 and above are discarded.
   *
   * @returns {number}
   * @throws {Error} On premature end of buffer or a varint longer than 10 bytes.
   */
  uint32() {
    let value = 0;

    for (let shift = 0; shift < 70; shift += 7) {
      if (this.pos >= this.len) {
        throw new Error("Unexpected EOF while reading varint");
      }

      const b = this.bytes[this.pos++];

      // JS shift counts wrap modulo 32, so shifting by 35 would really shift
      // by 3 and corrupt the low bits. High groups must be dropped explicitly.
      if (shift < 32) {
        value |= (b & 0x7f) << shift;
      }

      if ((b & 0x80) === 0) {
        return value >>> 0;
      }
    }

    throw new Error("Varint too long");
  }

  /**
   * Read one varint as an unsigned 64-bit BigInt.
   *
   * @returns {bigint}
   * @throws {Error} On premature end of buffer or a varint longer than 10 bytes.
   */
  uint64AsBigInt() {
    let value = 0n;

    for (let shift = 0n; shift < 70n; shift += 7n) {
      if (this.pos >= this.len) {
        throw new Error("Unexpected EOF while reading varint64");
      }

      const b = this.bytes[this.pos++];
      value |= BigInt(b & 0x7f) << shift;

      if ((b & 0x80) === 0) {
        // The 10th byte can carry bits above bit 63; keep only 64 bits.
        return BigInt.asUintN(64, value);
      }
    }

    throw new Error("Varint64 too long");
  }

  /**
   * Read a varint and interpret any non-zero value as true.
   *
   * @returns {boolean}
   */
  bool() {
    return this.uint32() !== 0;
  }

  /**
   * Read a length-delimited field and return a copy of its bytes.
   *
   * @returns {Uint8Array}
   * @throws {Error} When the declared length runs past the end of the buffer.
   */
  bytesField() {
    const length = this.uint32();

    if (this.pos + length > this.len) {
      throw new Error("Length-delimited field exceeds buffer");
    }

    const value = this.bytes.slice(this.pos, this.pos + length);
    this.pos += length;

    return value;
  }

  /**
   * Read a length-delimited field and decode it as UTF-8 text.
   *
   * @returns {string}
   */
  string() {
    return new TextDecoder("utf-8").decode(this.bytesField());
  }

  /**
   * Advance past one field value of the given wire type without decoding it.
   *
   * @param {number} wireType - 0 (varint), 1 (64-bit), 2 (length-delimited) or 5 (32-bit).
   * @throws {Error} For any other wire type, or when the skip runs past the buffer.
   */
  skipType(wireType) {
    switch (wireType) {
      case 0:
        this.uint64AsBigInt();
        break;

      case 1:
        this.pos += 8;
        break;

      case 2: {
        const length = this.uint32();
        this.pos += length;
        break;
      }

      case 5:
        this.pos += 4;
        break;

      default:
        throw new Error(`Unsupported wire type: ${wireType}`);
    }

    if (this.pos > this.len) {
      throw new Error("Skipped beyond buffer length");
    }
  }
}

/**
 * Parse the fixed 16-byte header at the start of a WSS frame.
 *
 * All header fields are read as big-endian signed integers: two 16-bit
 * values (magic, cmd) followed by three 32-bit values (ctx, reserve, bodyLen).
 * Everything after the header is returned as `payloadBytes`; `bodyLen` is
 * reported but not used to bound that slice.
 *
 * @param {Uint8Array|ArrayBuffer} input - Complete frame bytes.
 * @returns {{magic: number, cmd: number, ctx: number, reserve: number, bodyLen: number, payloadBytes: Uint8Array}}
 * @throws {Error} "Socket frame too short" when fewer than 16 bytes are given.
 */
function parseSocketFrame(input) {
  const HEADER_SIZE = 16;
  const frameBytes = toArrayBufferView(input);

  if (frameBytes.length < HEADER_SIZE) {
    throw new Error("Socket frame too short");
  }

  // Honour byteOffset so that views into a larger buffer are read correctly.
  const { buffer, byteOffset, byteLength } = frameBytes;
  const header = new DataView(buffer, byteOffset, byteLength);

  // DataView getters default to big-endian when the flag is omitted.
  return {
    magic: header.getInt16(0),
    cmd: header.getInt16(2),
    ctx: header.getInt32(4),
    reserve: header.getInt32(8),
    bodyLen: header.getInt32(12),
    payloadBytes: frameBytes.slice(HEADER_SIZE)
  };
}

/**
 * 解析 TitanPayload
 *
 * 根据样例和 buildTitanData 推断:
 * field 1  => command string
 * field 2  => protocol varint
 * field 6  => appId / protocol 相关 varint,样例里为 1
 * field 10 => body bytes
 * field 12 => upstreamSeq / timestamp 类 varint
 * field 13 => host / extra 类字段,样例中为 varint
 * field 14 => compress varint
 *
 * 这里保留 unknownFields,避免字段名未完全确认影响解析主流程。
 */
function decodeTitanPayload(bytes) {
  const reader = new ProtoReader(bytes);

  const payload = {
    command: "",
    protocol: undefined,
    compress: 0,
    body: new Uint8Array(),
    extension: undefined,
    unknownFields: []
  };

  while (!reader.eof()) {
    const tag = reader.uint32();
    const fieldNo = tag >>> 3;
    const wireType = tag & 7;

    switch (fieldNo) {
      case 1:
        if (wireType !== 2) {
          reader.skipType(wireType);
        } else {
          payload.command = reader.string();
        }
        break;

      case 2:
        if (wireType === 0) {
          payload.protocol = Number(reader.uint64AsBigInt());
        } else {
          reader.skipType(wireType);
        }
        break;

      case 10:
        if (wireType !== 2) {
          reader.skipType(wireType);
        } else {
          payload.body = reader.bytesField();
        }
        break;

      case 11:
        if (wireType !== 2) {
          reader.skipType(wireType);
        } else {
          payload.extension = reader.bytesField();
        }
        break;

      case 14:
        if (wireType === 0) {
          payload.compress = Number(reader.uint64AsBigInt());
        } else {
          reader.skipType(wireType);
        }
        break;

      default: {
        const start = reader.pos;
        reader.skipType(wireType);
        payload.unknownFields.push({
          fieldNo,
          wireType,
          start
        });
      }
    }
  }

  return payload;
}

/**
 * Decode a MulticastLite protobuf message.
 *
 * message MulticastLite {
 *   uint32 bizType = 1;
 *   string groupId = 2;
 *   string msgId = 3;
 *   bytes payload = 4;
 *   bool needAck = 5;
 * }
 *
 * A field is decoded only when its wire type matches the schema above
 * (varint for fields 1 and 5, length-delimited for fields 2-4). Anything
 * else, including unknown field numbers, is skipped by wire type so that a
 * mismatched field cannot desynchronize the reader.
 *
 * @param {Uint8Array} bytes - Encoded MulticastLite.
 * @returns {{bizType: number, groupId: string, msgId: string, payload: Uint8Array, needAck: boolean}}
 */
function decodeMulticastLite(bytes) {
  const WIRE_VARINT = 0;
  const WIRE_LENGTH_DELIMITED = 2;

  const reader = new ProtoReader(bytes);

  const message = {
    bizType: 0,
    groupId: "",
    msgId: "",
    payload: new Uint8Array(),
    needAck: false
  };

  while (!reader.eof()) {
    const tag = reader.uint32();
    const fieldNo = tag >>> 3;
    const wireType = tag & 7;

    if (fieldNo === 1 && wireType === WIRE_VARINT) {
      message.bizType = reader.uint32();
    } else if (fieldNo === 2 && wireType === WIRE_LENGTH_DELIMITED) {
      message.groupId = reader.string();
    } else if (fieldNo === 3 && wireType === WIRE_LENGTH_DELIMITED) {
      message.msgId = reader.string();
    } else if (fieldNo === 4 && wireType === WIRE_LENGTH_DELIMITED) {
      message.payload = reader.bytesField();
    } else if (fieldNo === 5 && wireType === WIRE_VARINT) {
      message.needAck = reader.bool();
    } else {
      reader.skipType(wireType);
    }
  }

  return message;
}
/**
 * Rewrite raw control characters in a JSON text before parsing.
 *
 * Line feed, carriage return and tab are each replaced by two backslashes
 * followed by `n`, `r` or `t`; U+2028 is removed.
 *
 * NOTE(review): two backslashes make JSON.parse produce a literal backslash
 * plus the letter rather than the control character itself - confirm this is
 * the intended result and not an over-escaped single-backslash sequence.
 *
 * @param {string} str
 * @returns {string}
 */
function normalizeJsonString(str) {
  const substitutions = {
    "\n": "\\\\n",
    "\r": "\\\\r",
    "\t": "\\\\t",
    "\u2028": ""
  };

  return str.replace(/[\n\r\t\u2028]/g, (ch) => substitutions[ch]);
}

/**
 * Parse a JSON string, returning `fallback` instead of throwing.
 *
 * `fallback` is returned when `value` is not a string, when parsing fails
 * (a warning is logged), or when the parsed result is falsy.
 *
 * @param {*} value - Candidate JSON text.
 * @param {*} fallback - Value to return when no usable result is available.
 * @returns {*}
 */
function safeJsonParse(value, fallback) {
  if (typeof value === "string") {
    try {
      const parsed = JSON.parse(value);
      return parsed || fallback;
    } catch (err) {
      console.warn("[safeJsonParse failed]", err, value);
    }
  }

  return fallback;
}

/**
 * Decode various binary containers as UTF-8 text.
 *
 * Accepts a Uint8Array, an ArrayBuffer, a plain array of byte values, or any
 * object exposing an ArrayBuffer through `.buffer` (e.g. a protobufjs
 * ByteBuffer or other Buffer-like value). Strings are returned unchanged;
 * falsy or unrecognized input yields an empty string.
 *
 * @param {*} input
 * @returns {string}
 */
function bytesToUtf8String(input) {
  if (!input) {
    return "";
  }

  if (typeof input === "string") {
    return input;
  }

  let view = null;

  if (input instanceof Uint8Array) {
    view = input;
  } else if (input instanceof ArrayBuffer || Array.isArray(input)) {
    view = new Uint8Array(input);
  } else if (input.buffer instanceof ArrayBuffer) {
    // Fallback for Buffer-like wrappers: respect their window into the buffer.
    view = new Uint8Array(
      input.buffer,
      input.byteOffset || 0,
      input.byteLength || input.length
    );
  }

  return view === null ? "" : new TextDecoder("utf-8").decode(view);
}

/**
 * Decode the innermost business payload:
 * bytes -> UTF-8 string -> cleaned JSON text -> parsed value.
 *
 * Each stage is logged to the console. When the cleaned text cannot be
 * parsed (or parses to a falsy value), the cleaned text itself is returned.
 *
 * @param {*} payloadBytes - Payload bytes, in any form bytesToUtf8String accepts.
 * @returns {*} Parsed JSON value, or the cleaned text as a fallback.
 */
function decodeBusinessPayload(payloadBytes) {
  console.log("[decodeBusinessPayload input]", payloadBytes);

  const rawText = bytesToUtf8String(payloadBytes);
  console.log("[decodeBusinessPayload text]", rawText);

  const cleanedText = normalizeJsonString(rawText);
  const result = safeJsonParse(cleanedText, cleanedText);
  console.log("[decodeBusinessPayload parsed]", result);

  return result;
}

/**
 * Full decode entry point for one WSS message.
 *
 * Pipeline: frame header -> TitanPayload -> optional gunzip of the body ->
 * MulticastLite -> business payload. When the body is gzip-compressed,
 * `titanPayload.body` is replaced with the decompressed bytes.
 *
 * @param {Uint8Array|ArrayBuffer} input - Complete WSS frame.
 * @returns {Promise<{frame: object, titanPayload: object, multicastLite: object, businessData: *}>}
 */
async function parseWssMessage(input) {
  // Judging from the captured sample, compress === 1 means GZIP.
  const GZIP = 1;

  const frame = parseSocketFrame(input);
  const titanPayload = decodeTitanPayload(frame.payloadBytes);

  const isGzipped = titanPayload.compress === GZIP;

  if (isGzipped && titanPayload.body.byteLength > 0) {
    titanPayload.body = await gunzipBrowser(titanPayload.body);
  }

  const multicastLite = decodeMulticastLite(titanPayload.body);

  return {
    frame,
    titanPayload,
    multicastLite,
    businessData: decodeBusinessPayload(multicastLite.payload)
  };
}
/**
 * Normalize decoded business data into an array of messages.
 *
 * `businessData` may be:
 *   1. an array of messages - returned as-is;
 *   2. a single message object - wrapped in a one-element array;
 *   3. a JSON string holding either of the above - parsed, then normalized
 *      again (so nested JSON strings are unwrapped too).
 * Falsy values, unparsable strings and other primitives yield an empty array.
 *
 * @param {*} businessData
 * @returns {Array}
 */
function normalizeBusinessMessageList(businessData) {
  let current = businessData;

  // Each pass either returns or replaces a JSON string with its parsed value.
  for (;;) {
    if (!current) {
      return [];
    }

    if (Array.isArray(current)) {
      return current;
    }

    if (typeof current !== "string") {
      break;
    }

    try {
      current = JSON.parse(current);
    } catch {
      return [];
    }
  }

  return typeof current === "object" ? [current] : [];
}

/**
 * Return `message.message_data.live_chat_list` when it is an array.
 *
 * @param {*} message - A business message; may be null or undefined.
 * @returns {Array} The live chat list, or an empty array when it is absent
 *   or `message_data` is not an object.
 */
function extractLiveChatList(message) {
  const data = message && message.message_data;
  const hasData = Boolean(data) && typeof data === "object";

  return hasData && Array.isArray(data.live_chat_list)
    ? data.live_chat_list
    : [];
}

/**
 * Project decoded business data onto the core fields of interest.
 *
 * The input is first normalized to a message array. Each message is mapped
 * to its top-level identifiers plus its live chat list, and every chat entry
 * is additionally flattened into `chat_summary` for quick inspection.
 *
 * @param {*} businessData - Anything normalizeBusinessMessageList accepts.
 * @returns {Array<object>} One summary object per message.
 */
function extractCoreBusinessMessages(businessData) {
  // Pick only the chat fields worth showing at a glance.
  const summarizeChat = (chat) => {
    const {
      uid,
      nickname,
      chat_message,
      live_msg_id,
      timestamp,
      priority,
      sub_type,
      chat_sub_type,
      can_reply
    } = chat;

    return {
      uid,
      nickname,
      chat_message,
      live_msg_id,
      timestamp,
      priority,
      sub_type,
      chat_sub_type,
      can_reply
    };
  };

  const summarizeMessage = (message) => {
    const live_chat_list = extractLiveChatList(message);

    return {
      message_type: message.message_type,
      live_msg_id: message.live_msg_id,
      push_mills: message.push_mills,
      checked_show_id: message.checked_show_id,
      message_data: message.message_data,
      live_chat_list,
      chat_summary: live_chat_list.map(summarizeChat)
    };
  };

  return normalizeBusinessMessageList(businessData).map(summarizeMessage);
}

python 版本

python 复制代码
import gzip
import json
import struct
from dataclasses import dataclass
from typing import Any, Dict, List, Optional


def hex_to_bytes(hex_str: str) -> bytes:
    """Convert a hex dump to bytes, ignoring any whitespace in the input.

    Raises:
        ValueError: If the number of hex digits is odd, or (from
            ``bytes.fromhex``) if a non-hex character is present.
    """
    digits = "".join(hex_str.split())

    if len(digits) & 1:
        raise ValueError("Invalid hex length")

    return bytes.fromhex(digits)


class ProtoReader:
    """Minimal forward-only reader for the Protocol Buffers wire format.

    Wraps a bytes object and keeps a cursor (``pos``) that every read advances.
    """

    # A 64-bit varint never needs more than ceil(64 / 7) = 10 bytes.
    _MAX_VARINT_BYTES = 10
    _UINT64_MASK = 0xFFFFFFFFFFFFFFFF

    def __init__(self, data: bytes):
        self.data = data
        self.pos = 0
        self.length = len(data)

    def eof(self) -> bool:
        """Return True once the cursor has reached the end of the buffer."""
        return self.pos >= self.length

    def uint64(self) -> int:
        """Read one varint and return it as an unsigned 64-bit integer.

        Raises:
            EOFError: If the buffer ends in the middle of the varint.
            ValueError: If the varint is longer than 10 bytes.
        """
        value = 0

        for index in range(self._MAX_VARINT_BYTES):
            if self.pos >= self.length:
                raise EOFError("Unexpected EOF while reading varint")

            b = self.data[self.pos]
            self.pos += 1

            value |= (b & 0x7F) << (7 * index)

            if (b & 0x80) == 0:
                # The 10th byte can carry bits above bit 63; keep only 64 bits.
                return value & self._UINT64_MASK

        raise ValueError("Varint too long")

    def uint32(self) -> int:
        """Read one varint and return its low 32 bits."""
        return self.uint64() & 0xFFFFFFFF

    def bool(self):
        """Read one varint and interpret any non-zero value as True."""
        return self.uint64() != 0

    def bytes_field(self) -> bytes:
        """Read a length-delimited field and return its content.

        Raises:
            EOFError: If the declared length runs past the end of the buffer.
        """
        size = self.uint64()

        if self.pos + size > self.length:
            raise EOFError("Length-delimited field exceeds buffer")

        value = self.data[self.pos:self.pos + size]
        self.pos += size

        return value

    def string(self) -> str:
        """Read a length-delimited field and decode it as UTF-8."""
        return self.bytes_field().decode("utf-8")

    def skip_type(self, wire_type: int) -> None:
        """Advance past one field value of the given wire type.

        Supported wire types are 0 (varint), 1 (64-bit), 2 (length-delimited)
        and 5 (32-bit).

        Raises:
            ValueError: For any other wire type.
            EOFError: If the skip moves the cursor past the end of the buffer.
        """
        if wire_type == 0:
            self.uint64()

        elif wire_type == 1:
            self.pos += 8

        elif wire_type == 2:
            size = self.uint64()
            self.pos += size

        elif wire_type == 5:
            self.pos += 4

        else:
            raise ValueError(f"Unsupported wire type: {wire_type}")

        if self.pos > self.length:
            raise EOFError("Skipped beyond buffer length")


@dataclass
class SocketFrame:
    """Fixed 16-byte frame header plus the bytes that follow it.

    Instances are built by ``parse_socket_frame``, which unpacks the header
    as big-endian signed integers (struct format ``">hhiii"``).
    """

    # Header fields, in wire order. Their protocol meaning is not shown in
    # this file; the names follow the parser.
    magic: int  # signed 16-bit, header bytes 0-1
    cmd: int  # signed 16-bit, header bytes 2-3
    ctx: int  # signed 32-bit, header bytes 4-7
    reserve: int  # signed 32-bit, header bytes 8-11
    body_len: int  # signed 32-bit, header bytes 12-15
    payload_bytes: bytes  # everything after the 16-byte header


@dataclass
class TitanPayload:
    """Decoded TitanPayload protobuf message.

    Populated by ``decode_titan_payload``; the field numbers below are the
    ones that function reads.
    """

    command: str = ""  # protobuf field 1 (string)
    protocol: Optional[int] = None  # protobuf field 2 (varint)
    compress: int = 0  # protobuf field 14 (varint); parse_wss_message treats 1 as GZIP
    body: bytes = b""  # protobuf field 10 (bytes)
    extension: Optional[bytes] = None  # protobuf field 11 (bytes)
    # Skipped fields as dicts with "field_no", "wire_type" and "start" keys.
    # Defaults to None; decode_titan_payload passes an empty list explicitly.
    unknown_fields: Optional[List[Dict[str, Any]]] = None


@dataclass
class MulticastLite:
    """Decoded MulticastLite protobuf message.

    Populated by ``decode_multicast_lite``; the field numbers below are the
    ones that function reads.
    """

    biz_type: int = 0  # protobuf field 1, read as uint32
    group_id: str = ""  # protobuf field 2, read as string
    msg_id: str = ""  # protobuf field 3, read as string
    payload: bytes = b""  # protobuf field 4, raw bytes of the business payload
    need_ack: bool = False  # protobuf field 5, read as bool


def parse_socket_frame(data: bytes) -> SocketFrame:
    """Split a raw WSS frame into its 16-byte header fields and payload.

    The header is unpacked as big-endian signed integers: two 16-bit values
    (magic, cmd) followed by three 32-bit values (ctx, reserve, body_len).
    Everything after the header becomes ``payload_bytes``; ``body_len`` is
    reported but not used to bound it.

    Raises:
        ValueError: If ``data`` is shorter than 16 bytes.
    """
    header_size = 16

    if len(data) < header_size:
        raise ValueError("Socket frame too short")

    # unpack_from reads exactly the 16 header bytes from the start of data.
    header_fields = struct.unpack_from(">hhiii", data)

    return SocketFrame(*header_fields, data[header_size:])


def decode_titan_payload(data: bytes) -> TitanPayload:
    """Decode a TitanPayload protobuf message.

    Fields 1 (command), 2 (protocol), 10 (body), 11 (extension) and
    14 (compress) are decoded when they arrive with the expected wire type;
    with any other wire type they are skipped. Every other field number is
    skipped and recorded in ``unknown_fields``.
    """
    reader = ProtoReader(data)
    payload = TitanPayload(unknown_fields=[])

    # field number -> (expected wire type, TitanPayload attribute, reader method)
    known_fields = {
        1: (2, "command", reader.string),
        2: (0, "protocol", reader.uint64),
        10: (2, "body", reader.bytes_field),
        11: (2, "extension", reader.bytes_field),
        14: (0, "compress", reader.uint64),
    }

    while not reader.eof():
        tag = reader.uint32()
        field_no, wire_type = tag >> 3, tag & 7
        spec = known_fields.get(field_no)

        if spec is None:
            # "start" is the offset of the value, i.e. just after the tag.
            start = reader.pos
            reader.skip_type(wire_type)
            payload.unknown_fields.append({
                "field_no": field_no,
                "wire_type": wire_type,
                "start": start
            })
            continue

        expected_wire_type, attribute, read_value = spec

        if wire_type == expected_wire_type:
            setattr(payload, attribute, read_value())
        else:
            reader.skip_type(wire_type)

    return payload


def decode_multicast_lite(data: bytes) -> MulticastLite:
    """Decode a MulticastLite protobuf message.

    Field layout read here: 1 = biz_type (varint), 2 = group_id (string),
    3 = msg_id (string), 4 = payload (bytes), 5 = need_ack (varint bool).

    A field is decoded only when its wire type matches that layout. Anything
    else, including unknown field numbers, is skipped by wire type so that a
    mismatched field cannot desynchronize the reader.
    """
    wire_varint = 0
    wire_length_delimited = 2

    reader = ProtoReader(data)
    message = MulticastLite()

    while not reader.eof():
        tag = reader.uint32()
        field_no = tag >> 3
        wire_type = tag & 7

        if field_no == 1 and wire_type == wire_varint:
            message.biz_type = reader.uint32()

        elif field_no == 2 and wire_type == wire_length_delimited:
            message.group_id = reader.string()

        elif field_no == 3 and wire_type == wire_length_delimited:
            message.msg_id = reader.string()

        elif field_no == 4 and wire_type == wire_length_delimited:
            message.payload = reader.bytes_field()

        elif field_no == 5 and wire_type == wire_varint:
            message.need_ack = reader.bool()

        else:
            reader.skip_type(wire_type)

    return message


def normalize_json_string(text: str) -> str:
    """Rewrite raw control characters in a JSON text before parsing.

    Line feed, carriage return and tab are each replaced by two backslashes
    followed by ``n``, ``r`` or ``t``; U+2028 is removed.

    NOTE(review): two backslashes make the JSON parser produce a literal
    backslash plus the letter rather than the control character itself -
    confirm this is intended and not an over-escaped single-backslash sequence.
    """
    substitutions = str.maketrans({
        "\n": "\\\\n",
        "\r": "\\\\r",
        "\t": "\\\\t",
        "\u2028": "",
    })

    return text.translate(substitutions)


def safe_json_parse(text: Any, fallback: Any) -> Any:
    """Parse a JSON string, returning ``fallback`` instead of raising.

    ``fallback`` is returned when ``text`` is not a ``str``, when parsing
    fails, or when the parsed value is a falsy scalar (``None``, ``False``,
    ``0``, ``""``). Empty objects and arrays are valid results and are
    returned as parsed, matching the ``JSON.parse(value) || fallback``
    behaviour of the JavaScript version, where ``{}`` and ``[]`` are truthy.
    """
    if not isinstance(text, str):
        return fallback

    try:
        parsed = json.loads(text)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; RecursionError covers
        # pathologically nested input.
        return fallback

    if isinstance(parsed, (dict, list)):
        return parsed

    return parsed or fallback


def decode_business_payload(payload: bytes, should_decode: bool = True) -> Any:
    """Decode the innermost business payload into a JSON value.

    Mirrors the original JavaScript ``N`` function:

    1. Uint8Array -> binary string built with String.fromCharCode
    2. c().decode(binary) -> UTF-8 decode
    3. z(text) -> JSON string clean-up
    4. _(text, text) -> JSON.parse

    Args:
        payload: Raw payload bytes.
        should_decode: When True (default), decode as UTF-8 and clean the
            text before parsing. When False, map each byte to the code point
            of the same value and parse that text without cleaning.

    Returns:
        The parsed JSON value, or the text itself when it cannot be parsed.
    """
    if should_decode:
        # errors="replace" substitutes U+FFFD for malformed sequences, like
        # the non-fatal TextDecoder used by the JavaScript version, instead
        # of aborting the whole message with UnicodeDecodeError.
        text = normalize_json_string(payload.decode("utf-8", errors="replace"))
    else:
        # latin-1 maps every byte to the code point with the same value.
        text = payload.decode("latin-1")

    return safe_json_parse(text, text)


def parse_wss_message(data: bytes) -> Dict[str, Any]:
    """Decode one complete WSS message.

    Pipeline: frame header -> TitanPayload -> optional gunzip of body and
    extension -> (for ``titan.mLite`` only) MulticastLite -> business payload.

    Returns:
        A dict with keys ``frame``, ``titan_payload``, ``multicast_lite`` and
        ``business_data``. The last two are ``None`` when the command is not
        ``"titan.mLite"``.
    """
    # Judging from the source logic and the sample, compress == 1 means GZIP.
    GZIP = 1

    frame = parse_socket_frame(data)
    titan_payload = decode_titan_payload(frame.payload_bytes)

    if titan_payload.compress == GZIP:
        if titan_payload.body:
            titan_payload.body = gzip.decompress(titan_payload.body)

        if titan_payload.extension:
            titan_payload.extension = gzip.decompress(titan_payload.extension)

    if titan_payload.command == "titan.mLite":
        multicast_lite = decode_multicast_lite(titan_payload.body)
        business_data = decode_business_payload(multicast_lite.payload)
    else:
        multicast_lite = None
        business_data = None

    return dict(
        frame=frame,
        titan_payload=titan_payload,
        multicast_lite=multicast_lite,
        business_data=business_data,
    )


if __name__ == "__main__":
    # Captured sample frame as hex; hex_to_bytes strips the surrounding whitespace.
    hex_str = """
    000a00660000000000000000000001d71001689ec4c4bafdffffffff01700160c3d5b1d1b6e5d1826a0a0b746974616e2e6d4c697465300152ac031f8b080000000000000075524d4f1b311015e20248bd70e881a3c581c346b2bd5e7fec8d34a842804a3f248a686479372671e3dd8d6c6fab0871e85fed1f696743daa2aab5a5f1d89e79f3ded87bdfbeef1fbea498725c10a6091639a38271a6313b3a66b8e0981a3232bc5223868d1cc99af291c25660852bc954817eecde3da0c6c668e656a7f5caa21279f7c5ea7a6112ca9efc26ceb59bc18d528260c9a98282009f4b0221ab3e2e74e3bc8fa8244248916349299738fb0d3c33c9a0f2e10fb2f62e265442ed2aa0f2def86833d40f3518169ca9bc203c43b56975b02bbf46650a3d44c4beda922418e037487f9db5ae5eb6a619745c7521add180dbc2ee553ebea437d76fae6e73c62fde4f6e6fcedf5d8f8bb7974cbf3efb38391d9404d705073965f10b7dab00f2c976fcb7293ca7580a2e098690e420339966f5ac27442a91215369d7de771bf1e02f2d941b1cdb9aca5bd03befbd09baee63ea1a1bb46dbacf0e1037abae7c572f2117dd7dea0b3e3903ab7206767c4aa768fa38cc81baad9776a6e3a2fbfa44f2df9f043d4e4f76e8d1c13e500de903703e7cf1ec0915fd0939ef5d5862020000
    """

    raw = hex_to_bytes(hex_str)
    result = parse_wss_message(raw)

    frame, titan, lite, business = (
        result[key]
        for key in ("frame", "titan_payload", "multicast_lite", "business_data")
    )

    print("=== Socket Frame ===")
    frame_summary = {
        name: getattr(frame, name)
        for name in ("magic", "cmd", "ctx", "reserve", "body_len")
    }
    print(frame_summary)

    print("\n=== Titan Payload ===")
    titan_summary = {
        name: getattr(titan, name)
        for name in ("command", "protocol", "compress")
    }
    # Print only the size of the body, not its raw bytes.
    titan_summary["body_length"] = len(titan.body)
    print(titan_summary)

    print("\n=== MulticastLite ===")
    if lite:
        lite_summary = {
            name: getattr(lite, name)
            for name in ("biz_type", "group_id", "msg_id", "need_ack")
        }
        lite_summary["payload_length"] = len(lite.payload)
        print(lite_summary)

    print("\n=== Business Data ===")
    print(json.dumps(business, ensure_ascii=False, indent=2))
相关推荐
小雅痞1 小时前
[Java][Leetcode simple] 205. 同构字符串
java·算法·leetcode
东风破1371 小时前
DM8达梦分布式计算数据库集群DPC安装部署学习记录
数据库·学习
ikoala1 小时前
用了几周明基 RD280UG,我终于明白程序员为什么需要一台“专用显示器”
前端·后端·程序员
空太Jun1 小时前
Git 使用学习笔记
笔记·git·学习
文心快码BaiduComate1 小时前
Comate搭载DeepSeek-V4
前端·后端
豹哥学前端2 小时前
5分钟搞懂事件委托
前端·javascript·面试
多加点辣也没关系2 小时前
设计模式-策略模式
java·设计模式·策略模式
2601_953660372 小时前
Java Map集合详解与实战
java·开发语言·python
Awu12272 小时前
🍎把数学公式搬进 Web 表格:一个 VTable 实战案例
前端