前端基建：AST 的基本逻辑

什么是 AST

对于前端来说，ast（Abstract Syntax Tree，抽象语法树）就是用 js 对象组成的树形结构去描述代码的语法结构。ast 编译在构建工具中有大量应用，例如 es6 转 es5、ts 转 js 等等。ast 编译的过程简单来说可以分为四步：

词法分析，将完整的代码拆分为一个个最小的词法单元，在计算机中可以称为 token；
语法分析，按照语法规则把扁平的 token 序列组织成有层级的树，即解析为 ast（这一步只建立结构，const -> var 这类改写属于下一步的代码转换）；
代码转换，遍历 ast 并将其转换为目标语言对应的数据结构，ast -> newAst，例如构建工具中 loader 所做的转换；
生成代码，将 newAst 输出为代码。

接下来参照最小编译器 the-super-tiny-compiler 的例子，简单梳理一下 ast 的逻辑，这个例子是将 input 的语法转为 output 语法：

javascript 复制代码

// Sample program written in the LISP-style source syntax.
const input = "(add 2 (subtract 4 2))";
// The same program in the target call syntax the compiler should emit.
const output = "add(2, subtract(4, 2));";

实现词法分析

此时不涉及结构，我们可以逐个字符遍历输入，把识别出的 token 依次添加到数组中。

javascript 复制代码

/**
 * Lexical analysis: splits a LISP-style source string into a flat token list.
 *
 * Recognised tokens:
 *   - paren:  "(" or ")"
 *   - number: a run of ASCII digits (the value is kept as a string)
 *   - name:   a run of ASCII letters
 *   - string: the text between two double quotes (quotes stripped, no escapes)
 * Whitespace between tokens is skipped.
 *
 * @param {string} input - Source code to tokenize.
 * @returns {{type: string, value: string}[]} Tokens in source order.
 * @throws {TypeError} On an unrecognised character or an unterminated string.
 */
const tokenizer = (input) => {
  // Built once per call instead of once per character.
  const WHITESPACE = /\s/;
  const NUMBERS = /[0-9]/;
  const LETTERS = /[a-z]/i;

  const tokens = [];
  let current = 0;

  // Advances `current` while the character under it matches `pattern` and
  // returns the consumed text. The explicit length check matters: reading past
  // the end yields `undefined`, which RegExp#test coerces to the string
  // "undefined" -- and that matches LETTERS, which would loop forever.
  const readWhile = (pattern) => {
    const start = current;
    while (current < input.length && pattern.test(input[current])) {
      current++;
    }
    return input.slice(start, current);
  };

  while (current < input.length) {
    const char = input[current];

    // Parentheses are single-character tokens.
    if (char === "(" || char === ")") {
      tokens.push({ type: "paren", value: char });
      current++;
      continue;
    }

    // Whitespace only separates tokens; it produces none.
    if (WHITESPACE.test(char)) {
      current++;
      continue;
    }

    // A run of digits forms one number token.
    if (NUMBERS.test(char)) {
      tokens.push({ type: "number", value: readWhile(NUMBERS) });
      continue;
    }

    // A run of letters forms one name token (e.g. add, subtract).
    if (LETTERS.test(char)) {
      tokens.push({ type: "name", value: readWhile(LETTERS) });
      continue;
    }

    // Double-quoted string: everything up to the next quote is the value.
    if (char === '"') {
      const closing = input.indexOf('"', current + 1);
      if (closing === -1) {
        // Without this guard the scan would run past the end of input forever.
        throw new TypeError("字符串缺少结束引号");
      }
      tokens.push({ type: "string", value: input.slice(current + 1, closing) });
      current = closing + 1; // resume after the closing quote
      continue;
    }

    throw new TypeError("未知类型");
  }

  return tokens;
};

运行 tokenizer(input) 得到以下内容：

javascript 复制代码

// Flat token list produced by tokenizer(input) for the sample program.
// Note that number values are kept as strings.
const tokens = [
  { type: "paren", value: "(" },
  { type: "name", value: "add" },
  { type: "number", value: "2" },
  { type: "paren", value: "(" },
  { type: "name", value: "subtract" },
  { type: "number", value: "4" },
  { type: "number", value: "2" },
  { type: "paren", value: ")" },
  { type: "paren", value: ")" },
]

实现语法分析

刚刚通过词法分析方法，得到 tokens 数组，接下来通过递归将 tokens 这个扁平数组整合到树形结构中：

javascript 复制代码

/**
 * Syntax analysis: folds the flat token list into a tree (the AST).
 *
 * Grammar handled:
 *   - number token            -> { type: "NumberLiteral", value }
 *   - string token            -> { type: "StringLiteral", value }
 *   - "(" callee params... ")" -> { type: "CallExpression", name, params }
 * Every top-level expression is appended to `Program.body`.
 *
 * @param {{type: string, value: string}[]} tokens - Output of the tokenizer.
 * @returns {{type: "Program", body: object[]}} Root node of the AST.
 * @throws {TypeError} When a token cannot start an expression (message is the
 *   token type) or when the input ends inside an unclosed call expression.
 */
const parser = (tokens) => {
  let current = 0;

  // Returns the token under the cursor, failing loudly at end of input so an
  // unclosed "(" reports a clear error instead of dereferencing `undefined`.
  const peek = () => {
    const token = tokens[current];
    if (token === undefined) {
      throw new TypeError("意外的输入结束");
    }
    return token;
  };

  const isClosingParen = (token) =>
    token.type === "paren" && token.value === ")";

  // Parses exactly one expression starting at `current` and leaves `current`
  // on the first token after it.
  const walk = () => {
    const token = peek();

    if (token.type === "number") {
      current++;
      return {
        type: "NumberLiteral",
        value: token.value,
      };
    }

    if (token.type === "string") {
      current++;
      return {
        type: "StringLiteral",
        value: token.value,
      };
    }

    // "(" opens a call expression; this is where nesting is introduced.
    if (token.type === "paren" && token.value === "(") {
      // The token right after "(" names the function being called.
      current++;
      const callee = peek();
      const node = {
        type: "CallExpression",
        name: callee.value,
        params: [],
      };

      // Everything up to the matching ")" is an argument; nested calls are
      // handled by the recursive walk(), which advances `current` itself.
      current++;
      while (!isClosingParen(peek())) {
        node.params.push(walk());
      }

      current++; // consume the closing ")"
      return node;
    }

    // Any other token (stray ")", bare name, ...) cannot start an expression.
    // Throwing here is essential: returning without advancing `current` would
    // make the caller's loop spin forever.
    throw new TypeError(token.type);
  };

  const ast = {
    type: "Program",
    body: [],
  };

  while (current < tokens.length) {
    ast.body.push(walk());
  }

  return ast;
};

运行语法分析函数 parser(tokens)，可以得到下面的数据结构，也就是 ast。

javascript 复制代码

// AST produced by parser(tokens) for the sample program: a Program root whose
// body holds one CallExpression with a nested CallExpression as its second param.
const ast = {
  type: "Program",
  body: [
    {
      type: "CallExpression",
      name: "add",
      params: [
        { type: "NumberLiteral", value: "2" },
        {
          type: "CallExpression",
          name: "subtract",
          params: [
            { type: "NumberLiteral", value: "4" },
            { type: "NumberLiteral", value: "2" },
          ],
        },
      ],
    },
  ],
};

实现代码转换

代码转换的过程跟构建工具的钩子函数作用是一样的，对 ast 进行转换，得到新的 ast。

javascript 复制代码

/**
 * Depth-first walk over the AST that fires visitor hooks.
 *
 * `visitor` maps a node type to an object with optional `enter` and `exit`
 * functions. For each node, `enter(node, parent)` runs before its children are
 * visited and `exit(node, parent)` runs after. The root is visited with a
 * `null` parent.
 *
 * @param {object} ast - Root node to start from.
 * @param {object} visitor - Hooks keyed by node type.
 * @throws {TypeError} When a node has a type other than Program,
 *   CallExpression, NumberLiteral or StringLiteral (message is the type).
 */
const traverser = (ast, visitor) => {
  const visit = (node, parent) => {
    const hooks = visitor[node.type];

    // Pre-order hook.
    if (hooks && hooks.enter) {
      hooks.enter(node, parent);
    }

    // Work out which child list (if any) this kind of node carries. The type
    // is read after `enter` has run, and unknown types are rejected only at
    // this point, i.e. after their `enter` hook has already fired.
    const kind = node.type;
    let children;
    if (kind === "Program") {
      children = node.body;
    } else if (kind === "CallExpression") {
      children = node.params;
    } else if (kind === "NumberLiteral" || kind === "StringLiteral") {
      children = []; // leaves
    } else {
      throw new TypeError(kind);
    }

    // The current node becomes the parent of each of its children.
    children.forEach((child) => {
      visit(child, node);
    });

    // Post-order hook.
    if (hooks && hooks.exit) {
      hooks.exit(node, parent);
    }
  };

  visit(ast, null);
};

/**
 * Converts the LISP-style AST into a new AST shaped for the target language.
 *
 * Each source node that can contain children is tagged with a `_context`
 * property pointing at the array in the new tree that should receive its
 * converted children; the visitor hooks then push into `parent._context`.
 * Note that this writes `_context` onto the nodes of the input `ast`.
 *
 * @param {object} ast - Program node produced by the parser.
 * @returns {{type: "Program", body: object[]}} The newly built AST.
 */
const transformer = (ast) => {
  const newAst = {
    type: "Program",
    body: [],
  };
  // Top-level results land directly in the new Program body.
  ast._context = newAst.body;

  // Literals are copied over unchanged into whatever list their parent owns.
  const copyLiteral = (kind) => ({
    enter(node, parent) {
      parent._context.push({
        type: kind,
        value: node.value,
      });
    },
  });

  traverser(ast, {
    NumberLiteral: copyLiteral("NumberLiteral"),

    StringLiteral: copyLiteral("StringLiteral"),

    CallExpression: {
      enter(node, parent) {
        const call = {
          type: "CallExpression",
          callee: {
            type: "Identifier",
            name: node.name,
          },
          arguments: [],
        };
        // Arguments visited below this node are collected here.
        node._context = call.arguments;

        // A call that is not itself an argument of another call is a full
        // statement, so it gets wrapped in an ExpressionStatement.
        const emitted =
          parent.type === "CallExpression"
            ? call
            : {
                type: "ExpressionStatement",
                expression: call,
              };
        parent._context.push(emitted);
      },
    },
  });

  return newAst;
};

执行 transformer(ast) 可以得到新的 ast：

javascript 复制代码

// Result of transformer(ast) for the sample program: the top-level call is
// wrapped in an ExpressionStatement and each call now has a `callee`
// Identifier plus an `arguments` list.
const newAst = {
  type: "Program",
  body: [
    {
      type: "ExpressionStatement",
      expression: {
        type: "CallExpression",
        callee: {
          type: "Identifier",
          name: "add",
        },
        arguments: [
          {
            type: "NumberLiteral",
            value: "2",
          },
          {
            type: "CallExpression",
            callee: {
              type: "Identifier",
              name: "subtract",
            },
            arguments: [
              {
                type: "NumberLiteral",
                value: "4",
              },
              {
                type: "NumberLiteral",
                value: "2",
              },
            ],
          },
        ],
      },
    },
  ],
};

实现代码生成

最后一个阶段：根据目标代码的格式，对 newAst 中不同类型的节点分别做处理，递归生成真实代码。

javascript 复制代码

/**
 * Code generation: recursively prints a node of the transformed AST as source
 * text in the target syntax.
 *
 * @param {object} node - Node of the new AST (Program, ExpressionStatement,
 *   CallExpression, Identifier, NumberLiteral or StringLiteral).
 * @returns {string} Generated code for `node` and everything beneath it.
 * @throws {TypeError} For any other node type (message is the type).
 */
const codeGenerator = (node) => {
  const { type } = node;

  // Program: one line per top-level statement.
  if (type === "Program") {
    const lines = node.body.map(codeGenerator);
    return lines.join("\n");
  }

  // Statement: the wrapped expression followed by a semicolon.
  if (type === "ExpressionStatement") {
    return `${codeGenerator(node.expression)};`;
  }

  // Call: callee, then the comma-separated arguments in parentheses.
  if (type === "CallExpression") {
    const callee = codeGenerator(node.callee);
    const args = node.arguments.map(codeGenerator).join(", ");
    return `${callee}(${args})`;
  }

  // Identifiers and numbers print as they are.
  if (type === "Identifier") {
    return node.name;
  }
  if (type === "NumberLiteral") {
    return node.value;
  }

  // Strings are re-wrapped in double quotes.
  if (type === "StringLiteral") {
    return `"${node.value}"`;
  }

  throw new TypeError(type);
};

执行 codeGenerator(newAst) 得到最终输出结果：

javascript 复制代码

// Generated code for the sample input.
add(2, subtract(4, 2));

最后是整合函数：

javascript 复制代码

/**
 * Runs the whole pipeline on a source string:
 * tokenizer -> parser -> transformer -> codeGenerator.
 *
 * @param {string} input - Source code in the LISP-style syntax.
 * @returns {string} The generated code.
 */
const compiler = (input) =>
  codeGenerator(transformer(parser(tokenizer(input))));