LuaJit分析（十一）去除string.dump函数

Lua脚本中的string.dump函数用于生成字节码文件，根据对 luajit -b命令的分析可以得出，最终dump出字节码文件都是使用的string.dump函数。

因此即使我们打乱了指令顺序，只要先用loadfile函数（Lua的库函数，并非系统调用）加载脚本，再调用string.dump，就可以得到字节码文件；将它与标准luajit生成的字节码文件比对，即可找出差异所在。

string.dump的实现在lib_string.c文件中，实现如下：

cpp 复制代码

/*
** Implementation of string.dump: serializes a Lua function to bytecode and
** returns it as a string.
**
** Argument 1 must be a function (checked by lj_lib_checkfunc). An optional
** second argument selects stripping. Raises LJ_ERR_STRDUMP if the function is
** not a Lua function or if lj_bcwrite() reports failure (non-zero).
** Returns 1 (one result on the Lua stack).
*/
LJLIB_CF(string_dump)
{
GCfunc *fn = lj_lib_checkfunc(L, 1);
/* strip is set only when a second argument is present and tests as true. */
int strip = L->base+1 < L->top && tvistruecond(L->base+1);
SBuf *sb = lj_buf_tmp_(L); /* Assumes lj_bcwrite() doesn't use tmpbuf. */
/* Drop everything above the first argument; its slot receives the result. */
L->top = L->base+1;
if (!isluafunc(fn) || lj_bcwrite(L, funcproto(fn), writer_buf, sb, strip))
lj_err_caller(L, LJ_ERR_STRDUMP);
/* Overwrite the function argument with the string built from the buffer. */
setstrV(L, L->top-1, lj_buf_str(L, sb));
lj_gc_check(L);
return 1;
}

可以看出，它最终是调用lj_bcwrite函数，来完成字节码的生成。但是如果直接将这个函数注释掉，在编译时会出现如下错误：

提示lj_libdef.h这个文件中没有声明lj_cf_string_dump这个标识符，很明显string_dump这个函数的声明为LJLIB_CF(string_dump)，其中LJLIB_CF为宏定义，定义如下：

cpp 复制代码

/* Header of a library C function: LJLIB_CF(name) expands to the signature of
** a static function named lj_cf_<name> that takes a lua_State * and returns
** int, e.g. LJLIB_CF(string_dump) -> static int lj_cf_string_dump(lua_State *L).
*/
#define LJLIB_CF(name)    static int lj_cf_##name(lua_State *L)

从这个定义可以得出，LJLIB_CF(string_dump)展开后就是static int lj_cf_string_dump(lua_State *L)。注释掉string.dump之后，lj_cf_string_dump这个函数的定义就不存在了，但它仍然被引用。找到lj_libdef.h中的第123行，如下：

cpp 复制代码

#ifdef LJLIB_MODULE_string
#undef LJLIB_MODULE_string
/* Generated table of function pointers for the string library (excerpt).
** Every name listed here is referenced by the table, so each one needs a
** definition when this header is compiled; lj_cf_string_dump is the name
** produced by LJLIB_CF(string_dump).
*/
static const lua_CFunction
lj_lib_cf_string[] = {
  lj_ffh_string_byte,
  lj_ffh_string_char,
  lj_ffh_string_sub,
  lj_cf_string_rep,
  lj_ffh_string_reverse,
  lj_cf_string_dump,
  lj_cf_string_find,
  lj_cf_string_match,
  lj_cf_string_gmatch,
  lj_cf_string_gsub,
  lj_cf_string_format
};

这个函数指针表中引用了这个函数（lj_cf_string_dump一项），同时在lj_ffdef.h中也记录了string_dump：

cpp 复制代码

FFDEF(string_upper)
FFDEF(string_dump)
FFDEF(string_find)

同时lj_libdef.h和lj_ffdef.h这两个文件都是自动生成的，在msvcbuild.bat中，使用如下命令生成

bash 复制代码

buildvm -m ffdef -o lj_ffdef.h %ALL_LIB%
buildvm -m libdef -o lj_libdef.h %ALL_LIB%

其中%ALL_LIB%是所有lib_*.c的文件，即各种类型的库的c源文件。

从这里就可以得出，这两个头文件都是使用buildvm生成，附带不同的 -m参数，和各种库文件的c源文件。

那么我们直接定位到buildvm.c的main函数中，从main函数中可以看出它先处理了输入的参数，然后在一个switch结构中根据不同的 -m参数进行不同转换：

cpp 复制代码

case BUILD_ffdef:
case BUILD_libdef:
case BUILD_recdef:
emit_lib(ctx);
break;

可以看到这两个头文件都是使用 emit_lib函数生成，emit_lib实现如下：

cpp 复制代码

/*
** Emit C source code for library function definitions.
**
** Reads every input file listed in ctx->args line by line, searches each line
** for LIBDEF_PREFIX tags and dispatches each tag to the matching entry of
** libdef_handlers. What is written to ctx->fp depends on ctx->mode.
**
** The scan is purely textual: apart from the three #if conditions handled
** below, nothing in this function interprets C syntax, so a tag is recognized
** wherever it appears on a line.
**
** Exits the process with status 1 if an input file cannot be opened or if a
** tag with an unknown suffix is found.
*/
void emit_lib(BuildCtx *ctx)
{
  const char *fname;
  /* Mode-specific file header. */
  if (ctx->mode == BUILD_ffdef || ctx->mode == BUILD_libdef ||
      ctx->mode == BUILD_recdef)
    fprintf(ctx->fp, "/* This is a generated file. DO NOT EDIT! */\n\n");
  else if (ctx->mode == BUILD_vmdef)
    fprintf(ctx->fp, "ffnames = {\n[0]=\"Lua\",\n\"C\",\n");
  if (ctx->mode == BUILD_recdef)
    fprintf(ctx->fp, "static const uint16_t recff_idmap[] = {\n0,\n0x0100");
  /* Reset the file-level counters that the tag handlers update. */
  recffid = ffid = FF_C+1;
  ffasmfunc = 0;
 
  /* One iteration per input file; the argument list ends at a NULL entry. */
  while ((fname = *ctx->args++)) {
    char buf[256];  /* We don't care about analyzing lines longer than that. */
    FILE *fp;
    /* A lone "-" selects standard input. */
    if (fname[0] == '-' && fname[1] == '\0') {
      fp = stdin;
    } else {
      fp = fopen(fname, "r");
      if (!fp) {
  fprintf(stderr, "Error: cannot open input file '%s': %s\n",
    fname, strerror(errno));
  exit(1);
      }
    }
    /* Per-file state: no module seen yet, default registration mode. */
    modstate = 0;
    regfunc = REGFUNC_OK;
    while (fgets(buf, sizeof(buf), fp) != NULL) {
      char *p;
      /* Simplistic pre-processor. Only handles top-level #if/#endif. */
      if (buf[0] == '#' && buf[1] == 'i' && buf[2] == 'f') {
  /* Any condition other than the three below is treated as true. */
  int ok = 1;
  if (!strcmp(buf, "#if LJ_52\n"))
    ok = LJ_52;
  else if (!strcmp(buf, "#if LJ_HASJIT\n"))
    ok = LJ_HASJIT;
  else if (!strcmp(buf, "#if LJ_HASFFI\n"))
    ok = LJ_HASFFI;
  if (!ok) {
    /* Skip lines up to the matching "#en..." line, tracking nested "#if". */
    int lvl = 1;
    while (fgets(buf, sizeof(buf), fp) != NULL) {
      if (buf[0] == '#' && buf[1] == 'e' && buf[2] == 'n') {
        if (--lvl == 0) break;
      } else if (buf[0] == '#' && buf[1] == 'i' && buf[2] == 'f') {
        lvl++;
      }
    }
    continue;
  }
      }
      /* Handle every occurrence of LIBDEF_PREFIX on this line. */
      for (p = buf; (p = strstr(p, LIBDEF_PREFIX)) != NULL; ) {
  const LibDefHandler *ldh;
  p += sizeof(LIBDEF_PREFIX)-1;
  /* Find the first handler whose suffix matches the text after the prefix. */
  for (ldh = libdef_handlers; ldh->suffix != NULL; ldh++) {
    size_t n, len = strlen(ldh->suffix);
    if (!strncmp(p, ldh->suffix, len)) {
      p += len;
      /* n = length of the tag argument, up to the first stop character. */
      n = ldh->stop ? strcspn(p, ldh->stop) : 0;
      /* End of line reached before a terminator: skip the handler call. */
      if (!p[n]) break;
      /* Terminate the argument in place, then hand it to the handler. */
      p[n] = '\0';
      ldh->func(ctx, p, ldh->arg);
      p += n+1;
      break;
    }
  }
  /* The search ran to the table sentinel, so no suffix matched. */
  if (ldh->suffix == NULL) {
    /* Drop the last character (presumably the newline) for the message. */
    buf[strlen(buf)-1] = '\0';
    fprintf(stderr, "Error: unknown library definition tag %s%s\n",
      LIBDEF_PREFIX, p);
    exit(1);
  }
      }
    }
    fclose(fp);
    /* In libdef mode each input file ends with an end-of-module step. */
    if (ctx->mode == BUILD_libdef) {
      libdef_endmodule(ctx);
    }
  }
 
  /* Mode-specific file trailer. */
  if (ctx->mode == BUILD_ffdef) {
    fprintf(ctx->fp, "\n#undef FFDEF\n\n");
    fprintf(ctx->fp,
      "#ifndef FF_NUM_ASMFUNC\n#define FF_NUM_ASMFUNC %d\n#endif\n\n",
      ffasmfunc);
  } else if (ctx->mode == BUILD_vmdef) {
    fprintf(ctx->fp, "},\n\n");
  } else if (ctx->mode == BUILD_bcdef) {
    int i;
    fprintf(ctx->fp, "\n};\n\n");
    fprintf(ctx->fp, "LJ_DATADEF const uint16_t lj_bc_mode[] = {\n");
    fprintf(ctx->fp, "BCDEF(BCMODE)\n");
    /* ffasmfunc-1 entries with a trailing comma, then a final one without. */
    for (i = ffasmfunc-1; i > 0; i--)
      fprintf(ctx->fp, "BCMODE_FF,\n");
    fprintf(ctx->fp, "BCMODE_FF\n};\n\n");
  } else if (ctx->mode == BUILD_recdef) {
    /* obuf is walked as a sequence of NUL-terminated names ending at an
    ** empty string; one recff_<name> entry is emitted per name. */
    char *p = (char *)obuf;
    fprintf(ctx->fp, "\n};\n\n");
    fprintf(ctx->fp, "static const RecordFunc recff_func[] = {\n"
      "recff_nyi,\n"
      "recff_c");
    while (*p) {
      fprintf(ctx->fp, ",\nrecff_%s", p);
      p += strlen(p)+1;
    }
    fprintf(ctx->fp, "\n};\n\n");
  }
}

从这个函数可以看到，这是生成头文件的总入口函数，所有内容的生成都在这里完成。它逐行读取源文件，匹配关键字符串，提取出宏定义中的函数名，其中有一个关键的循环：

cpp 复制代码

/* Walk the handler table until the terminating entry (suffix == NULL). */
for (ldh = libdef_handlers; ldh->suffix != NULL; ldh++) {
  size_t n, len = strlen(ldh->suffix);
  /* Prefix comparison: the first entry whose suffix matches at p wins. */
  if (!strncmp(p, ldh->suffix, len)) {
    p += len;
    /* n = length of the tag argument, up to the first stop character. */
    n = ldh->stop ? strcspn(p, ldh->stop) : 0;
    /* No terminator before the end of the string: skip the handler call. */
    if (!p[n]) break;
    /* Cut the argument out in place and pass it to the handler. */
    p[n] = '\0';
    ldh->func(ctx, p, ldh->arg);
    p += n+1;
    break;
  }
}

它遍历libdef_handlers表，找到后缀与当前标记匹配的那一项，并调用该项中保存的处理函数。libdef_handlers的定义如下：

cpp 复制代码

/*
** Tag dispatch table. Each entry holds, in order: the tag suffix (the text
** expected right after LIBDEF_PREFIX), the set of characters that terminate
** the tag's argument (NULL when no argument is scanned), the handler
** function, and an int that is passed through to the handler.
**
** Entries are tried in order with a prefix comparison, so "NOREGUV" has to be
** listed before "NOREG". The all-NULL entry terminates the table.
*/
static const LibDefHandler libdef_handlers[] = {
  { "MODULE_",  " \t\r\n",  libdef_module,    0 },
  { "CF(",  ")",    libdef_func,    LIBINIT_CF },
  { "ASM(", ")",    libdef_func,    LIBINIT_ASM },
  { "ASM_(",  ")",    libdef_func,    LIBINIT_ASM_ },
  { "LUA(", ")",    libdef_lua,   0 },
  { "REC(", ")",    libdef_rec,   0 },
  { "PUSH(",  ")",    libdef_push,    0 },
  { "SET(", ")",    libdef_set,   0 },
  { "NOREGUV",  NULL,   libdef_regfunc,   REGFUNC_NOREGUV },
  { "NOREG",  NULL,   libdef_regfunc,   REGFUNC_NOREG },
  { NULL, NULL,   (LibDefFunc)0,    0 }
};

从这里可以看出，它对应了所有类型函数的处理，string_dump由libdef_func处理，libdef_func定义如下：

cpp 复制代码

/*
** Handler for the CF(, ASM( and ASM_( tags.
**
** ctx  Build context; ctx->mode selects what is emitted to ctx->fp.
** p    NUL-terminated tag argument, i.e. the function name (e.g. string_dump).
**      May be modified in place (BUILD_vmdef).
** arg  The LIBINIT_* value from the handler table entry that matched.
**
** Always advances ffid and resets regfunc to REGFUNC_OK before returning.
*/
static void libdef_func(BuildCtx *ctx, char *p, int arg)
{
  /* Count every function that is not declared with CF(. */
  if (arg != LIBINIT_CF)
    ffasmfunc++;
  if (ctx->mode == BUILD_libdef) {
    /* modstate == 0 means no module tag has been seen in this file yet. */
    if (modstate == 0) {
      fprintf(stderr, "Error: no module for function definition %s\n", p);
      exit(1);
    }
    if (regfunc == REGFUNC_NOREG) {
      /* Only a LIBINIT_FFID byte is appended to obuf; no name is written. */
      if (optr+1 > obuf+sizeof(obuf)) {
  fprintf(stderr, "Error: output buffer overflow\n");
  exit(1);
      }
      *optr++ = LIBINIT_FFID;
    } else {
      /* ASM_( functions get no entry in the emitted function pointer table. */
      if (arg != LIBINIT_ASM_) {
  /* Separate entries with ",\n" except when modstate is still 1. */
  if (modstate != 1) fprintf(ctx->fp, ",\n");
  modstate = 2;
  /* CF( (arg == 0 here) uses LABEL_PREFIX_CF, the rest LABEL_PREFIX_FFH. */
  fprintf(ctx->fp, "  %s%s", arg ? LABEL_PREFIX_FFH : LABEL_PREFIX_CF, p);
      }
      if (regfunc != REGFUNC_NOREGUV) obuf[2]++;  /* Bump hash table size. */
      /* NOREGUV functions are recorded with an empty name. */
      libdef_name(regfunc == REGFUNC_NOREGUV ? "" : p, arg);
    }
  } else if (ctx->mode == BUILD_ffdef) {
    /* One FFDEF(name) line per function. */
    fprintf(ctx->fp, "FFDEF(%s)\n", p);
  } else if (ctx->mode == BUILD_recdef) {
    /* Remember the function name in funcname after checking that it fits. */
    if (strlen(p) > sizeof(funcname)-1) {
      fprintf(stderr, "Error: function name too long: '%s'\n", p);
      exit(1);
    }
    strcpy(funcname, p);
  } else if (ctx->mode == BUILD_vmdef) {
    int i;
    /* Turn '_' into '.' from index 1 on, for as long as modname[i-1] is
    ** non-zero, then print the name as a quoted string. */
    for (i = 1; p[i] && modname[i-1]; i++)
      if (p[i] == '_') p[i] = '.';
    fprintf(ctx->fp, "\"%s\",\n", p);
  } else if (ctx->mode == BUILD_bcdef) {
    /* Non-CF functions only: emit the value returned by find_ffofs(). */
    if (arg != LIBINIT_CF)
      fprintf(ctx->fp, ",\n%d", find_ffofs(ctx, p));
  }
  /* Every function consumes one id; the NOREG/NOREGUV state is one-shot. */
  ffid++;
  regfunc = REGFUNC_OK;
}

这里输出了与lib_*.c中对应的函数名。
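总结：luajit中库函数的实现使用了宏定义，buildvm在生成对应头文件时，只是逐行对源文件做文本匹配，查找LJLIB_CF(string_dump)这样的标志，从中提取出string_dump这个名称，并将其输出到生成的头文件中。这个扫描过程并不识别C语言的注释，因此即使把函数直接注释掉，buildvm依然会识别到该函数名；要去除string.dump，必须让LJLIB_CF(string_dump)这一标记本身不再出现在源文件中（例如把整段定义删除，而不是注释掉）。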