以下代码由美团龙猫(LongCat)生成初稿,经我改正后,再由 DeepSeek 重新格式化。
重要改正点:
1. 二分查找改用 goto 控制迭代,并返回 `<row` 标签的正确起始位置。
2. 在缓冲区开头补上父标签 `<sheetData>`,使 expat 从文件中间开始也能连续解析而不报错。
c
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <expat.h>
/* Size in bytes of ParserState.temp_value, the buffer that collects the text
 * of one cell value (the terminating NUL is stored in the same buffer). */
#define MAX_CELL_CONTENT 256
/*
 * Rectangular cell range, filled in from a string such as "A1:Z100".
 * Both bounds are treated as inclusive by the parser callbacks.
 * Columns are single characters, so only one-letter columns can be stored.
 */
typedef struct {
int start_row; /* first row of the range */
int end_row; /* last row of the range */
char start_col; /* first column letter of the range */
char end_col; /* last column letter of the range */
} ParseRange;
/*
 * Per-run state shared by the expat callbacks (passed as their user data).
 */
typedef struct {
ParseRange range; /* requested rows/columns; copied in by main() */
FILE *csv; /* output stream the CSV lines are written to */
FILE *xml_file; /* not assigned or read anywhere in the visible code */
XML_Parser parser; /* the expat parser; needed to stop parsing and query the byte index */
int in_row; /* 1 between a <row> start tag and its end tag */
int current_row; /* value of the current row's r attribute, or -1 if it has none */
char current_col; /* first character of the current cell's r attribute */
int value_started; /* nonzero while the text of a <v>/<t> element is being collected */
char temp_value[MAX_CELL_CONTENT]; /* NUL-terminated text collected for the current value */
int value_len; /* number of bytes currently stored in temp_value */
int skip_row; /* 1 when the current row lies before range.start_row */
long row_start_pos; /* XML_GetCurrentByteIndex() recorded at the row's start tag */
int first_row_processed; /* not referenced in the visible code */
char first_row_max_col; /* not referenced in the visible code */
} ParserState;
/*
 * Parse one cell reference at *cursor: exactly one column letter followed by
 * a decimal row number >= 1.  Lowercase letters are folded to uppercase
 * because cell references inside the worksheet use uppercase columns.
 *
 * On success stores the column and row, advances *cursor past the reference
 * and returns 0.  Returns -1 (leaving *cursor untouched) on malformed input
 * or when the row number does not fit in an int.
 */
static int parse_cell_ref(const char **cursor, char *col, int *row)
{
    const char *p = *cursor;
    char letter = *p;
    int value = 0;

    if (letter >= 'a' && letter <= 'z') {
        letter = (char)(letter - 'a' + 'A');
    }
    if (letter < 'A' || letter > 'Z') {
        return -1;
    }
    p++;

    if (*p < '0' || *p > '9') {
        return -1; /* no digits: also rejects signs and whitespace */
    }
    while (*p >= '0' && *p <= '9') {
        int digit = *p - '0';

        if (value > (INT_MAX - digit) / 10) {
            return -1; /* would overflow int */
        }
        value = value * 10 + digit;
        p++;
    }
    if (value < 1) {
        return -1; /* worksheet rows are numbered from 1 */
    }

    *col = letter;
    *row = value;
    *cursor = p;
    return 0;
}

/*
 * Parse a range of the form "<col><row>:<col><row>", e.g. "A1:Z100".
 *
 * range_str  NUL-terminated range text; the whole string must be consumed.
 * range      receives the parsed bounds; left unmodified on failure.
 *
 * Returns 0 on success, -1 if either argument is NULL, the text is
 * malformed, or the start column/row lies after the end column/row.
 */
int parse_excel_range(const char *range_str, ParseRange *range) {
    const char *p = range_str;
    char first_col;
    char last_col;
    int first_row;
    int last_row;

    if (range_str == NULL || range == NULL) {
        return -1;
    }
    if (parse_cell_ref(&p, &first_col, &first_row) != 0) {
        return -1;
    }
    if (*p != ':') {
        return -1;
    }
    p++;
    if (parse_cell_ref(&p, &last_col, &last_row) != 0) {
        return -1;
    }
    if (*p != '\0') {
        return -1; /* trailing garbage after the second reference */
    }
    if (first_col > last_col || first_row > last_row) {
        return -1;
    }

    range->start_col = first_col;
    range->start_row = first_row;
    range->end_col = last_col;
    range->end_row = last_row;
    return 0;
}
/*
 * Scan forward from the current file position for the next "<row ...>" start
 * tag that carries an r="N" attribute with N > 0.
 *
 * On success stores the byte offset of the tag's '<' in *tag_pos and N in
 * *row_num, and returns 1.  Returns 0 when end of file is reached first or
 * the file position cannot be determined.
 */
static int next_row_tag(FILE *file, long *tag_pos, long *row_num)
{
    int c;

    while ((c = fgetc(file)) != EOF) {
        char tag[128];
        size_t len = 0;
        const char *attr;
        char *digits_end;
        long value;
        long here;

        if (c != '<') {
            continue;
        }

        /* Collect the tag text up to (not including) '>' or the buffer limit. */
        tag[len++] = '<';
        while (len < sizeof(tag) - 1 && (c = fgetc(file)) != EOF && c != '>') {
            tag[len++] = (char)c;
        }
        tag[len] = '\0';
        if (c == EOF) {
            return 0;
        }

        /* Require whitespace after "<row" so e.g. "<rowBreaks" is not matched. */
        if (strncmp(tag, "<row", 4) != 0 ||
            (tag[4] != ' ' && tag[4] != '\t' && tag[4] != '\r' && tag[4] != '\n')) {
            continue;
        }
        attr = strstr(tag, " r=\"");
        if (attr == NULL) {
            continue;
        }
        value = strtol(attr + 4, &digits_end, 10);
        if (digits_end == attr + 4 || value <= 0) {
            continue;
        }

        here = ftell(file);
        if (here < 0) {
            return 0;
        }
        /* Bytes consumed since the '<': the stored text plus '>' if it ended the tag. */
        *tag_pos = here - (long)len - (c == '>' ? 1 : 0);
        *row_num = value;
        return 1;
    }
    return 0;
}

/*
 * Binary-search the worksheet XML for the row numbered target_row.
 *
 * The search bisects byte offsets: each probe seeks to the midpoint and reads
 * forward to the next <row r="N"> tag.  Row numbers are assumed to increase
 * through the file.
 *
 * file        stream opened for reading in binary mode; its position is
 *             changed by this call.
 * target_row  row number to locate.
 *
 * Returns the byte offset of the '<' of the target row's start tag.  If that
 * row is absent, returns the offset of the closest row tag found that
 * precedes it, or 0 when no such row exists or on error.  The result is
 * therefore always a position from which parsing reaches every row
 * >= target_row.
 */
long binary_search_row(FILE *file, int target_row) {
    long low = 0;
    long high;
    long best_pos = 0; /* latest row start known to lie before the target */

    if (file == NULL || fseek(file, 0, SEEK_END) != 0) {
        return 0;
    }
    high = ftell(file);
    if (high < 0) {
        return 0;
    }

    while (low <= high) {
        long mid = low + (high - low) / 2;
        long tag_pos;
        long row_num;

        if (fseek(file, mid, SEEK_SET) != 0) {
            break;
        }
        if (!next_row_tag(file, &tag_pos, &row_num)) {
            /* No row starts at or after mid: the probe is past the last row. */
            high = mid - 1;
            continue;
        }
        if (row_num == (long)target_row) {
            return tag_pos;
        }
        if (row_num < (long)target_row) {
            best_pos = tag_pos;
            /* Every offset in [mid, tag_pos] leads to this same row. */
            low = tag_pos + 1;
        } else {
            high = mid - 1;
        }
    }
    return best_pos;
}
/*
 * expat start-tag callback.
 *
 * <row>   records the row number, stops the parser once the row is past
 *         range.end_row, marks rows before range.start_row as skipped, and
 *         otherwise writes the row number as the first CSV field.
 * <c>     remembers the cell's column (first character of its r attribute).
 * <v>/<t> starts collecting text when the cell's column is inside the range.
 *
 * user_data  the ParserState for this run.
 * name       element name.
 * attrs      NULL-terminated array of alternating attribute names and values.
 *
 * NOTE(review): only the first character of a cell reference is used, so a
 * multi-letter column such as "AA" is treated as column 'A'.
 */
void XMLCALL start_element(void *user_data, const XML_Char *name, const XML_Char **attrs) {
    ParserState *state = (ParserState *)user_data;

    if (strcmp(name, "row") == 0) {
        state->row_start_pos = XML_GetCurrentByteIndex(state->parser);
        state->in_row = 1;
        state->current_row = -1;
        state->skip_row = 0;
        /* Forget the previous row's last column so that a row without cells
         * is padded across the whole requested column range. */
        state->current_col = (char)(state->range.start_col - 1);

        for (int i = 0; attrs[i]; i += 2) {
            if (strcmp(attrs[i], "r") == 0) {
                state->current_row = atoi(attrs[i + 1]);
                break;
            }
        }

        if (state->current_row > state->range.end_row) {
            /* expat may still deliver the end-tag callback for an empty
             * element after the parser is stopped; mark the row skipped so
             * that callback writes nothing. */
            state->skip_row = 1;
            XML_StopParser(state->parser, XML_FALSE);
            return;
        }
        if (state->current_row < state->range.start_row) {
            state->skip_row = 1;
            return;
        }
        fprintf(state->csv, "%d", state->current_row);
    } else if (strcmp(name, "c") == 0 && state->in_row && !state->skip_row) {
        for (int i = 0; attrs[i]; i += 2) {
            if (strcmp(attrs[i], "r") == 0) {
                state->current_col = attrs[i + 1][0];
                break;
            }
        }
    } else if ((strcmp(name, "v") == 0 || strcmp(name, "t") == 0) &&
               state->in_row && !state->skip_row) {
        if (state->current_col >= state->range.start_col &&
            state->current_col <= state->range.end_col) {
            state->value_started = 1;
            state->value_len = 0;
            state->temp_value[0] = '\0';
        }
    }
}
/*
 * expat character-data callback: appends text to state->temp_value while a
 * cell value is being collected.
 *
 * expat may deliver one value in several calls.  When the value does not fit,
 * it is truncated to the first MAX_CELL_CONTENT - 1 bytes; once the buffer is
 * full every later chunk is ignored, so the stored text is always a prefix of
 * the real value (truncation is byte-based and may split a multi-byte
 * character).
 *
 * user_data  the ParserState for this run.
 * s          text chunk, not NUL-terminated.
 * len        number of bytes in s.
 */
void XMLCALL character_data(void *user_data, const XML_Char *s, int len) {
    ParserState *state = (ParserState *)user_data;
    int room;
    int count;

    if (!state->value_started || len <= 0) {
        return;
    }

    room = MAX_CELL_CONTENT - 1 - state->value_len; /* keep one byte for NUL */
    if (room <= 0) {
        return;
    }
    count = (len < room) ? len : room;

    memcpy(state->temp_value + state->value_len, s, (size_t)count);
    state->value_len += count;
    state->temp_value[state->value_len] = '\0';
}
/*
 * expat end-tag callback.
 *
 * </v> or </t>  writes the collected value as a CSV field, first emitting an
 *               empty field for every requested column skipped since the
 *               previous value in this row.
 * </row>        pads the remaining requested columns with empty fields and
 *               terminates the CSV line.
 *
 * The column bookkeeping lives in a function-local static, so this callback
 * supports one parse at a time.
 *
 * user_data  the ParserState for this run.
 * name       element name.
 */
void XMLCALL end_element(void *user_data, const XML_Char *name) {
    /* Next column that still needs a field on the current CSV line;
     * 0 means no value has been written for the current row yet. */
    static char next_col = 0;
    ParserState *state = (ParserState *)user_data;

    if (strcmp(name, "row") == 0 && state->in_row && !state->skip_row) {
        if (next_col == 0) {
            next_col = state->range.start_col;
        }
        /* int loop variable: a char would overflow if end_col were CHAR_MAX. */
        for (int col = next_col; col <= state->range.end_col; col++) {
            fputc(',', state->csv);
        }
        fputc('\n', state->csv);
        state->in_row = 0;
        /* Start the next row from the first requested column again. */
        next_col = 0;
    } else if ((strcmp(name, "v") == 0 || strcmp(name, "t") == 0) && state->value_started) {
        if (state->current_col >= state->range.start_col &&
            state->current_col <= state->range.end_col) {
            if (next_col == 0) {
                next_col = state->range.start_col;
            }
            for (int col = next_col; col < state->current_col; col++) {
                fputc(',', state->csv);
            }
            fprintf(state->csv, ",%s", state->temp_value);
            next_col = (char)(state->current_col + 1);
        }
        state->value_started = 0;
    }
}
int main(int argc, char *argv[]) {
if (argc != 3) {
printf("用法: %s <xml文件> <范围(A1:Z100)>\n", argv[0]);
return 1;
}
ParseRange range;
if (parse_excel_range(argv[2], &range) != 0) {
printf("错误: 无效范围格式\n");
return 1;
}
char csv_filename[256];
strncpy(csv_filename, argv[1], sizeof(csv_filename) - 1);
char *ext = strrchr(csv_filename, '.');
if (ext) strcpy(ext, ".csv");
else strncat(csv_filename, ".csv", sizeof(csv_filename) - strlen(csv_filename) - 1);
FILE *csv = fopen(csv_filename, "w");
if (!csv) {
printf("错误: 无法创建CSV\n");
return 1;
}
fprintf(csv, "Row");
for (char col = range.start_col; col <= range.end_col; col++) {
fprintf(csv, ",%c", col);
}
fprintf(csv, "\n");
FILE *file = fopen(argv[1], "rb");
if (!file) {
printf("错误: 无法打开文件 %s\n", argv[1]);
fclose(csv);
return -1;
}
long start_pos = binary_search_row(file, range.start_row);
if (start_pos > 0) {
fseek(file, start_pos, SEEK_SET);
} else {
fseek(file, 0, SEEK_SET);
}
XML_Parser parser = XML_ParserCreate(NULL);
ParserState state = {0};
state.range = range;
state.csv = csv;
state.parser = parser;
XML_SetUserData(parser, &state);
XML_SetElementHandler(parser, start_element, end_element);
XML_SetCharacterDataHandler(parser, character_data);
fseek(file, start_pos, SEEK_SET);
char buffer[8192] = "<sheetData>";
int done;
int i = 0;
do {
if (XML_GetErrorCode(parser) == XML_ERROR_FINISHED) break;
size_t len = fread(buffer + 11 * (i == 0), 1, sizeof(buffer) - 11 * (i == 0), file);
done = (len < sizeof(buffer) - 11 * (i == 0));
size_t actual_len = len;
if (!done) {
if (XML_Parse(parser, buffer, actual_len + 11 * (i == 0), done) == XML_STATUS_ERROR) {
break;
}
i++;
}
} while (!done);
fclose(file);
fclose(csv);
XML_ParserFree(parser);
printf("CSV已保存到 %s\n", csv_filename);
return 0;
}
编译、运行,并与 aich2 比较耗时:
gcc expatfmt.c -o expatfmt -lexpat -O3
root@66d4e20ec1d7:/par# time ./expatfmt lineitem/xl/worksheets/sheet1.xml A500000:Z600000
CSV已保存到 lineitem/xl/worksheets/sheet1.csv
real 0m1.865s
user 0m1.836s
sys 0m0.028s
root@66d4e20ec1d7:/par# time ./aich2 lineitem/xl/worksheets/sheet1.xml A500000:Z600000 out.csv
real 0m2.870s
user 0m1.064s
sys 0m0.076s