我改写的二分法XML转CSV程序,速度追上了张泽鹏先生的程序

以下代码由美团龙猫写出初稿,经我改正后,再由DeepSeek重新格式化。

重要改正点:

1.二分查找用goto控制迭代,返回<row的正确位置

2.在数据开头补上父标签<sheetData>,使expat从文件中间开始也能连续解析而不报错

c 复制代码
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <expat.h>

/* Size in bytes of the per-cell text buffer (ParserState.temp_value),
 * including room for the terminating NUL. */
#define MAX_CELL_CONTENT 256

/* Rectangular cell range given on the command line, e.g. "A1:Z100".
 * Both bounds are inclusive; columns are single characters. */
typedef struct {
    int start_row;   /* first row number to export */
    int end_row;     /* last row number to export */
    char start_col;  /* first column letter to export */
    char end_col;    /* last column letter to export */
} ParseRange;

/* Mutable state shared by the expat callbacks (passed as user data). */
typedef struct {
    ParseRange range;      /* requested rows/columns used to filter output */
    FILE *csv;             /* output stream the CSV rows are written to */
    FILE *xml_file;        /* NOTE(review): not referenced in the visible code */
    XML_Parser parser;     /* parser handle, needed by handlers to stop parsing / query offsets */
    int in_row;            /* set to 1 when a <row> start tag is seen */
    int current_row;       /* value of the current <row>'s r attribute, -1 if it has none */
    char current_col;      /* first character of the most recent <c> element's r attribute */
    int value_started;     /* non-zero while character data of an in-range <v>/<t> is being captured */
    char temp_value[MAX_CELL_CONTENT]; /* NUL-terminated text captured for the current cell */
    int value_len;         /* number of bytes currently stored in temp_value */
    int skip_row;          /* 1 when the current row lies before range.start_row */
    long row_start_pos;    /* parser byte index recorded at the current <row> start tag */
    int first_row_processed;  /* NOTE(review): not referenced in the visible code */
    char first_row_max_col;   /* NOTE(review): not referenced in the visible code */
} ParserState;

/**
 * Parses a range of the form "<col><row>:<col><row>" (e.g. "A1:Z100").
 *
 * Columns must be a single ASCII letter; lower-case letters are folded to
 * upper case so they compare correctly against the upper-case cell
 * references found in the worksheet. Rows must be >= 1. Trailing
 * whitespace is tolerated, any other trailing text is rejected.
 *
 * @param range_str  NUL-terminated range text.
 * @param range      Receives the parsed range; written only on success.
 * @return 0 on success, -1 if the text is malformed or the range is empty
 *         (start after end).
 */
int parse_excel_range(const char *range_str, ParseRange *range) {
    if (range_str == NULL || range == NULL) return -1;

    char first_col = 0;
    char last_col = 0;
    int first_row = 0;
    int last_row = 0;
    int consumed = 0;

    /* %n records how much input was consumed so trailing junk can be detected;
     * it does not count towards sscanf's return value. */
    if (sscanf(range_str, "%c%d:%c%d %n",
               &first_col, &first_row, &last_col, &last_row, &consumed) != 4) {
        return -1;
    }
    if (range_str[consumed] != '\0') return -1;

    /* Fold to upper case by hand (the file already relies on ASCII letter order). */
    if (first_col >= 'a' && first_col <= 'z') first_col = (char)(first_col - 'a' + 'A');
    if (last_col >= 'a' && last_col <= 'z') last_col = (char)(last_col - 'a' + 'A');

    if (first_col < 'A' || first_col > 'Z') return -1;
    if (last_col < 'A' || last_col > 'Z') return -1;
    if (first_row < 1) return -1;
    if (first_col > last_col) return -1;
    if (first_row > last_row) return -1;

    range->start_col = first_col;
    range->start_row = first_row;
    range->end_col = last_col;
    range->end_row = last_row;
    return 0;
}

/*
 * Scans forward from byte offset `from` for the next "<row ...>" start tag
 * that carries an r="N" attribute.
 *
 * On success stores the offset of the tag's '<' in *tag_pos and N in
 * *row_num and returns 1. Returns 0 when no such tag exists between `from`
 * and end of file (or when seeking fails).
 */
static int find_next_row_tag(FILE *file, long from, long *tag_pos, int *row_num) {
    if (fseek(file, from, SEEK_SET) != 0) return 0;

    int c;
    while ((c = fgetc(file)) != EOF) {
        if (c != '<') continue;

        /* Remember where the tag starts before consuming it, so the offset
         * stays correct even if the tag is longer than the local buffer. */
        long pos = ftell(file) - 1;

        char tag[128];            /* text after '<', up to '>' or the size limit */
        size_t len = 0;
        while (len < sizeof(tag) - 1 && (c = fgetc(file)) != EOF && c != '>') {
            tag[len++] = (char)c;
        }
        tag[len] = '\0';
        if (c == EOF) return 0;

        /* Require whitespace after "row" so names like <rowBreaks> do not match. */
        if (strncmp(tag, "row", 3) != 0) continue;
        if (tag[3] != ' ' && tag[3] != '\t' && tag[3] != '\r' && tag[3] != '\n') continue;

        const char *attr = strstr(tag + 3, " r=\"");
        if (attr == NULL) continue;

        *tag_pos = pos;
        *row_num = (int)strtol(attr + 4, NULL, 10);
        return 1;
    }
    return 0;
}

/**
 * Binary-searches a worksheet XML file by byte offset for the start of a row.
 *
 * Each probe seeks to the midpoint and reads forward to the next <row r="N">
 * tag. The search assumes rows appear in ascending r order in the file
 * (NOTE(review): not checked here).
 *
 * @param file        Stream opened for reading in binary mode; its position
 *                    is left unspecified on return.
 * @param target_row  Row number to locate.
 * @return Offset of the '<' of the row whose r equals target_row; if that
 *         row is absent, the offset of the closest preceding row; 0 when no
 *         preceding row exists or the file cannot be searched.
 */
long binary_search_row(FILE *file, int target_row) {
    if (file == NULL || fseek(file, 0, SEEK_END) != 0) return 0;

    long high = ftell(file);
    if (high < 0) return 0;

    long low = 0;
    long best_pos = 0;  /* start of the greatest row seen so far that is below the target */

    while (low <= high) {
        long mid = low + (high - low) / 2;
        long tag_pos = 0;
        int row_num = 0;

        if (!find_next_row_tag(file, mid, &tag_pos, &row_num)) {
            /* No row starts at or after mid: the answer lies before it. */
            high = mid - 1;
            continue;
        }

        if (row_num == target_row) return tag_pos;

        if (row_num < target_row) {
            best_pos = tag_pos;
            /* No row starts in [mid, tag_pos), so skip straight past this tag. */
            low = tag_pos + 1;
        } else {
            high = mid - 1;
        }
    }

    return best_pos;
}

/**
 * expat start-tag handler.
 *
 * - <row>: records the row number, stops the parser once the row is past
 *   range.end_row, marks rows before range.start_row as skipped, and
 *   otherwise writes the row number as the first CSV field.
 * - <c>: records the cell's column letter from its r attribute.
 * - <v>/<t>: starts capturing character data when the cell's column is
 *   inside the requested range.
 *
 * @param user_data  Pointer to the ParserState registered with the parser.
 * @param name       Element name.
 * @param attrs      NULL-terminated array of name/value pairs.
 */
void XMLCALL start_element(void *user_data, const XML_Char *name, const XML_Char **attrs) {
    ParserState *state = (ParserState*)user_data;

    if (strcmp(name, "row") == 0) {
        state->row_start_pos = XML_GetCurrentByteIndex(state->parser);
        state->in_row = 1;
        state->current_row = -1;
        state->skip_row = 0;

        for (int i = 0; attrs[i]; i += 2) {
            if (strcmp(attrs[i], "r") == 0) {
                state->current_row = atoi(attrs[i+1]);
                break;
            }
        }

        if (state->current_row > state->range.end_row) {
            /* expat may still deliver the end-tag callback of an empty
             * <row .../> after a stop request; mark the row skipped so that
             * callback writes nothing. */
            state->skip_row = 1;
            XML_StopParser(state->parser, 0);
            return;
        }

        if (state->current_row < state->range.start_row) {
            state->skip_row = 1;
            return;
        }

        fprintf(state->csv, "%d", state->current_row);
    }
    else if (strcmp(name, "c") == 0 && state->in_row && !state->skip_row) {
        for (int i = 0; attrs[i]; i += 2) {
            if (strcmp(attrs[i], "r") == 0) {
                const XML_Char *ref = attrs[i+1];
                char col = ref[0];

                /* A second leading letter (e.g. "AA7") means the cell lies
                 * beyond column Z. Ranges are single-letter, so map it to a
                 * value past 'Z' instead of mistaking it for column 'A'. */
                if (ref[0] != '\0' &&
                    ((ref[1] >= 'A' && ref[1] <= 'Z') || (ref[1] >= 'a' && ref[1] <= 'z'))) {
                    col = (char)('Z' + 1);
                }
                state->current_col = col;
                break;
            }
        }
    }
    else if ((strcmp(name, "v") == 0 || strcmp(name, "t") == 0) && state->in_row && !state->skip_row) {
        if (state->current_col >= state->range.start_col &&
            state->current_col <= state->range.end_col) {
            state->value_started = 1;
            state->value_len = 0;
            state->temp_value[0] = '\0';
        }
    }
}

/**
 * expat character-data handler: appends text to the current cell value
 * while a value capture is active.
 *
 * Values longer than the buffer are truncated at the end. Once truncation
 * happens no further text is accepted for this cell, so the stored value is
 * always a prefix of the real one (never a value with a gap in the middle).
 *
 * @param user_data  Pointer to the ParserState registered with the parser.
 * @param s          Text chunk (not NUL-terminated).
 * @param len        Number of bytes in s.
 */
void XMLCALL character_data(void *user_data, const XML_Char *s, int len) {
    ParserState *state = (ParserState*)user_data;
    if (!state->value_started || len <= 0) return;

    int room = (MAX_CELL_CONTENT - 1) - state->value_len;
    if (room <= 0) return;

    int take = len;
    if (take > room) {
        take = room;
        /* Do not cut a multi-byte UTF-8 sequence in half: back up while the
         * first dropped byte is a continuation byte (10xxxxxx). */
        while (take > 0 && ((unsigned char)s[take] & 0xC0) == 0x80) {
            take--;
        }
    }

    memcpy(state->temp_value + state->value_len, s, (size_t)take);
    state->temp_value[state->value_len + take] = '\0';

    if (take < len) {
        /* Truncated: pin value_len at capacity to mark the buffer closed.
         * The text itself stays NUL-terminated at its real end. */
        state->value_len = MAX_CELL_CONTENT - 1;
    } else {
        state->value_len += take;
    }
}

/**
 * expat end-tag handler.
 *
 * - </v> or </t>: writes the captured value as the field for the current
 *   cell's column, first emitting one empty field for every in-range column
 *   that was skipped since the previous value of this row.
 * - </row>: emits empty fields for the remaining in-range columns and ends
 *   the CSV line.
 *
 * Every field is written with a leading comma, because the row number
 * written at the row's start is the first field of the line.
 *
 * @param user_data  Pointer to the ParserState registered with the parser.
 * @param name       Element name.
 */
void XMLCALL end_element(void *user_data, const XML_Char *name) {
    ParserState *state = (ParserState*)user_data;

    /* Next in-range column that still needs a field in the current output
     * row; 0 means no field has been written for this row yet. It is reset
     * at every row end so that a sparse row does not inherit the position
     * reached by the previous row. */
    static char next_col = 0;

    if (strcmp(name, "row") == 0 && state->in_row && !state->skip_row) {
        int first_missing = next_col ? next_col : state->range.start_col;
        for (int col = first_missing; col <= state->range.end_col; col++) {
            fputc(',', state->csv);
        }
        fputc('\n', state->csv);
        state->in_row = 0;
        next_col = 0;
    }
    else if ((strcmp(name, "v") == 0 || strcmp(name, "t") == 0) && state->value_started) {
        if (state->current_col >= state->range.start_col &&
            state->current_col <= state->range.end_col) {
            if (next_col == 0) next_col = state->range.start_col;

            /* Empty fields for columns between the last value and this one. */
            for (int col = next_col; col < state->current_col; col++) {
                fputc(',', state->csv);
            }
            /* NOTE(review): the value is written verbatim; text containing
             * commas, quotes or newlines is not CSV-quoted. */
            fprintf(state->csv, ",%s", state->temp_value);
            next_col = (char)(state->current_col + 1);
        }
        state->value_started = 0;
    }
}

int main(int argc, char *argv[]) {
    if (argc != 3) {
        printf("用法: %s <xml文件> <范围(A1:Z100)>\n", argv[0]);
        return 1;
    }
    
    ParseRange range;
    if (parse_excel_range(argv[2], &range) != 0) {
        printf("错误: 无效范围格式\n");
        return 1;
    }
    
    char csv_filename[256];
    strncpy(csv_filename, argv[1], sizeof(csv_filename) - 1);
    char *ext = strrchr(csv_filename, '.');
    if (ext) strcpy(ext, ".csv");
    else strncat(csv_filename, ".csv", sizeof(csv_filename) - strlen(csv_filename) - 1);
    
    FILE *csv = fopen(csv_filename, "w");
    if (!csv) {
        printf("错误: 无法创建CSV\n");
        return 1;
    }
    
    fprintf(csv, "Row");
    for (char col = range.start_col; col <= range.end_col; col++) {
        fprintf(csv, ",%c", col);
    }
    fprintf(csv, "\n");

    FILE *file = fopen(argv[1], "rb");
    if (!file) {
        printf("错误: 无法打开文件 %s\n", argv[1]);
        fclose(csv);
        return -1;
    }
    
    long start_pos = binary_search_row(file, range.start_row);
    if (start_pos > 0) {
        fseek(file, start_pos, SEEK_SET);
    } else {
        fseek(file, 0, SEEK_SET);
    }
    
    XML_Parser parser = XML_ParserCreate(NULL);
    ParserState state = {0};
    state.range = range;
    state.csv = csv;
    state.parser = parser;
    XML_SetUserData(parser, &state);
    XML_SetElementHandler(parser, start_element, end_element);
    XML_SetCharacterDataHandler(parser, character_data);
    
    fseek(file, start_pos, SEEK_SET);
    char buffer[8192] = "<sheetData>";
    
    int done;
    int i = 0;
    do {
        if (XML_GetErrorCode(parser) == XML_ERROR_FINISHED) break;
        size_t len = fread(buffer + 11 * (i == 0), 1, sizeof(buffer) - 11 * (i == 0), file);
        done = (len < sizeof(buffer) - 11 * (i == 0));
        size_t actual_len = len;
        if (!done) {
            if (XML_Parse(parser, buffer, actual_len + 11 * (i == 0), done) == XML_STATUS_ERROR) {
                break;
            }
            i++;
        }
    } while (!done);
    
    fclose(file);
    fclose(csv);
    XML_ParserFree(parser);
    printf("CSV已保存到 %s\n", csv_filename);
    return 0;
}

编译运行和比较

复制代码
gcc expatfmt.c -o expatfmt -lexpat -O3
root@66d4e20ec1d7:/par# time ./expatfmt lineitem/xl/worksheets/sheet1.xml A500000:Z600000
CSV已保存到 lineitem/xl/worksheets/sheet1.csv

real	0m1.865s
user	0m1.836s
sys	0m0.028s

root@66d4e20ec1d7:/par# time ./aich2 lineitem/xl/worksheets/sheet1.xml A500000:Z600000 out.csv

real	0m2.870s
user	0m1.064s
sys	0m0.076s
相关推荐
Blacol2 分钟前
【MCP】Caldav个人日程助手
人工智能·mcp
l12345sy11 分钟前
Day31_【 NLP _1.文本预处理 _(4)文本特征处理、文本数据增强】
人工智能·深度学习·自然语言处理
说私域12 分钟前
开源AI智能名片链动2+1模式S2B2C商城小程序在公益课裂变法中的应用与影响研究
人工智能·小程序
jndingxin16 分钟前
算法面试(6)------mAP 是什么?如何计算?P-R 曲线怎么看?
开发语言·算法
0xCode 小新21 分钟前
【C语言内存函数完全指南】:memcpy、memmove、memset、memcmp 的用法、区别与模拟实现(含代码示例)
linux·c语言·人工智能·深度学习·机器学习·容器·内存函数
Elastic 中国社区官方博客22 分钟前
如何在 vscode 里配置 MCP 并连接到 Elasticsearch
大数据·人工智能·vscode·elasticsearch·搜索引擎·ai·mcp
三掌柜66638 分钟前
2025三掌柜赠书活动第三十五期 AI辅助React Web应用开发实践:基于React 19和GitHub Copilot
前端·人工智能·react.js
无限进步_43 分钟前
【C语言】计算两个整数二进制表示中不同位的个数
c语言·开发语言
机器之心1 小时前
强强联手!深度求索、寒武纪同步发布DeepSeek-V3.2模型架构和基于vLLM的模型适配源代码
人工智能·openai
机器之心1 小时前
Claude Sonnet 4.5来了!能连续编程30多小时、1.1万行代码
人工智能·openai