我改写的二分查找法 XML 转 CSV 程序,速度追上了张泽鹏先生的程序

以下代码由美团龙猫生成初稿,经我改正后,再由 DeepSeek 重新格式化。

重要改正点:

1.二分查找用goto控制迭代,返回<row的正确位置

2. 在第一个缓冲区的开头补上父标签 <sheetData>,使 expat 从文件中间开始解析时也能连续解析而不报错

c 复制代码
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <expat.h>

#define MAX_CELL_CONTENT 256

/* Inclusive rectangular cell range, as parsed from a string like "A1:Z100". */
typedef struct {
    int start_row, end_row;    /* first and last row number, both inclusive */
    char start_col, end_col;   /* first and last column letter, both inclusive */
} ParseRange;

/*
 * Mutable state shared by the expat callbacks (passed as their user data).
 * Field order is kept as-is; only the declarations are grouped.
 */
typedef struct {
    ParseRange range;                    /* rows/columns selected for export */
    FILE *csv, *xml_file;                /* csv: output stream the handlers write to;
                                            xml_file: not referenced by the code in this file */
    XML_Parser parser;                   /* parser handle, needed to stop parsing early */
    int in_row, current_row;             /* in_row: inside a <row> element;
                                            current_row: its r attribute, -1 when absent */
    char current_col;                    /* column letter of the cell being processed */
    int value_started;                   /* non-zero while text of an in-range <v>/<t> is collected */
    char temp_value[MAX_CELL_CONTENT];   /* NUL-terminated text collected for the current value */
    int value_len, skip_row;             /* value_len: bytes stored in temp_value;
                                            skip_row: current row lies before range.start_row */
    long row_start_pos;                  /* parser byte index recorded when a <row> starts */
    int first_row_processed;             /* not referenced by the code in this file */
    char first_row_max_col;              /* not referenced by the code in this file */
} ParserState;

/*
 * Parse a range such as "A1:Z100" into *range.
 *
 * Only single-letter columns are supported. Column letters are accepted in
 * either case and stored in upper case, because cell references in the
 * worksheet XML are compared against them byte-for-byte.
 *
 * range_str  NUL-terminated range text; trailing whitespace is tolerated.
 * range      receives the parsed range; left untouched on failure.
 *
 * Returns 0 on success, -1 when the text is malformed, a column is not a
 * letter, a row is smaller than 1, or the start lies after the end.
 */
int parse_excel_range(const char *range_str, ParseRange *range) {
    if (range_str == NULL || range == NULL) {
        return -1;
    }

    char first_col = 0;
    char last_col = 0;
    int first_row = 0;
    int last_row = 0;
    int consumed = 0;

    if (sscanf(range_str, "%c%d:%c%d%n",
               &first_col, &first_row, &last_col, &last_row, &consumed) != 4) {
        return -1;
    }

    /* Reject trailing garbage such as "A1:Z100x"; plain whitespace is fine. */
    for (const char *p = range_str + consumed; *p != '\0'; p++) {
        if (*p != ' ' && *p != '\t' && *p != '\r' && *p != '\n') {
            return -1;
        }
    }

    /* Normalise to upper case so that "a1:z100" behaves like "A1:Z100". */
    if (first_col >= 'a' && first_col <= 'z') {
        first_col = (char)(first_col - 'a' + 'A');
    }
    if (last_col >= 'a' && last_col <= 'z') {
        last_col = (char)(last_col - 'a' + 'A');
    }

    /* "%c" accepts any byte, so the column letters must be checked by hand. */
    if (first_col < 'A' || first_col > 'Z') return -1;
    if (last_col < 'A' || last_col > 'Z') return -1;

    /* Worksheet rows are numbered from 1; "%d" would also accept 0 and negatives. */
    if (first_row < 1 || last_row < 1) return -1;

    if (first_col > last_col) return -1;
    if (first_row > last_row) return -1;

    range->start_col = first_col;
    range->start_row = first_row;
    range->end_col = last_col;
    range->end_row = last_row;
    return 0;
}

/*
 * Scan forward from the current position of `file` for the next start tag
 * that begins with "<row" and carries an r="N" attribute.
 *
 * On success stores N in *row_num and the byte offset of the tag's '<' in
 * *tag_pos and returns 1. Returns 0 when end of file (or a read error) is
 * reached first.
 */
static int find_next_row_tag(FILE *file, int *row_num, long *tag_pos)
{
    int c;

    while ((c = fgetc(file)) != EOF) {
        if (c != '<') {
            continue;
        }

        long after_lt = ftell(file);
        if (after_lt < 1) {
            return 0;
        }

        /* Only the head of the tag is kept; the r attribute is expected there. */
        char tag[128];
        size_t len = 0;

        tag[len++] = '<';
        while ((c = fgetc(file)) != EOF && c != '>' && len < sizeof(tag) - 1) {
            tag[len++] = (char)c;
        }
        tag[len] = '\0';

        if (c == EOF) {
            return 0;   /* tag cut off by end of file */
        }
        if (strncmp(tag, "<row", 4) != 0) {
            continue;
        }

        const char *attr = strstr(tag, " r=\"");
        if (attr == NULL) {
            continue;
        }

        *row_num = (int)strtol(attr + 4, NULL, 10);
        *tag_pos = after_lt - 1;
        return 1;
    }
    return 0;
}

/*
 * Binary-search a worksheet XML file for the <row> element whose r attribute
 * equals target_row. Rows are assumed to appear in ascending r order.
 *
 * file        stream opened for reading (binary mode); its position is changed.
 * target_row  row number to look for.
 *
 * Returns the byte offset of the '<' of the matching <row> tag. When that row
 * does not exist, returns the offset of the closest preceding row that was
 * seen (a safe place to start a sequential scan), or 0 when no such row was
 * found, meaning "start from the beginning of the file".
 */
long binary_search_row(FILE *file, int target_row) {
    if (file == NULL || fseek(file, 0, SEEK_END) != 0) {
        return 0;
    }

    long low = 0;
    long high = ftell(file);
    long best_pos = 0;   /* offset of the last row seen that precedes target_row */

    /*
     * The loop condition is re-evaluated after every probe. Without that, the
     * search never terminates once the bounds cross, which happens whenever
     * target_row is missing from the file.
     */
    while (low <= high) {
        long mid = low + (high - low) / 2;
        int row_num = 0;
        long row_pos = 0;

        if (fseek(file, mid, SEEK_SET) != 0) {
            break;
        }

        if (!find_next_row_tag(file, &row_num, &row_pos)) {
            /* No row starts at or after mid: the answer lies earlier. */
            high = mid - 1;
            continue;
        }

        if (row_num == target_row) {
            return row_pos;
        }

        if (row_num < target_row) {
            if (row_pos > best_pos) {
                best_pos = row_pos;
            }
            low = mid + 1;
        } else {
            high = mid - 1;
        }
    }

    return best_pos;
}

/*
 * expat start-tag handler.
 *
 *   <row>      records the row number from its r attribute, stops the parser
 *              once the row lies past the requested range, marks rows before
 *              the range as skipped, and otherwise writes the row number as
 *              the first CSV field.
 *   <c>        records the column letter of the cell from its r attribute.
 *   <v>, <t>   start collecting text when the current cell's column is inside
 *              the requested range.
 *
 * user_data  the ParserState registered with XML_SetUserData.
 * name       element name.
 * attrs      NULL-terminated array of alternating attribute names and values.
 */
void XMLCALL start_element(void *user_data, const XML_Char *name, const XML_Char **attrs) {
    ParserState *state = (ParserState*)user_data;
    
    if (strcmp(name, "row") == 0) {
        state->row_start_pos = XML_GetCurrentByteIndex(state->parser);
        state->in_row = 1;
        state->current_row = -1;   /* stays -1 when the row has no r attribute */
        state->skip_row = 0;
        
        for (int i = 0; attrs[i]; i += 2) {
            if (strcmp(attrs[i], "r") == 0) {
                state->current_row = atoi(attrs[i+1]);
                break;
            }
        }
        
        /* Past the requested rows: nothing more to export, abort parsing. */
        if (state->current_row > state->range.end_row) {
            XML_StopParser(state->parser, 0);
            return;
        }        

        if (state->current_row < state->range.start_row) {
            state->skip_row = 1;
            return;
        }
   
        fprintf(state->csv, "%d", state->current_row);
    }
    else if (strcmp(name, "c") == 0 && state->in_row && !state->skip_row) {
        for (int i = 0; attrs[i]; i += 2) {
            if (strcmp(attrs[i], "r") == 0) {
                const XML_Char *ref = attrs[i+1];
                int second_is_letter =
                    ref[0] != '\0' &&
                    ((ref[1] >= 'A' && ref[1] <= 'Z') || (ref[1] >= 'a' && ref[1] <= 'z'));

                /*
                 * Ranges only cover single-letter columns. A reference such as
                 * "AA5" must not be mistaken for column A, so it is mapped to
                 * one past the last requested column, which is out of range.
                 */
                state->current_col = second_is_letter
                    ? (char)(state->range.end_col + 1)
                    : ref[0];
                break;
            }
        }
    }
    else if ((strcmp(name, "v") == 0 || strcmp(name, "t") == 0) && state->in_row && !state->skip_row) {
        if (state->current_col >= state->range.start_col && 
            state->current_col <= state->range.end_col) {
            state->value_started = 1;
            state->value_len = 0;
            state->temp_value[0] = '\0';
        }
    }
}

/*
 * expat character-data handler: appends text to state->temp_value while a
 * value is being collected.
 *
 * expat may deliver one text node in several calls. When the buffer fills up
 * the value is truncated at that point; a chunk is never skipped while later
 * chunks are still appended, which would leave a hole in the value.
 *
 * user_data  the ParserState registered with XML_SetUserData.
 * s          text chunk, NOT NUL-terminated.
 * len        number of bytes in s.
 */
void XMLCALL character_data(void *user_data, const XML_Char *s, int len) {
    ParserState *state = (ParserState*)user_data;

    if (!state->value_started || len <= 0) {
        return;
    }

    /* One byte is always reserved for the terminating NUL. */
    int room = MAX_CELL_CONTENT - 1 - state->value_len;
    if (room <= 0) {
        return;
    }

    int take = len;
    if (take > room) {
        take = room;
        /* Assuming UTF-8 text: do not cut in the middle of a multi-byte character. */
        while (take > 0 && (((unsigned char)s[take]) & 0xC0) == 0x80) {
            take--;
        }
    }

    memcpy(state->temp_value + state->value_len, s, (size_t)take);
    state->value_len += take;
    state->temp_value[state->value_len] = '\0';
}

/*
 * Write one CSV field. Text containing a comma, a double quote or a line
 * break is enclosed in double quotes with embedded quotes doubled; any other
 * text is written unchanged.
 */
static void write_csv_field(FILE *out, const char *text)
{
    if (strpbrk(text, ",\"\r\n") == NULL) {
        fputs(text, out);
        return;
    }

    fputc('"', out);
    for (const char *p = text; *p != '\0'; p++) {
        if (*p == '"') {
            fputc('"', out);
        }
        fputc(*p, out);
    }
    fputc('"', out);
}

/*
 * expat end-tag handler.
 *
 *   </v>, </t>  write the collected value into its column, emitting empty
 *               fields for any in-range columns skipped since the previous
 *               value of the row.
 *   </row>      pad the remaining in-range columns with empty fields and
 *               terminate the CSV line.
 *
 * user_data  the ParserState registered with XML_SetUserData.
 * name       element name.
 */
void XMLCALL end_element(void *user_data, const XML_Char *name) {
    /*
     * Next in-range column that still has to be written for the current row;
     * 0 means no value has been written for this row yet. It is reset at the
     * end of every emitted row so that each row is padded from the first
     * requested column. Function-static, so only one parse may run at a time.
     */
    static char next_col = 0;
    ParserState *state = (ParserState*)user_data;
    
    if (strcmp(name, "row") == 0 && state->in_row && !state->skip_row) {
        if (next_col == 0) next_col = state->range.start_col;

        /* int loop variable: cannot wrap around like a char could. */
        for (int col = next_col; col <= state->range.end_col; col++) {
            fputc(',', state->csv);
        }
        fputc('\n', state->csv);
        state->in_row = 0;
        next_col = 0;
    }
    else if ((strcmp(name, "v") == 0 || strcmp(name, "t") == 0) && state->value_started) {
        if (state->current_col >= state->range.start_col && 
            state->current_col <= state->range.end_col) {
            if (next_col == 0) next_col = state->range.start_col;
            
            /* Empty fields for the columns between the previous value and this one. */
            for (int col = next_col; col < state->current_col; col++) {
                fputc(',', state->csv);
            }
            fputc(',', state->csv);
            write_csv_field(state->csv, state->temp_value);
            next_col = (char)(state->current_col + 1);
        }
        state->value_started = 0;
    }
}

int main(int argc, char *argv[]) {
    if (argc != 3) {
        printf("用法: %s <xml文件> <范围(A1:Z100)>\n", argv[0]);
        return 1;
    }
    
    ParseRange range;
    if (parse_excel_range(argv[2], &range) != 0) {
        printf("错误: 无效范围格式\n");
        return 1;
    }
    
    char csv_filename[256];
    strncpy(csv_filename, argv[1], sizeof(csv_filename) - 1);
    char *ext = strrchr(csv_filename, '.');
    if (ext) strcpy(ext, ".csv");
    else strncat(csv_filename, ".csv", sizeof(csv_filename) - strlen(csv_filename) - 1);
    
    FILE *csv = fopen(csv_filename, "w");
    if (!csv) {
        printf("错误: 无法创建CSV\n");
        return 1;
    }
    
    fprintf(csv, "Row");
    for (char col = range.start_col; col <= range.end_col; col++) {
        fprintf(csv, ",%c", col);
    }
    fprintf(csv, "\n");

    FILE *file = fopen(argv[1], "rb");
    if (!file) {
        printf("错误: 无法打开文件 %s\n", argv[1]);
        fclose(csv);
        return -1;
    }
    
    long start_pos = binary_search_row(file, range.start_row);
    if (start_pos > 0) {
        fseek(file, start_pos, SEEK_SET);
    } else {
        fseek(file, 0, SEEK_SET);
    }
    
    XML_Parser parser = XML_ParserCreate(NULL);
    ParserState state = {0};
    state.range = range;
    state.csv = csv;
    state.parser = parser;
    XML_SetUserData(parser, &state);
    XML_SetElementHandler(parser, start_element, end_element);
    XML_SetCharacterDataHandler(parser, character_data);
    
    fseek(file, start_pos, SEEK_SET);
    char buffer[8192] = "<sheetData>";
    
    int done;
    int i = 0;
    do {
        if (XML_GetErrorCode(parser) == XML_ERROR_FINISHED) break;
        size_t len = fread(buffer + 11 * (i == 0), 1, sizeof(buffer) - 11 * (i == 0), file);
        done = (len < sizeof(buffer) - 11 * (i == 0));
        size_t actual_len = len;
        if (!done) {
            if (XML_Parse(parser, buffer, actual_len + 11 * (i == 0), done) == XML_STATUS_ERROR) {
                break;
            }
            i++;
        }
    } while (!done);
    
    fclose(file);
    fclose(csv);
    XML_ParserFree(parser);
    printf("CSV已保存到 %s\n", csv_filename);
    return 0;
}

编译运行和比较

复制代码
gcc expatfmt.c -o expatfmt -lexpat -O3
root@66d4e20ec1d7:/par# time ./expatfmt lineitem/xl/worksheets/sheet1.xml A500000:Z600000
CSV已保存到 lineitem/xl/worksheets/sheet1.csv

real	0m1.865s
user	0m1.836s
sys	0m0.028s

root@66d4e20ec1d7:/par# time ./aich2 lineitem/xl/worksheets/sheet1.xml A500000:Z600000 out.csv

real	0m2.870s
user	0m1.064s
sys	0m0.076s
相关推荐
roshy7 小时前
MCP(模型上下文协议)入门教程1
人工智能·大模型·agent
一碗白开水一7 小时前
【论文阅读】Far3D: Expanding the Horizon for Surround-view 3D Object Detection
论文阅读·人工智能·深度学习·算法·目标检测·计算机视觉·3d
nju_spy7 小时前
李沐深度学习论文精读(二)Transformer + GAN
人工智能·深度学习·机器学习·transformer·gan·注意力机制·南京大学
山烛7 小时前
OpenCV 银行卡号识别
人工智能·opencv·计算机视觉·图像识别·数字识别
用户5191495848457 小时前
强大的OSINT情报工具:Blackbird用户名与邮箱搜索分析平台
人工智能·aigc
非晓为骁7 小时前
【AI】AI 评测入门(二):Prompt 迭代实战从“能跑通”到“能落地”
人工智能·prompt·提示词工程·ai评测
张较瘦_7 小时前
[论文阅读] 人工智能 + 软件工程 | TDD痛点破解:LLM自动生成测试骨架靠谱吗?静态分析+专家评审给出答案
论文阅读·人工智能·软件工程
轮到我狗叫了8 小时前
力扣.1054距离相等的条形码力扣767.重构字符串力扣47.全排列II力扣980.不同路径III力扣509.斐波那契数列(记忆化搜索)
java·算法·leetcode
字节数据平台8 小时前
一客一策:Data Agent 如何重构大模型时代的智能营销
大数据·人工智能·重构