我改写的二分法XML转CSV文件程序,速度追上了张泽鹏先生的程序

以下代码由美团龙猫写出初稿,经我改正,再由DeepSeek重新格式化。

重要改正点:

1.二分查找用goto控制迭代,返回<row的正确位置

2.在缓冲区开头填上父标签<sheetData>,使expat从文件中间的<row开始也能连续解析而不报错

c 复制代码
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <expat.h>

#define MAX_CELL_CONTENT 256

/*
 * Rectangular cell range to export, as parsed from a string such as
 * "A1:Z100". Both bounds are inclusive: the element handlers keep rows with
 * start_row <= r <= end_row and cells with start_col <= column <= end_col.
 */
typedef struct {
    int start_row;   /* first row number to export */
    int end_row;     /* last row number to export */
    char start_col;  /* first column, stored as a single character */
    char end_col;    /* last column, stored as a single character */
} ParseRange;

/*
 * State shared between the expat callbacks (passed as user data).
 */
typedef struct {
    ParseRange range;       /* rows/columns that should be written out */
    FILE *csv;              /* output stream the handlers write CSV text to */
    FILE *xml_file;         /* not referenced by the code visible in this file */
    XML_Parser parser;      /* parser handle; used to query the byte index and to stop parsing */
    int in_row;             /* set to 1 when a <row> starts, cleared when an exported row ends */
    int current_row;        /* value of the current row's "r" attribute, -1 if it has none */
    char current_col;       /* first character of the "r" attribute of the latest <c> */
    int value_started;      /* non-zero while text of an in-range <v>/<t> is being collected */
    char temp_value[MAX_CELL_CONTENT];  /* NUL-terminated text collected for the current value */
    int value_len;          /* number of bytes currently stored in temp_value */
    int skip_row;           /* set when the current row lies before range.start_row */
    long row_start_pos;     /* parser byte index recorded at the latest <row>; only written here */
    int first_row_processed;   /* not referenced by the code visible in this file */
    char first_row_max_col;    /* not referenced by the code visible in this file */
} ParserState;

/*
 * Parse a cell range of the form "<col><row>:<col><row>", e.g. "A1:Z100".
 *
 * Only single-letter columns are supported because ParseRange stores a
 * column as one char. Lower-case letters are accepted and normalised to
 * upper case so they compare correctly against cell references such as "A1".
 *
 * Parameters:
 *   range_str  NUL-terminated range text.
 *   range      receives the parsed range; written only on success.
 *
 * Returns 0 on success, -1 if the text is malformed, a column is not a
 * letter, a row is smaller than 1, or the start lies after the end.
 */
int parse_excel_range(const char *range_str, ParseRange *range) {
    if (range_str == NULL || range == NULL) {
        return -1;
    }

    ParseRange parsed;
    /* %9d caps the number of digits so the value always fits a 32-bit int. */
    if (sscanf(range_str, "%c%9d:%c%9d",
               &parsed.start_col, &parsed.start_row,
               &parsed.end_col, &parsed.end_row) != 4) {
        return -1;
    }

    /* Fold lower-case column letters to upper case. */
    if (parsed.start_col >= 'a' && parsed.start_col <= 'z') {
        parsed.start_col = (char)(parsed.start_col - 'a' + 'A');
    }
    if (parsed.end_col >= 'a' && parsed.end_col <= 'z') {
        parsed.end_col = (char)(parsed.end_col - 'a' + 'A');
    }

    /* %c matches any character, so reject anything that is not a letter. */
    if (parsed.start_col < 'A' || parsed.start_col > 'Z') return -1;
    if (parsed.end_col < 'A' || parsed.end_col > 'Z') return -1;

    /* Spreadsheet rows are numbered from 1. */
    if (parsed.start_row < 1 || parsed.end_row < 1) return -1;

    if (parsed.start_col > parsed.end_col) return -1;
    if (parsed.start_row > parsed.end_row) return -1;

    *range = parsed;
    return 0;
}

/*
 * Starting at byte offset `from`, scan forward for the next opening <row ...>
 * tag that carries an r="N" attribute.
 *
 * On success stores the offset of the tag's '<' in *tag_pos and N in
 * *row_num and returns 1. Returns 0 if the end of the file is reached first
 * or the seek fails.
 */
static int find_next_row_tag(FILE *file, long from, long *tag_pos, int *row_num) {
    if (fseek(file, from, SEEK_SET) != 0) {
        return 0;
    }

    int c;
    while ((c = fgetc(file)) != EOF) {
        if (c != '<') {
            continue;
        }

        long lt_pos = ftell(file) - 1;

        /* Collect the start of the tag (up to, not including, '>'). */
        char tag[128];
        size_t tag_len = 0;
        tag[tag_len++] = '<';
        while (tag_len < sizeof(tag) - 1 && (c = fgetc(file)) != EOF && c != '>') {
            tag[tag_len++] = (char)c;
        }
        tag[tag_len] = '\0';

        if (strncmp(tag, "<row", 4) != 0) {
            continue;
        }
        /* Require whitespace after the name so <rowBreaks> etc. do not match. */
        if (tag[4] != ' ' && tag[4] != '\t' && tag[4] != '\r' && tag[4] != '\n') {
            continue;
        }

        const char *row_attr = strstr(tag, " r=\"");
        if (row_attr == NULL) {
            continue;
        }

        *tag_pos = lt_pos;
        *row_num = atoi(row_attr + 4);
        return 1;
    }
    return 0;
}

/*
 * Binary-search a worksheet XML file for the <row> element numbered
 * target_row. Rows are assumed to appear in ascending order of their "r"
 * attribute.
 *
 * Each probe seeks to the middle of the remaining byte range and looks at
 * the first row tag found at or after that offset.
 *
 * Parameters:
 *   file        stream opened in binary mode; its position is changed.
 *   target_row  row number to locate.
 *
 * Returns the byte offset of the '<' of the matching <row> tag. If that row
 * does not exist, returns the offset of the last row tag whose number is
 * smaller than target_row, or 0 when there is none (or on I/O failure), so
 * the caller can always start reading from the returned offset.
 */
long binary_search_row(FILE *file, int target_row) {
    if (file == NULL || fseek(file, 0, SEEK_END) != 0) {
        return 0;
    }

    long high = ftell(file);
    if (high < 0) {
        return 0;
    }

    long low = 0;
    long best_pos = 0;

    /* The loop condition is evaluated on every iteration, so the search
     * terminates even when target_row is not present in the file. */
    while (low <= high) {
        long mid = low + (high - low) / 2;
        long tag_pos = 0;
        int row_num = 0;

        if (!find_next_row_tag(file, mid, &tag_pos, &row_num) || row_num > target_row) {
            /* Nothing usable at or after mid: the answer lies to the left. */
            high = mid - 1;
        } else if (row_num == target_row) {
            return tag_pos;
        } else {
            /* Every offset in [mid, tag_pos] leads to this same tag. */
            best_pos = tag_pos;
            low = tag_pos + 1;
        }
    }

    return best_pos;
}

/*
 * Expat start-tag handler.
 *
 *   <row>    records the row number from its "r" attribute, stops the parser
 *            once a row past range.end_row is reached, marks rows before
 *            range.start_row as skipped, and otherwise writes the row number
 *            as the first CSV field.
 *   <c>      records the column of the cell from its "r" attribute.
 *   <v>/<t>  starts collecting text when the current cell's column is inside
 *            the requested range.
 *
 * user_data must point to the ParserState registered with the parser.
 */
void XMLCALL start_element(void *user_data, const XML_Char *name, const XML_Char **attrs) {
    ParserState *state = (ParserState*)user_data;

    if (strcmp(name, "row") == 0) {
        state->row_start_pos = XML_GetCurrentByteIndex(state->parser);
        state->in_row = 1;
        state->current_row = -1;
        state->skip_row = 0;
        /* Forget the previous row's column so a row without cells is padded
         * from the first requested column rather than from a stale one. */
        state->current_col = (char)(state->range.start_col - 1);

        for (int i = 0; attrs[i]; i += 2) {
            if (strcmp(attrs[i], "r") == 0) {
                state->current_row = atoi(attrs[i+1]);
                break;
            }
        }

        /* Rows are in ascending order: nothing after end_row is wanted. */
        if (state->current_row > state->range.end_row) {
            XML_StopParser(state->parser, 0);
            return;
        }

        if (state->current_row < state->range.start_row) {
            state->skip_row = 1;
            return;
        }

        fprintf(state->csv, "%d", state->current_row);
    }
    else if (strcmp(name, "c") == 0 && state->in_row && !state->skip_row) {
        for (int i = 0; attrs[i]; i += 2) {
            if (strcmp(attrs[i], "r") == 0) {
                const XML_Char *ref = attrs[i+1];
                /* A second letter (e.g. "AA5") means a column past Z. Columns
                 * are tracked as one char, so map those to the value just
                 * after 'Z' instead of mistaking them for column 'A'. */
                int past_z = ref[0] != '\0' &&
                             ((ref[1] >= 'A' && ref[1] <= 'Z') ||
                              (ref[1] >= 'a' && ref[1] <= 'z'));
                state->current_col = past_z ? (char)('Z' + 1) : ref[0];
                break;
            }
        }
    }
    else if ((strcmp(name, "v") == 0 || strcmp(name, "t") == 0) && state->in_row && !state->skip_row) {
        if (state->current_col >= state->range.start_col &&
            state->current_col <= state->range.end_col) {
            state->value_started = 1;
            state->value_len = 0;
            state->temp_value[0] = '\0';
        }
    }
}

/*
 * Expat character-data handler: appends text to state->temp_value while a
 * value is being collected.
 *
 * Expat may deliver one value in several chunks. Text beyond
 * MAX_CELL_CONTENT - 1 bytes is truncated: as much of the chunk as still
 * fits is kept, and once the buffer is full later chunks are ignored, so the
 * stored text is always a prefix of the real value. Truncation is done on a
 * byte boundary and may therefore cut a multi-byte character.
 *
 * s is not NUL-terminated; len is its length in bytes.
 */
void XMLCALL character_data(void *user_data, const XML_Char *s, int len) {
    ParserState *state = (ParserState*)user_data;

    if (!state->value_started || len <= 0) {
        return;
    }

    /* One byte is reserved for the terminating NUL. */
    int room = (MAX_CELL_CONTENT - 1) - state->value_len;
    if (room <= 0) {
        return;
    }

    int take = (len < room) ? len : room;
    memcpy(state->temp_value + state->value_len, s, (size_t)take);
    state->value_len += take;
    state->temp_value[state->value_len] = '\0';
}

/*
 * Write one CSV field. Text containing a comma, double quote, CR or LF is
 * enclosed in double quotes with embedded quotes doubled; anything else is
 * written unchanged.
 */
static void write_csv_field(FILE *out, const char *text) {
    if (strpbrk(text, ",\"\r\n") == NULL) {
        fputs(text, out);
        return;
    }

    fputc('"', out);
    for (const char *p = text; *p != '\0'; p++) {
        if (*p == '"') {
            fputc('"', out);
        }
        fputc(*p, out);
    }
    fputc('"', out);
}

/*
 * Expat end-tag handler.
 *
 *   </v>, </t>  writes the collected value as a CSV field, preceded by one
 *               empty field for every requested column skipped since the
 *               previous value of the same row.
 *   </row>      pads the row with empty fields up to range.end_col and
 *               terminates the line.
 *
 * Every column contributes ",<value>" after the row number, so an empty
 * column is written as a single ','.
 *
 * user_data must point to the ParserState registered with the parser.
 */
void XMLCALL end_element(void *user_data, const XML_Char *name) {
    ParserState *state = (ParserState*)user_data;

    /* First column of the current row that has not been written yet;
     * 0 means nothing has been written for this row. Kept in a static
     * because ParserState has no field for it, so only one parse may be
     * active at a time. It is reset at the end of every exported row. */
    static char next_col = 0;

    if (strcmp(name, "row") == 0 && state->in_row && !state->skip_row) {
        if (next_col == 0) next_col = state->range.start_col;

        /* Pad from the last column actually written, not from the last cell
         * seen, so rows whose trailing cells have no value stay aligned. */
        for (char col = next_col; col <= state->range.end_col; col++) {
            fputc(',', state->csv);
        }
        fputc('\n', state->csv);
        state->in_row = 0;
        next_col = 0;
    }
    else if ((strcmp(name, "v") == 0 || strcmp(name, "t") == 0) && state->value_started) {
        if (state->current_col >= state->range.start_col &&
            state->current_col <= state->range.end_col) {
            if (next_col == 0) next_col = state->range.start_col;

            /* Empty fields for the columns skipped before this cell. */
            for (char col = next_col; col < state->current_col; col++) {
                fputc(',', state->csv);
            }
            fputc(',', state->csv);
            write_csv_field(state->csv, state->temp_value);
            next_col = state->current_col + 1;
        }
        state->value_started = 0;
    }
}

int main(int argc, char *argv[]) {
    if (argc != 3) {
        printf("用法: %s <xml文件> <范围(A1:Z100)>\n", argv[0]);
        return 1;
    }
    
    ParseRange range;
    if (parse_excel_range(argv[2], &range) != 0) {
        printf("错误: 无效范围格式\n");
        return 1;
    }
    
    char csv_filename[256];
    strncpy(csv_filename, argv[1], sizeof(csv_filename) - 1);
    char *ext = strrchr(csv_filename, '.');
    if (ext) strcpy(ext, ".csv");
    else strncat(csv_filename, ".csv", sizeof(csv_filename) - strlen(csv_filename) - 1);
    
    FILE *csv = fopen(csv_filename, "w");
    if (!csv) {
        printf("错误: 无法创建CSV\n");
        return 1;
    }
    
    fprintf(csv, "Row");
    for (char col = range.start_col; col <= range.end_col; col++) {
        fprintf(csv, ",%c", col);
    }
    fprintf(csv, "\n");

    FILE *file = fopen(argv[1], "rb");
    if (!file) {
        printf("错误: 无法打开文件 %s\n", argv[1]);
        fclose(csv);
        return -1;
    }
    
    long start_pos = binary_search_row(file, range.start_row);
    if (start_pos > 0) {
        fseek(file, start_pos, SEEK_SET);
    } else {
        fseek(file, 0, SEEK_SET);
    }
    
    XML_Parser parser = XML_ParserCreate(NULL);
    ParserState state = {0};
    state.range = range;
    state.csv = csv;
    state.parser = parser;
    XML_SetUserData(parser, &state);
    XML_SetElementHandler(parser, start_element, end_element);
    XML_SetCharacterDataHandler(parser, character_data);
    
    fseek(file, start_pos, SEEK_SET);
    char buffer[8192] = "<sheetData>";
    
    int done;
    int i = 0;
    do {
        if (XML_GetErrorCode(parser) == XML_ERROR_FINISHED) break;
        size_t len = fread(buffer + 11 * (i == 0), 1, sizeof(buffer) - 11 * (i == 0), file);
        done = (len < sizeof(buffer) - 11 * (i == 0));
        size_t actual_len = len;
        if (!done) {
            if (XML_Parse(parser, buffer, actual_len + 11 * (i == 0), done) == XML_STATUS_ERROR) {
                break;
            }
            i++;
        }
    } while (!done);
    
    fclose(file);
    fclose(csv);
    XML_ParserFree(parser);
    printf("CSV已保存到 %s\n", csv_filename);
    return 0;
}

编译运行和比较

复制代码
gcc expatfmt.c -o expatfmt -lexpat -O3
root@66d4e20ec1d7:/par# time ./expatfmt lineitem/xl/worksheets/sheet1.xml A500000:Z600000
CSV已保存到 lineitem/xl/worksheets/sheet1.csv

real	0m1.865s
user	0m1.836s
sys	0m0.028s

root@66d4e20ec1d7:/par# time ./aich2 lineitem/xl/worksheets/sheet1.xml A500000:Z600000 out.csv

real	0m2.870s
user	0m1.064s
sys	0m0.076s
相关推荐
一匹电信狗1 天前
【C++11】Lambda表达式+新的类功能
服务器·c++·算法·leetcode·小程序·stl·visual studio
say_fall1 天前
C语言编程实战:每日刷题 - day 1
c语言·开发语言·学习
在等晚安么1 天前
力扣面试150题打卡
算法·leetcode·面试
机器之心1 天前
GPT-5.1发布,OpenAI开始拼情商
人工智能·openai
YangYang9YangYan1 天前
高职单招与统招比较及职业发展指南
大数据·人工智能·数据分析
AI科技星1 天前
宇宙膨胀速度的光速极限:基于张祥前统一场论的第一性原理推导与观测验证
数据结构·人工智能·经验分享·python·算法·计算机视觉
IoT智慧学堂1 天前
C语言流程控制:if判断语句全解析
c语言·开发语言
Mintopia1 天前
🚀 共绩算力:3分钟拥有自己的图像优化服务-CodeFormer:先进的图像算法优化、修复马赛克、提升图片清晰度等
前端·人工智能·ai编程
EXtreme351 天前
C语言指针深度剖析(2):从“数组名陷阱”到“二级指针操控”的进阶指南
c语言·开发语言·算法
南山安1 天前
🚀 从“Hulk”扩展开发实战,聊聊我找到的“Vibe Coding”最佳姿势
人工智能·设计