我改写的二分法XML转CSV程序,速度追上了张泽鹏先生的程序

以下代码由美团龙猫生成初稿,我做了改正,再由DeepSeek重新格式化。

重要改正点:

1.二分查找用goto控制迭代,返回<row的正确位置

2.在缓冲区开头填上父标签<sheetData>,使expat从文件中间的<row位置开始也能连续解析而不报错

c 复制代码
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <expat.h>

/* Size in bytes of ParserState.temp_value, the buffer that collects the
 * text of a single cell value. */
#define MAX_CELL_CONTENT 256

/*
 * Rectangular cell range requested by the user (e.g. "A1:Z100").
 * Both bounds are inclusive. Columns are held as a single character, so
 * only one-letter columns can be represented.
 */
typedef struct {
    int start_row;   /* first row number of the range */
    int end_row;     /* last row number of the range */
    char start_col;  /* first column letter of the range */
    char end_col;    /* last column letter of the range */
} ParseRange;

/*
 * State shared between main() and the expat callbacks; a pointer to it is
 * installed as the parser's user data.
 */
typedef struct {
    ParseRange range;                    /* cell range to extract */
    FILE *csv;                           /* output stream the callbacks write CSV text to */
    FILE *xml_file;                      /* not referenced by the code in this file */
    XML_Parser parser;                   /* parser handle, used by the callbacks to stop parsing */
    int in_row;                          /* nonzero while inside a <row> element */
    int current_row;                     /* "r" attribute of the current <row>, -1 if absent */
    char current_col;                    /* column letter of the most recent <c> element */
    int value_started;                   /* nonzero while text of an in-range <v>/<t> is collected */
    char temp_value[MAX_CELL_CONTENT];   /* NUL-terminated text of the value being collected */
    int value_len;                       /* fill level of temp_value as tracked by character_data */
    int skip_row;                        /* nonzero for rows before range.start_row */
    long row_start_pos;                  /* parser byte index recorded at the current <row> */
    int first_row_processed;             /* not referenced by the code in this file */
    char first_row_max_col;              /* not referenced by the code in this file */
} ParserState;

/*
 * Parse a range specification of the form "<col><row>:<col><row>", for
 * example "A1:Z100", into *range.
 *
 * Columns must be a single ASCII letter (lowercase is folded to uppercase so
 * it can match cell references, which use uppercase letters). Rows must be
 * >= 1. The start must not exceed the end in either dimension, and nothing
 * may follow the second row number.
 *
 * Returns 0 on success and -1 on any invalid input. *range is written only
 * on success.
 */
int parse_excel_range(const char *range_str, ParseRange *range) {
    if (range_str == NULL || range == NULL) {
        return -1;
    }

    char first_col = 0;
    char last_col = 0;
    int first_row = 0;
    int last_row = 0;
    int consumed = 0;

    /* %n records how much input was used so trailing junk can be rejected. */
    if (sscanf(range_str, "%c%d:%c%d%n",
               &first_col, &first_row,
               &last_col, &last_row, &consumed) != 4) {
        return -1;
    }
    if (range_str[consumed] != '\0') {
        return -1;
    }

    if (first_col >= 'a' && first_col <= 'z') {
        first_col = (char)(first_col - 'a' + 'A');
    }
    if (last_col >= 'a' && last_col <= 'z') {
        last_col = (char)(last_col - 'a' + 'A');
    }

    /* Anything other than a letter would silently match no cell at all. */
    if (first_col < 'A' || first_col > 'Z') return -1;
    if (last_col < 'A' || last_col > 'Z') return -1;
    if (first_row < 1 || last_row < 1) return -1;

    if (first_col > last_col) return -1;
    if (first_row > last_row) return -1;

    range->start_col = first_col;
    range->start_row = first_row;
    range->end_col = last_col;
    range->end_row = last_row;
    return 0;
}

/*
 * Scan forward from byte offset `from` for the next "<row ... r="N" ...>"
 * start tag.
 *
 * On success stores the offset of the tag's '<' in *tag_pos and N in
 * *row_num and returns 1. Returns 0 when the seek fails or no such tag is
 * found before end of file.
 */
static int scan_next_row_tag(FILE *file, long from, long *tag_pos, int *row_num) {
    if (fseek(file, from, SEEK_SET) != 0) {
        return 0;
    }

    /* Offset of the next byte fgetc() will return; tracked by hand so no
     * ftell() call is needed per tag. */
    long offset = from;
    int c;

    while ((c = fgetc(file)) != EOF) {
        offset++;
        if (c != '<') {
            continue;
        }

        long tag_start = offset - 1;
        char tag[128];
        size_t tag_len = 0;
        tag[tag_len++] = '<';

        /* Consume the whole tag; keep only as much as fits in `tag`. */
        while ((c = fgetc(file)) != EOF && c != '>') {
            offset++;
            if (tag_len < sizeof(tag) - 1) {
                tag[tag_len++] = (char)c;
            }
        }
        if (c == EOF) {
            return 0;   /* tag cut off by end of file */
        }
        offset++;       /* account for the '>' */
        tag[tag_len] = '\0';

        if (strncmp(tag, "<row", 4) != 0) {
            continue;
        }
        /* Require whitespace after the name so e.g. "<rowBreaks" is not
         * mistaken for a row. */
        if (tag[4] != ' ' && tag[4] != '\t' && tag[4] != '\r' && tag[4] != '\n') {
            continue;
        }

        const char *row_attr = strstr(tag, " r=\"");
        if (row_attr == NULL) {
            continue;
        }

        *tag_pos = tag_start;
        *row_num = (int)strtol(row_attr + 4, NULL, 10);
        return 1;
    }

    return 0;
}

/*
 * Binary-search `file` (sheet XML whose <row r="N"> elements appear in
 * increasing N) for a byte offset from which parsing can start.
 *
 * Returns the offset of the '<' of the <row> whose r attribute equals
 * target_row. If that row does not exist, returns the offset of the last
 * <row> with a smaller number that was found, so that a parser started
 * there only has to skip a few rows. Returns 0 when no such row was found
 * (or on I/O failure), meaning "start from the beginning of the file".
 *
 * The file position is left unspecified; callers must seek before reading.
 * The stream should be opened in binary mode so offsets are byte counts.
 */
long binary_search_row(FILE *file, int target_row) {
    if (file == NULL) {
        return 0;
    }
    if (fseek(file, 0, SEEK_END) != 0) {
        return 0;
    }
    long size = ftell(file);
    if (size <= 0) {
        return 0;
    }

    long low = 0;
    long high = size - 1;
    long best_pos = 0;   /* best candidate: a row known to be < target_row */

    while (low <= high) {
        long mid = low + (high - low) / 2;
        long row_pos = 0;
        int row_num = 0;

        if (!scan_next_row_tag(file, mid, &row_pos, &row_num)) {
            /* No row starts at or after mid: the answer lies earlier. */
            high = mid - 1;
            continue;
        }

        if (row_num == target_row) {
            return row_pos;
        }
        if (row_num < target_row) {
            best_pos = row_pos;
            /* Every offset in [mid, row_pos] maps to this same row. */
            low = row_pos + 1;
        } else {
            high = mid - 1;
        }
    }

    return best_pos;
}

/*
 * expat start-tag callback. user_data is the ParserState.
 *
 *  <row>   records the row number from its "r" attribute; stops the parser
 *          once a row past range.end_row is seen, marks rows before
 *          range.start_row as skipped, and otherwise starts a CSV line with
 *          the row number.
 *  <c>     records the column letter of the cell from its "r" attribute.
 *  <v>/<t> begins collecting character data when the current cell's column
 *          lies inside the requested column range.
 */
void XMLCALL start_element(void *user_data, const XML_Char *name, const XML_Char **attrs) {
    ParserState *state = (ParserState*)user_data;
    
    if (strcmp(name, "row") == 0) {
        state->row_start_pos = XML_GetCurrentByteIndex(state->parser);
        state->in_row = 1;
        state->current_row = -1;
        state->skip_row = 0;
        
        for (int i = 0; attrs[i]; i += 2) {
            if (strcmp(attrs[i], "r") == 0) {
                state->current_row = atoi(attrs[i+1]);
                break;
            }
        }
        
        /* Rows are expected in increasing order, so nothing after this
         * row can be in range: abort the parse (non-resumable). */
        if (state->current_row > state->range.end_row) {
            XML_StopParser(state->parser, 0);
            return;
        }        

        if (state->current_row < state->range.start_row) {
            state->skip_row = 1;
            return;
        }
   
        fprintf(state->csv, "%d", state->current_row);
    }
    else if (strcmp(name, "c") == 0 && state->in_row && !state->skip_row) {
        for (int i = 0; attrs[i]; i += 2) {
            if (strcmp(attrs[i], "r") == 0) {
                const XML_Char *ref = attrs[i+1];
                int second_is_letter =
                    ref[0] != '\0' &&
                    ((ref[1] >= 'A' && ref[1] <= 'Z') ||
                     (ref[1] >= 'a' && ref[1] <= 'z'));

                /* The range only covers one-letter columns. A reference
                 * such as "AA5" must not be mistaken for column 'A', so
                 * map it to a value just past 'Z', i.e. outside any
                 * valid range. */
                state->current_col = second_is_letter ? (char)('Z' + 1) : ref[0];
                break;
            }
        }
    }
    else if ((strcmp(name, "v") == 0 || strcmp(name, "t") == 0) && state->in_row && !state->skip_row) {
        if (state->current_col >= state->range.start_col && 
            state->current_col <= state->range.end_col) {
            state->value_started = 1;
            state->value_len = 0;
            state->temp_value[0] = '\0';
        }
    }
}

/*
 * expat character-data callback. user_data is the ParserState.
 *
 * While a value is being collected (value_started), appends the chunk `s`
 * of `len` bytes (not NUL-terminated) to temp_value. expat may deliver one
 * value in several chunks.
 *
 * A value longer than MAX_CELL_CONTENT - 1 bytes is truncated to a prefix:
 * the part of the overflowing chunk that fits is kept (cut back so a UTF-8
 * sequence is not split) and all later chunks of the same value are ignored.
 */
void XMLCALL character_data(void *user_data, const XML_Char *s, int len) {
    ParserState *state = (ParserState*)user_data;
    if (!state->value_started || len <= 0) {
        return;
    }

    int room = (MAX_CELL_CONTENT - 1) - state->value_len;
    if (room <= 0) {
        return;
    }

    int take = len;
    if (take > room) {
        take = room;
        /* s[take] is the first byte left out; while it is a UTF-8
         * continuation byte the cut would fall inside a character. */
        while (take > 0 && ((unsigned char)s[take] & 0xC0) == 0x80) {
            take--;
        }
    }

    memcpy(state->temp_value + state->value_len, s, (size_t)take);
    state->temp_value[state->value_len + take] = '\0';

    if (take < len) {
        /* Truncated: mark the buffer as full so later chunks cannot be
         * appended after the gap. The text itself ends at the NUL above. */
        state->value_len = MAX_CELL_CONTENT - 1;
    } else {
        state->value_len += take;
    }
}

/*
 * expat end-tag callback. user_data is the ParserState.
 *
 *  </v> or </t>  writes the collected value as a CSV field, preceded by one
 *                empty field for every in-range column skipped since the
 *                previous value of this row.
 *  </row>        pads the line with empty fields up to range.end_col and
 *                terminates it.
 *
 * Values are written verbatim; fields containing commas, quotes or newlines
 * are not escaped.
 *
 * NOTE: the per-row column cursor is a function-level static, so only one
 * parse can be in progress at a time.
 */
void XMLCALL end_element(void *user_data, const XML_Char *name) {
    ParserState *state = (ParserState*)user_data;
    /* First column that has not been written yet on the current CSV line;
     * 0 means no value has been written for this row so far. */
    static char next_col = 0;
    
    if (strcmp(name, "row") == 0 && state->in_row && !state->skip_row) {
        char col = next_col ? next_col : state->range.start_col;
        for (; col <= state->range.end_col; col++) {
            fprintf(state->csv, ",");
        }
        fprintf(state->csv, "\n");
        state->in_row = 0;
        /* Start the next row from the first column again; without this
         * a sparse row would inherit the previous row's position. */
        next_col = 0;
    }
    else if ((strcmp(name, "v") == 0 || strcmp(name, "t") == 0) && state->value_started) {
        if (state->current_col >= state->range.start_col && 
            state->current_col <= state->range.end_col) {
            if (next_col == 0) next_col = state->range.start_col;
            
            /* One empty field per column between the last written value
             * and this one. */
            for (char col = next_col; col < state->current_col; col++) {
                fprintf(state->csv, ",");
            }
            fprintf(state->csv, ",%s", state->temp_value);
            next_col = state->current_col + 1;
        }
        state->value_started = 0;
    }
}

int main(int argc, char *argv[]) {
    if (argc != 3) {
        printf("用法: %s <xml文件> <范围(A1:Z100)>\n", argv[0]);
        return 1;
    }
    
    ParseRange range;
    if (parse_excel_range(argv[2], &range) != 0) {
        printf("错误: 无效范围格式\n");
        return 1;
    }
    
    char csv_filename[256];
    strncpy(csv_filename, argv[1], sizeof(csv_filename) - 1);
    char *ext = strrchr(csv_filename, '.');
    if (ext) strcpy(ext, ".csv");
    else strncat(csv_filename, ".csv", sizeof(csv_filename) - strlen(csv_filename) - 1);
    
    FILE *csv = fopen(csv_filename, "w");
    if (!csv) {
        printf("错误: 无法创建CSV\n");
        return 1;
    }
    
    fprintf(csv, "Row");
    for (char col = range.start_col; col <= range.end_col; col++) {
        fprintf(csv, ",%c", col);
    }
    fprintf(csv, "\n");

    FILE *file = fopen(argv[1], "rb");
    if (!file) {
        printf("错误: 无法打开文件 %s\n", argv[1]);
        fclose(csv);
        return -1;
    }
    
    long start_pos = binary_search_row(file, range.start_row);
    if (start_pos > 0) {
        fseek(file, start_pos, SEEK_SET);
    } else {
        fseek(file, 0, SEEK_SET);
    }
    
    XML_Parser parser = XML_ParserCreate(NULL);
    ParserState state = {0};
    state.range = range;
    state.csv = csv;
    state.parser = parser;
    XML_SetUserData(parser, &state);
    XML_SetElementHandler(parser, start_element, end_element);
    XML_SetCharacterDataHandler(parser, character_data);
    
    fseek(file, start_pos, SEEK_SET);
    char buffer[8192] = "<sheetData>";
    
    int done;
    int i = 0;
    do {
        if (XML_GetErrorCode(parser) == XML_ERROR_FINISHED) break;
        size_t len = fread(buffer + 11 * (i == 0), 1, sizeof(buffer) - 11 * (i == 0), file);
        done = (len < sizeof(buffer) - 11 * (i == 0));
        size_t actual_len = len;
        if (!done) {
            if (XML_Parse(parser, buffer, actual_len + 11 * (i == 0), done) == XML_STATUS_ERROR) {
                break;
            }
            i++;
        }
    } while (!done);
    
    fclose(file);
    fclose(csv);
    XML_ParserFree(parser);
    printf("CSV已保存到 %s\n", csv_filename);
    return 0;
}

编译运行和比较

复制代码
gcc expatfmt.c -o expatfmt -lexpat -O3
root@66d4e20ec1d7:/par# time ./expatfmt lineitem/xl/worksheets/sheet1.xml A500000:Z600000
CSV已保存到 lineitem/xl/worksheets/sheet1.csv

real	0m1.865s
user	0m1.836s
sys	0m0.028s

root@66d4e20ec1d7:/par# time ./aich2 lineitem/xl/worksheets/sheet1.xml A500000:Z600000 out.csv

real	0m2.870s
user	0m1.064s
sys	0m0.076s
相关推荐
YGGP3 分钟前
【Golang】LeetCode 279. 完全平方数
算法·leetcode
过期的秋刀鱼!4 分钟前
week3-机器学习-逻辑回归模型介绍和决策边界
人工智能·机器学习·逻辑回归
小麦嵌入式6 分钟前
Linux驱动开发实战(十三):RGB LED驱动并发控制——自旋锁与信号量对比详解
linux·c语言·驱动开发·stm32·单片机·嵌入式硬件·物联网
im_AMBER9 分钟前
Leetcode 87 等价多米诺骨牌对的数量
数据结构·笔记·学习·算法·leetcode
好奇龙猫11 分钟前
【AI学习-comfyUI学习-第二十一-LMSD线段预处理器(建筑概念设计图)-各个部分学习】
人工智能·学习
启途AI12 分钟前
实测国内支持Nano Banana pro的ai工具,解锁PPT可编辑新体验!
人工智能·powerpoint·ppt
WitsMakeMen12 分钟前
大语言模型要用分组注意力机制GQA
人工智能·语言模型·自然语言处理
Godspeed Zhao15 分钟前
自动驾驶中的传感器技术84——Sensor Fusion(7)
人工智能·机器学习·自动驾驶
IT_陈寒17 分钟前
Redis高频踩坑实录:5个不报错但会导致性能腰斩的'隐秘'配置项
前端·人工智能·后端
import_random19 分钟前
[算法]时间序列(介绍)
算法