利用美团龙猫添加xlsx的sheet.xml读取sharedStrings.xml中共享字符串输出到csv功能

提示词

请添加对sharedStrings.xml的支持。

结构如下

xml 复制代码

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<sst xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" count="1421" uniqueCount="686"><si><t>Unique Key</t></si><si><t>Created Date</t></si><si><t>Closed Date</t></si><si><t>Agency</t></si></sst>

uniqueCount="686"代表它保存的条目数，每个字符串索引从0开始递增

然后sheet.xml

xml 复制代码

<sheetData><row r="1" spans="1:41"><c r="A1" t="s"><v>0</v></c><c r="B1" t="s"><v>1</v></c><c r="C1" t="s"><v>2</v></c><c r="D1" t="s"><v>3</v></c><c r="E1" t="s"><v>4</v></c><c r="F1" t="s"><v>5</v></c>

会引用sharedStrings.xml的字符串，当t="s"时，<v>和</v>中间的值就是<si><t>和</t></si>中间的字符串的索引，比如0代表Unique Key，在csv中就输出Unique Key。同一个索引可能多次出现。

思路是把sharedStrings.xml中每个条目的起始字节位置和长度分别存入数组a、b，恢复时按索引用memcpy从a[index]处复制b[index]个字节。

请用2个函数实现读取共享字符串条目和从索引恢复条目，并给出原始代码中要增加的部分，不做别的。

它按要求给出了代码，但是我把改动合并到原代码后，出现了一个编译错误：

c 复制代码

error: assignment of read-only location '*(val + (sizetype)(shared_strings + (sizetype)((long unsigned int)idx * 16))->len)'
  376 |                     val[shared_strings[idx].len] = 0;
      |                                                  ^

原因是val的定义是const char* val = NULL;，而后面的代码却要修改它所指向地址中的内容。引入另一个非const的变量char* val2，先写入val2，再把它赋值给val，问题就解决了。具体见如下代码：

c 复制代码

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>


// Location of one shared string inside the memory-mapped sharedStrings.xml.
// `start` points at the first byte of the text and is NOT NUL-terminated:
// exactly `len` bytes belong to the string.
struct str_pos {
    const char* start;
    int len;
};

// Bounded substring search: returns the first occurrence of `needle` that lies
// completely inside [from, end), or NULL. Needed because the mapped file is not
// NUL-terminated, so strstr() could run past the end of the mapping.
static const char* sst_find(const char* from, const char* end, const char* needle) {
    size_t n = strlen(needle);
    if (n == 0) return from;
    while (from < end && (size_t)(end - from) >= n) {
        // Only positions where a full match still fits are worth probing.
        const char* hit = memchr(from, needle[0], (size_t)(end - from) - n + 1);
        if (!hit) return NULL;
        if (memcmp(hit, needle, n) == 0) return hit;
        from = hit + 1;
    }
    return NULL;
}

// Memory-map sharedStrings.xml and index every <si> item.
//
// fname: path of sharedStrings.xml.
// count: receives the number of indexed strings (set to 0 on failure); may be NULL.
//
// Returns a heap array laid out as
//   [0 .. count-1]  one entry per <si>, in file order (index == shared string id)
//   [count]         {base address of the mapping, size of the mapping}
//   [count+1]       {NULL, 0} terminator
// or NULL when the file cannot be opened/mapped, is empty, is larger than
// INT_MAX bytes (lengths are stored as int), or memory runs out.
// Release the result with free_shared_strings().
//
// For each <si> the text of its first <t> element is recorded (<t> may carry
// attributes such as xml:space="preserve"). An <si> without text is recorded
// with len 0 so that later indices stay aligned.
// NOTE(review): rich-text items with several <r><t> runs yield only the first
// run, and XML entities (&amp; ...) are not decoded.
struct str_pos* read_shared_strings(const char* fname, int* count) {
    if (count) *count = 0;
    if (!fname) return NULL;

    int fd = open(fname, O_RDONLY);
    if (fd < 0) return NULL;
    struct stat sb;
    if (fstat(fd, &sb) < 0 || sb.st_size <= 0 || sb.st_size > INT_MAX) {
        close(fd);
        return NULL;
    }
    size_t map_len = (size_t)sb.st_size;
    char* mapped = (char*)mmap(NULL, map_len, PROT_READ, MAP_PRIVATE, fd, 0);
    close(fd);  // the mapping stays valid after the descriptor is closed
    if (mapped == MAP_FAILED) return NULL;
    const char* end = mapped + map_len;

    // uniqueCount is only a capacity hint; the array grows when the file
    // holds more items than it declares.
    static const char hint_key[] = "uniqueCount=\"";
    size_t cap = 0;
    const char* hint = sst_find(mapped, end, hint_key);
    if (hint) {
        hint += sizeof hint_key - 1;
        while (hint < end && *hint >= '0' && *hint <= '9' && cap <= map_len) {
            cap = cap * 10 + (size_t)(*hint - '0');
            hint++;
        }
    }
    if (cap == 0 || cap > map_len) cap = 1024;  // missing or implausible hint

    // +2: room for the mapping record and the NULL terminator.
    struct str_pos* items = malloc((cap + 2) * sizeof *items);
    if (!items) {
        munmap(mapped, map_len);
        return NULL;
    }

    size_t n = 0;
    const char* p = mapped;
    while ((p = sst_find(p, end, "<si")) != NULL) {
        const char* after_name = p + 3;
        if (after_name >= end) break;
        if (*after_name != '>' && *after_name != ' ' && *after_name != '/') {
            p = after_name;  // some other element whose name merely starts with "si"
            continue;
        }
        const char* open_end = memchr(after_name, '>', (size_t)(end - after_name));
        if (!open_end) break;

        const char* text = open_end + 1;  // default: empty string
        int text_len = 0;
        const char* next = open_end + 1;

        if (open_end[-1] != '/') {  // not a self-closing <si/>
            const char* si_end = sst_find(next, end, "</si>");
            if (!si_end) break;  // truncated file

            // First <t> / <t ...> inside this item only.
            const char* t = next;
            while ((t = sst_find(t, si_end, "<t")) != NULL) {
                const char* c = t + 2;
                if (c < si_end && (*c == '>' || *c == ' ' || *c == '/')) break;
                t = c;
            }
            if (t) {
                const char* t_open_end = memchr(t, '>', (size_t)(si_end - t));
                if (t_open_end && t_open_end[-1] != '/') {
                    const char* t_close = sst_find(t_open_end + 1, si_end, "</t>");
                    if (t_close) {
                        text = t_open_end + 1;
                        text_len = (int)(t_close - text);
                    }
                }
            }
            next = si_end + 5;  // strlen("</si>")
        }

        if (n == cap) {
            size_t new_cap = cap * 2;
            struct str_pos* bigger = realloc(items, (new_cap + 2) * sizeof *items);
            if (!bigger) {
                free(items);
                munmap(mapped, map_len);
                return NULL;
            }
            items = bigger;
            cap = new_cap;
        }
        items[n].start = text;
        items[n].len = text_len;
        n++;
        p = next;
    }

    // Trailing bookkeeping used by free_shared_strings().
    items[n].start = mapped;
    items[n].len = (int)map_len;
    items[n + 1].start = NULL;
    items[n + 1].len = 0;

    if (count) *count = (int)n;
    return items;
}

// Return the text of shared string `index`.
// The pointer refers into the mapped file and is NOT NUL-terminated; use
// arr[index].len for its length. Empty strings are returned as "".
// The caller must keep `index` below the count reported by
// read_shared_strings(); NULL arrays and negative indices yield "".
const char* get_shared_string(struct str_pos* arr, int index) {
    if (!arr || index < 0) return "";
    if (arr[index].len == 0) return "";
    return arr[index].start;
}

// Unmap the file and free the array returned by read_shared_strings().
// Accepts NULL.
void free_shared_strings(struct str_pos* arr) {
    if (!arr) return;
    // Every real entry has a non-NULL start, so the first NULL is the
    // terminator and the entry just before it describes the mapping.
    size_t n = 0;
    while (arr[n].start) n++;
    if (n > 0) munmap((void*)arr[n - 1].start, (size_t)arr[n - 1].len);
    free(arr);
}


// Helper: find the LAST occurrence of `needle` inside the byte range
// [haystack, end). The range does not need to be NUL-terminated.
// Returns a pointer to the start of the match, or NULL when the needle does
// not occur or the range is shorter than the needle. An empty needle matches
// at `end`.
char* strrstr(const char* haystack, const char* end, const char* needle) {
    size_t needle_len = strlen(needle);
    // Guard first: never form a pointer before `haystack`.
    if (end < haystack || (size_t)(end - haystack) < needle_len) return NULL;

    // Walk candidate offsets from the last possible start down to 0.
    for (size_t off = (size_t)(end - haystack) - needle_len + 1; off-- > 0; ) {
        if (memcmp(haystack + off, needle, needle_len) == 0)
            return (char*)(haystack + off);
    }
    return NULL;
}
// Convert the letter part of a cell reference into a zero-based column index
// (bijective base 26): "A" -> 0, "B" -> 1, "Z" -> 25, "AA" -> 26.
// At most `len` characters of `col` are read; conversion stops early at the
// first character that is not an ASCII letter (upper and lower case are both
// accepted). Returns -1 when no letter was consumed.
int col_letter_to_num(const char* col, int len) {
    int value = 0;
    int pos = 0;
    while (pos < len) {
        char ch = col[pos];
        int digit;
        if (ch >= 'A' && ch <= 'Z') {
            digit = ch - 'A' + 1;
        } else if (ch >= 'a' && ch <= 'z') {
            digit = ch - 'a' + 1;
        } else {
            break;
        }
        value = value * 26 + digit;
        pos++;
    }
    return value - 1;
}

// Look up the value of attribute `attr` in the byte range [start, end).
//
// A match is `attr` immediately followed by '=' and a single- or double-quoted
// value, where the name starts the range or follows whitespace, so that "r"
// does not match inside a longer name such as xr="...".
// Returns a freshly malloc'ed, NUL-terminated copy of the value that the
// caller must free(), or NULL when the attribute is absent or memory runs out.
// If the closing quote is missing, the value extends to `end`.
const char* get_attr(const char* start, const char* end, const char* attr) {
    size_t attr_len = strlen(attr);
    if (!start || !end || attr_len == 0) return NULL;

    // Name, '=' and the opening quote must all lie inside the range.
    for (const char* p = start; p < end && (size_t)(end - p) > attr_len + 1; p++) {
        if (p[attr_len] != '=' || memcmp(p, attr, attr_len) != 0) continue;
        if (p != start) {
            char before = p[-1];
            if (before != ' ' && before != '\t' && before != '\n' && before != '\r')
                continue;  // suffix of a longer attribute name
        }
        char quote = p[attr_len + 1];
        if (quote != '"' && quote != '\'') continue;

        const char* val = p + attr_len + 2;
        const char* stop = memchr(val, quote, (size_t)(end - val));
        if (!stop) stop = end;

        size_t val_len = (size_t)(stop - val);
        char* ret = malloc(val_len + 1);
        if (!ret) return NULL;
        memcpy(ret, val, val_len);
        ret[val_len] = '\0';
        return ret;
    }
    return NULL;
}

// Find the first opening tag named `tag` in [start, end).
// A hit is '<' followed by the name and then either a space or '>'.
// Returns a pointer to the '<', or NULL when there is no such tag.
const char* get_tagname(const char* start, const char* end, const char* tag) {
    int name_len = strlen(tag);
    // Last position from which '<', the name and one trailing byte still fit.
    const char* limit = end - name_len - 1;

    for (const char* cur = start; cur < limit; cur++) {
        if (*cur != '<') continue;
        if (strncmp(cur + 1, tag, name_len) != 0) continue;
        char after = cur[1 + name_len];
        if (after == ' ' || after == '>') return cur;
    }
    return NULL;
}

// Return the text content of the first <tag ...>text</tag> element in
// [start, end), where the first closing tag after the content is </tag>.
//
// All searching stays inside [start, end); the range does not need to be
// NUL-terminated. The element name must be followed by '>', '/' or
// whitespace, so "v" does not match "<vx>". Self-closing elements and
// elements whose first closing tag belongs to a different element are
// skipped and scanning continues.
// Returns a malloc'ed NUL-terminated copy the caller must free(), or NULL
// when no such element exists or memory runs out.
const char* get_tagcontent(const char* start, const char* end, const char* tag) {
    size_t tag_len = strlen(tag);
    if (!start || !end || tag_len == 0) return NULL;

    for (const char* p = start; p < end && (size_t)(end - p) > tag_len + 1; p++) {
        if (*p != '<' || memcmp(p + 1, tag, tag_len) != 0) continue;
        char after = p[1 + tag_len];
        if (after != '>' && after != '/' && after != ' ' &&
            after != '\t' && after != '\n' && after != '\r')
            continue;  // longer element name that merely starts with `tag`

        const char* open_end = memchr(p, '>', (size_t)(end - p));
        if (!open_end) return NULL;
        if (open_end[-1] == '/') continue;  // <tag/> has no content

        // First "</" at or after the content, still inside the range.
        const char* content = open_end + 1;
        const char* closing = content;
        while (closing + 1 < end && !(closing[0] == '<' && closing[1] == '/'))
            closing++;
        if (closing + 1 >= end) return NULL;

        const char* close_name = closing + 2;
        if ((size_t)(end - close_name) > tag_len &&
            memcmp(close_name, tag, tag_len) == 0 && close_name[tag_len] == '>') {
            size_t content_len = (size_t)(closing - content);
            char* ret = malloc(content_len + 1);
            if (!ret) return NULL;
            memcpy(ret, content, content_len);
            ret[content_len] = '\0';
            return ret;
        }
    }
    return NULL;
}

// Find the next tag: returns a pointer to the first '<' in [start, end),
// or NULL when the range is empty or contains no '<'.
const char* next_tag(const char* start, const char* end) {
    if (start >= end) return NULL;
    return memchr(start, '<', (size_t)(end - start));
}

// Find where a tag name ends: returns a pointer to the first '>' or space in
// the NUL-terminated string `tag`, or to its terminating NUL if neither occurs.
const char* tag_end(const char* tag) {
    return tag + strcspn(tag, "> ");
}

// Locate the value of attribute `attr` inside the NUL-terminated tag text
// `tag` without copying it.
//
// Scanning stops at the first '>' or at the terminating NUL. The attribute
// name must start the text or follow whitespace (so "r" does not match inside
// xr="..."), and must be followed by '=' and a single- or double-quoted value.
// On success returns a pointer to the first byte of the value and stores in
// *val_end a pointer to the closing quote (or to the NUL if the quote is
// missing). On failure returns NULL and sets *val_end to NULL.
const char* tag_attrval(const char* tag, const char* attr, const char** val_end) {
    size_t attr_len = strlen(attr);
    *val_end = NULL;
    if (attr_len == 0) return NULL;

    // Advance one byte at a time so a non-matching candidate can never make
    // the cursor jump over the closing '>'.
    for (const char* p = tag; *p && *p != '>'; p++) {
        if (strncmp(p, attr, attr_len) != 0 || p[attr_len] != '=') continue;
        if (p != tag) {
            char before = p[-1];
            if (before != ' ' && before != '\t' && before != '\n' && before != '\r')
                continue;  // suffix of a longer attribute name
        }
        char quote = p[attr_len + 1];
        if (quote != '"' && quote != '\'') continue;

        const char* val = p + attr_len + 2;
        const char* stop = val;
        while (*stop && *stop != quote) stop++;
        *val_end = stop;
        return val;
    }
    return NULL;
}

// 主函数
int main(int argc, char* argv[]) {

   // 命令行参数改为：argv[1]为sheet.xml, argv[2]为sharedStrings.xml（可选）

    if (argc != 2 && argc != 3) {
        fprintf(stderr, "Usage: %s <input.xml> [sharedStrings.xml]\n", argv[0]);
        exit(1);
    }
    // 在main函数开头添加： 
    int sst_count = 0; 
    struct str_pos* shared_strings = NULL; 
    if (argc > 2) { shared_strings = read_shared_strings(argv[2], &sst_count); } 

    const char* fname = argv[1];
    int fd = open(fname, O_RDONLY);
    if (fd < 0) { perror("open"); exit(1); }
    struct stat sb;
    if (fstat(fd, &sb) < 0) { perror("fstat"); exit(1); }
    size_t flen = sb.st_size;
    char* mapped = (char*)mmap(NULL, flen, PROT_READ, MAP_PRIVATE, fd, 0);
    if (mapped == MAP_FAILED) { perror("mmap"); exit(1); }

    // 查找第一个<row
    const char* first_row = strstr(mapped, "<row");
    if (!first_row) {
        munmap(mapped, flen);
        close(fd);
        return 0;
    }
    // 查找最后一个</row>

    char* last_row_end = (char*)strrstr(mapped, mapped + flen, "</row>");
    if (!last_row_end) {
        munmap(mapped, flen);
        close(fd);
        return 0;
    }
    last_row_end = strstr(last_row_end, ">");
    if (last_row_end) last_row_end += 1;
    else last_row_end = mapped + flen;
    const char* file_tail_start = last_row_end;

    // 当前行最大列
    int max_col = 0;
    // 当前解析位置
    const char* p = first_row;
    // 64K块
    const size_t BUF_SIZE = 65536;
    //char* chunk = (char*)malloc(BUF_SIZE + 100);

    // 行号缓存
    int last_row = 0;

    // 解析一行
    void output_row(const char* row_start, const char* row_end, int* max_col, int last_row) {
        // 提取r属性
        const char* r_attr_val = get_attr(row_start, row_end, "r");
        int row_num = r_attr_val ? atoi(r_attr_val) : -1;
        if (r_attr_val) free((void*)r_attr_val);
        if (row_num < 1) return;

        // 统计最大列
        int this_max_col = -1;
        const char* c = row_start;
        while (1) {
            const char* c_tag = get_tagname(c, row_end, "c");
            if (!c_tag) break;
            const char* c_end = strstr(c_tag, "</c>");
            if (!c_end || c_end > row_end) break;
            c_end = strstr(c_end, ">");
            if (c_end) c_end += 1;
            else c_end = row_end;
            const char* c_r_attr = get_attr(c_tag, c_end, "r");
            if (c_r_attr) {
                int col_len = 0;
                while (c_r_attr[col_len] && c_r_attr[col_len] >= 'A' && (c_r_attr[col_len] <= 'Z' || c_r_attr[col_len] <= 'z')) col_len++;
                int col_num = col_letter_to_num(c_r_attr, col_len);
                if (col_num > this_max_col) this_max_col = col_num;
                free((void*)c_r_attr);
            }
            c = c_end;
        }
        if (this_max_col < 0) return;
        if (this_max_col > *max_col) *max_col = this_max_col;

        // 补空行
        while (last_row < row_num - 1) {
            last_row++;
            int has_data = 0;
            for (int c = 0; c <= *max_col; c++) {
                // 全空
            }
            if (!has_data) continue;
            printf("%d", last_row);
            for (int c = 0; c <= *max_col; c++) printf(",");
            printf("\n");
        }



        // 解析c节点
        char** cells = (char**)calloc(this_max_col + 1, sizeof(char*));
        c = row_start;
        while (1) {
            const char* c_tag = get_tagname(c, row_end, "c");
            if (!c_tag) break;
            const char* c_end = strstr(c_tag, "</c>");
            if (!c_end || c_end > row_end) break;
            c_end = strstr(c_end, ">");
            if (c_end) c_end += 1;
            else c_end = row_end;
            const char* c_r_attr = get_attr(c_tag, c_end, "r");
            if (c_r_attr) {
                int col_len = 0;
                while (c_r_attr[col_len] && c_r_attr[col_len] >= 'A' && (c_r_attr[col_len] <= 'Z' || c_r_attr[col_len] <= 'z')) col_len++;
                int col_num = col_letter_to_num(c_r_attr, col_len);
                free((void*)c_r_attr);

                // 判断t属性
                const char* t_attr = get_attr(c_tag, c_end, "t");
                int is_str = (t_attr && strcmp(t_attr, "inlineStr") == 0);

                int is_sst = (t_attr && strcmp(t_attr, "s") == 0);
                if (t_attr) free((void*)t_attr);

                const char* val = NULL;
                if (is_str) {
                    const char* t_tag = get_tagcontent(c_tag, c_end, "t");
                    val = t_tag ? t_tag : strdup("");
                } 
                //新增共享字符串处理
                else if (is_sst) {
                const char* v_tag = get_tagcontent(c_tag, c_end, "v");
                int idx = v_tag ? atoi(v_tag) : 0;
                if (shared_strings && idx < sst_count && idx >= 0) {
                    char* val2 = (char*)malloc(shared_strings[idx].len + 1);
                    memcpy(val2, shared_strings[idx].start, shared_strings[idx].len);
                    val2[shared_strings[idx].len] = 0;
                    val=val2;
                } else {
                    val = strdup("");
                }
                if (v_tag) free((void*)v_tag);
            }
            else {
                    const char* v_tag = get_tagcontent(c_tag, c_end, "v");
                    val = v_tag ? v_tag : strdup("");
                }
                cells[col_num] = (char*)val;
            }
            c = c_end;
        }

        // 输出行
        int has_data = 0;
        for (int c = 0; c <= this_max_col; c++) {
            if (cells[c] && strlen(cells[c]) > 0) {
                has_data = 1; break;
            }
        }
        if (has_data) {
            printf("%d", row_num);
            for (int c = 0; c <= this_max_col; c++) {
                if (cells[c] && strlen(cells[c]) > 0)
                    printf(",%s", cells[c]);
                else
                    printf(",");
            }
            printf("\n");
        }
        // 释放
        for (int c = 0; c <= this_max_col; c++) {
            if (cells[c]) free(cells[c]);
        }
        free(cells);
    }

    // 顺序解析
    while (p < file_tail_start) {
        // 找下一个<row
        const char* row_start = p;//get_tagname(p, file_tail_start, "row");
        if (!row_start) break;
        const char* row_end = strstr(row_start, "</row>");

        if (!row_end || row_end > file_tail_start) break;
        row_end = strstr(row_end, ">");
        if (row_end) row_end += 1;
        else row_end = file_tail_start;
        // 解析并输出一行
        output_row(row_start, row_end, &max_col, last_row);
        last_row = last_row > 0 ? last_row : atoi(get_attr(row_start, row_end, "r"));

        // 继续
        p = row_end;
    }

    // 释放
    //free(chunk);
    munmap(mapped, flen);
    close(fd);

// 在main函数结尾添加： 
 if (shared_strings) free_shared_strings(shared_strings); 
    return 0;
}

编译和执行

bash 复制代码

gcc catxmls3.c -o catxmls3 -O3
./catxmls3 wp//xl/worksheets/sheet1.xml
2,42254749,,,,,,,,11235,,,,,,,,,,,,,,,,3088060140,,1002973,152924,,,,,,,,,,,40.5863974,-73.9325913,


./catxmls3 wp/xl/worksheets/sheet1.xml wp/xl/sharedStrings.xml
1,Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,Street Name,Cross Street 1,Cross Street 2,Intersection Street 1,Intersection Street 2,Address Type,City,Landmark,Facility Type,Status,Due Date,Resolution Description,Resolution Action Updated Date,Community Board,BBL,Borough,X Coordinate (State Plane),Y Coordinate (State Plane),Open Data Channel Type,Park Facility Name,Park Borough,Vehicle Type,Taxi Company Borough,Taxi Pick Up Location,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Latitude,Longitude,Location
2,42254749,04/18/2019 09:55:45 PM,04/19/2019 03:45:24 AM,NYPD,New York City Police Department,Noise - Residential,Banging/Pounding,Residential Building/House,11235,3855 SHORE PARKWAY,SHORE PARKWAY,BRAGG STREET,BELT PARKWAY WB KNAPP STREET EN,,,ADDRESS,BROOKLYN,,Precinct,Closed,04/19/2019 05:55:45 AM,The Police Department responded to the complaint and with the information available observed no evidence of the violation at that time.,04/19/2019 03:45:24 AM,15 BROOKLYN,3088060140,BROOKLYN,1002973,152924,PHONE,Unspecified,BROOKLYN,,,,,,,,40.5863974,-73.9325913,(40.5863974, -73.9325913)

因为sheet1.xml第一行（表头）的单元格全是共享字符串，没有提供sharedStrings.xml时这些单元格不做处理、全部为空，所以第一次运行时第1行没有输出，第2行也只输出了数值列。

提供以后，确实正确输出了共享字符串。