utf8转utf16 - 技术栈

参考

在线字符编码转换工具：https://qr9.net/string-encoding

main.cpp

c 复制代码

/**
 * https://qr9.net/string-encoding
 */

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <windows.h>
/**
 * Convert a NUL-terminated UTF-8 string to UTF-16 little-endian (no BOM).
 *
 * Only code points U+0000..U+FFFF are emitted. The following input is
 * silently dropped and conversion continues with the next byte/sequence:
 *   - 4-byte sequences (U+10000 and above) and stray/invalid bytes,
 *   - sequences whose continuation bytes are missing or malformed
 *     (including a sequence cut short by the NUL terminator),
 *   - overlong encodings (e.g. C0 80, E0 80 80),
 *   - encoded surrogates (U+D800..U+DFFF).
 *
 * Conversion stops early when fewer than 2 bytes remain in the output
 * buffer. A 2-byte UTF-16 NUL terminator is appended only if there is still
 * room for it after the data; callers must rely on the return value, not on
 * the terminator, to find the end of the output.
 *
 * @param utf8_str   NUL-terminated UTF-8 input; NULL is treated as empty.
 * @param utf16_buf  Output byte buffer; if NULL nothing is written.
 * @param buf_size   Capacity of utf16_buf in bytes.
 * @return Number of bytes written to utf16_buf, excluding the terminator.
 */
uint32_t utf8_to_utf16_le(const uint8_t* utf8_str, uint8_t* utf16_buf, uint32_t buf_size) {
    uint32_t utf8_index  = 0;
    uint32_t utf16_index = 0;

    if (utf16_buf == NULL) {
        return 0;
    }

    while (utf8_str != NULL && utf8_str[utf8_index] != '\0') {
        uint32_t codepoint = 0;
        uint32_t min_value = 0;  // smallest code point the sequence length may encode
        uint8_t  byte      = utf8_str[utf8_index];

        // 1. Decode one UTF-8 sequence.
        if ((byte & 0x80) == 0) {
            // 1 byte: 0xxxxxxx (U+0000..U+007F)
            codepoint = byte;
            utf8_index += 1;

        } else if ((byte & 0xE0) == 0xC0) {
            // 2 bytes: 110xxxxx 10xxxxxx (U+0080..U+07FF)
            uint8_t b1 = utf8_str[utf8_index + 1];
            if ((b1 & 0xC0) != 0x80) {
                // Bad continuation byte (also covers hitting the terminator):
                // drop the lead byte only and resynchronise.
                utf8_index += 1;
                continue;
            }
            codepoint   = ((uint32_t)(byte & 0x1F) << 6) | (uint32_t)(b1 & 0x3F);
            min_value   = 0x80;
            utf8_index += 2;

        } else if ((byte & 0xF0) == 0xE0) {
            // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx (U+0800..U+FFFF)
            uint8_t b1 = utf8_str[utf8_index + 1];
            if ((b1 & 0xC0) != 0x80) {
                utf8_index += 1;
                continue;
            }
            // b1 is a continuation byte, hence non-zero, so index + 2 is
            // still inside the string (at worst it is the terminator).
            uint8_t b2 = utf8_str[utf8_index + 2];
            if ((b2 & 0xC0) != 0x80) {
                utf8_index += 1;
                continue;
            }
            codepoint   = ((uint32_t)(byte & 0x0F) << 12)
                        | ((uint32_t)(b1 & 0x3F) << 6)
                        | (uint32_t)(b2 & 0x3F);
            min_value   = 0x800;
            utf8_index += 3;

        } else {
            // 4-byte lead, stray continuation byte, or invalid byte: skip it.
            utf8_index += 1;
            continue;
        }

        // 2. Drop overlong encodings: the value must need this many bytes.
        if (codepoint < min_value) {
            continue;
        }

        // 3. Drop surrogate code points, which are not valid scalar values.
        if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
            continue;
        }

        // 4. Emit one UTF-16 LE code unit (codepoint <= 0xFFFF here).
        //    Written as a subtraction so the check cannot wrap around.
        if (buf_size - utf16_index < 2) {
            break;
        }
        utf16_buf[utf16_index++] = (uint8_t)(codepoint & 0xFF);         // low byte
        utf16_buf[utf16_index++] = (uint8_t)((codepoint >> 8) & 0xFF);  // high byte
    }

    // 5. Append the 2-byte terminator when there is room for it.
    if (buf_size - utf16_index >= 2) {
        utf16_buf[utf16_index]     = 0x00;
        utf16_buf[utf16_index + 1] = 0x00;
    }

    return utf16_index;
}


/**
 * Encode a single Unicode code point as one UTF-16 LE code unit followed by
 * a 2-byte NUL terminator.
 *
 * @param codepoint  Code point in U+0000..U+FFFF, excluding the surrogate
 *                   range U+D800..U+DFFF.
 * @param utf16_buf  Output buffer of at least 4 bytes (2 data + 2 terminator).
 * @param buf_size   Capacity of utf16_buf in bytes.
 * @return Number of data bytes written (always 2 on success, terminator not
 *         counted); 0 if the code point is rejected, the buffer is NULL, or
 *         the buffer is smaller than 4 bytes. Nothing is written on failure.
 */
uint32_t codepoint_to_utf16_le(uint32_t codepoint, uint8_t* utf16_buf, uint32_t buf_size) {
    // Surrogates are not encodable; anything above the BMP would need a
    // surrogate pair, which this function does not produce.
    if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
        return 0;
    }
    if (codepoint > 0xFFFF) {
        return 0;
    }

    // A missing buffer is handled like an undersized one.
    if (utf16_buf == NULL || buf_size < 4) {
        return 0;
    }

    utf16_buf[0] = (uint8_t)(codepoint & 0xFF);         // low byte
    utf16_buf[1] = (uint8_t)((codepoint >> 8) & 0xFF);  // high byte
    utf16_buf[2] = 0x00;
    utf16_buf[3] = 0x00;

    return 2;
}

/**
 * Print a labelled hex dump of a byte range to stdout on a single line, in
 * the form "<label> (Len: <len>): XX XX ... \n".
 *
 * @param label  Text printed before the length.
 * @param data   Bytes to dump; read for indices 0..len-1.
 * @param len    Number of bytes to print.
 */
void print_hex(const char* label, const uint8_t* data, uint32_t len) {
    // uint32_t has no printf length modifier of its own; widen to unsigned
    // long so the argument type matches the conversion specifier exactly.
    printf("%s (Len: %lu): ", label, (unsigned long)len);
    for (uint32_t i = 0; i < len; i++) {
        // %X expects unsigned int; the promoted byte would otherwise be int.
        printf("%02X ", (unsigned int)data[i]);
    }
    printf("\n");
}

int main() {
    // Set the console output code page to UTF-8.
    SetConsoleOutputCP(CP_UTF8);

    uint8_t out_bytes[64];
    const uint32_t out_cap = sizeof(out_bytes);

    // --- Demo 1: convert a whole UTF-8 string ---
    const char* sample = "你好";
    memset(out_bytes, 0, out_cap);
    print_hex("UTF-8        ", (const uint8_t*)sample, strlen(sample));
    uint32_t written = utf8_to_utf16_le((const uint8_t*)sample, out_bytes, out_cap);
    print_hex("UTF-16 LE    ", out_bytes, written);

    printf("\n");

    // --- Demo 2: convert individual code points ---
    // Each entry pairs a code point with the label printed next to it.
    struct {
        uint32_t    value;
        const char* label;
    } cases[] = {
        { 0x4F60, "U+4F60 ('你')" },
        { 0x0041, "U+0041 ('A')" },
        { 0x00E9, "U+00E9 ('é')" },
    };
    const size_t case_count = sizeof(cases) / sizeof(cases[0]);

    for (size_t k = 0; k < case_count; k++) {
        // Clear the buffer so bytes from the previous case cannot show up.
        memset(out_bytes, 0, out_cap);
        written = codepoint_to_utf16_le(cases[k].value, out_bytes, out_cap);
        printf("Codepoint %-16s -> ", cases[k].label);
        print_hex("UTF-16 LE", out_bytes, written);
    }

    return 0;
}

输出

bash 复制代码

C:\Users\PC\CLionProjects\untitled28\cmake-build-debug\untitled28.exe
UTF-8         (Len: 6): E4 BD A0 E5 A5 BD
UTF-16 LE     (Len: 4): 60 4F 7D 59

Codepoint U+4F60 ('你')   -> UTF-16 LE (Len: 2): 60 4F
Codepoint U+0041 ('A')     -> UTF-16 LE (Len: 2): 41 00
Codepoint U+00E9 ('é')    -> UTF-16 LE (Len: 2): E9 00

Process finished with exit code 0