/* Language: C (code listing copied from the page referenced below) */
/**
* https://qr9.net/string-encoding
*/
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <windows.h>
/**
 * Convert a NUL-terminated UTF-8 string to UTF-16 little-endian (no BOM).
 *
 * Only code points in the Basic Multilingual Plane (U+0000 ~ U+FFFF) are
 * emitted. The following input produces no output and is skipped:
 *   - 4-byte sequences (U+10000 and above) and stray / invalid bytes,
 *   - lead bytes whose continuation bytes are missing or malformed
 *     (only the lead byte is skipped; decoding resumes at the next byte),
 *   - overlong 2- and 3-byte encodings (e.g. C0 80, E0 80 80),
 *   - encoded surrogate code points (U+D800 ~ U+DFFF).
 *
 * Conversion stops as soon as the next 2-byte code unit would not fit.
 * A 2-byte terminator (00 00) is appended only if at least two bytes of
 * room remain, so a completely filled buffer is NOT terminated.
 *
 * @param utf8_str  NUL-terminated UTF-8 input
 * @param utf16_buf output buffer receiving UTF-16 LE bytes
 * @param buf_size  size of utf16_buf in bytes
 * @return number of bytes written to utf16_buf, excluding the terminator;
 *         0 if either pointer is NULL
 */
uint32_t utf8_to_utf16_le(const uint8_t* utf8_str, uint8_t* utf16_buf, uint32_t buf_size) {
    if (utf8_str == NULL || utf16_buf == NULL) {
        return 0;
    }

    uint32_t utf8_index = 0;
    uint32_t utf16_index = 0;

    while (utf8_str[utf8_index] != '\0') {
        uint32_t codepoint = 0;
        uint32_t min_codepoint = 0; /* smallest value this sequence length may encode */
        uint8_t byte = utf8_str[utf8_index];

        /* 1. Decode one UTF-8 sequence. */
        if ((byte & 0x80) == 0) {
            /* 1 byte: 0xxxxxxx (U+0000 ~ U+007F) */
            codepoint = byte;
            utf8_index += 1;
        } else if ((byte & 0xE0) == 0xC0) {
            /* 2 bytes: 110xxxxx 10xxxxxx (U+0080 ~ U+07FF) */
            uint8_t b1 = utf8_str[utf8_index + 1];
            if ((b1 & 0xC0) != 0x80) {
                /* Bad continuation byte (this also catches an early NUL). */
                utf8_index += 1;
                continue;
            }
            codepoint = ((uint32_t)(byte & 0x1F) << 6) | (uint32_t)(b1 & 0x3F);
            min_codepoint = 0x80;
            utf8_index += 2;
        } else if ((byte & 0xF0) == 0xE0) {
            /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx (U+0800 ~ U+FFFF) */
            uint8_t b1 = utf8_str[utf8_index + 1];
            if ((b1 & 0xC0) != 0x80) {
                /* Validate b1 BEFORE touching index + 2: if b1 is the string
                 * terminator, index + 2 lies outside the input. */
                utf8_index += 1;
                continue;
            }
            uint8_t b2 = utf8_str[utf8_index + 2];
            if ((b2 & 0xC0) != 0x80) {
                utf8_index += 1;
                continue;
            }
            codepoint = ((uint32_t)(byte & 0x0F) << 12)
                      | ((uint32_t)(b1 & 0x3F) << 6)
                      | (uint32_t)(b2 & 0x3F);
            min_codepoint = 0x800;
            utf8_index += 3;
        } else {
            /* 4-byte lead, stray continuation byte, or invalid byte: skip it. */
            utf8_index += 1;
            continue;
        }

        /* 2. Drop overlong encodings: a value that fits in a shorter sequence
         *    is ill-formed and could otherwise smuggle in U+0000. */
        if (codepoint < min_codepoint) {
            continue;
        }

        /* 3. Drop surrogate code points; they are not valid scalar values. */
        if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
            continue;
        }

        /* 4. Emit one UTF-16 LE code unit (codepoint is <= 0xFFFF here).
         *    utf16_index never exceeds buf_size, so the subtraction cannot wrap. */
        if (buf_size - utf16_index < 2) {
            break;
        }
        utf16_buf[utf16_index++] = (uint8_t)(codepoint & 0xFF);        /* low byte  */
        utf16_buf[utf16_index++] = (uint8_t)((codepoint >> 8) & 0xFF); /* high byte */
    }

    /* 5. Append the 2-byte terminator when there is room for it. */
    if (buf_size - utf16_index >= 2) {
        utf16_buf[utf16_index] = 0x00;
        utf16_buf[utf16_index + 1] = 0x00;
    }

    return utf16_index; /* bytes written, terminator excluded */
}
/**
 * Encode a single Unicode code point as one UTF-16 LE code unit followed by
 * a 2-byte terminator.
 *
 * @param codepoint code point to encode; must be in U+0000 ~ U+FFFF and
 *                  outside the surrogate range U+D800 ~ U+DFFF
 * @param utf16_buf output buffer; needs at least 4 bytes
 *                  (2 bytes of data + 2 bytes of terminator)
 * @param buf_size  size of utf16_buf in bytes
 * @return 2 (bytes written, terminator excluded) on success; 0 when the code
 *         point is rejected or the buffer is too small, in which case the
 *         buffer is left untouched
 */
uint32_t codepoint_to_utf16_le(uint32_t codepoint, uint8_t* utf16_buf, uint32_t buf_size) {
    const int in_surrogate_range = (codepoint >= 0xD800 && codepoint <= 0xDFFF);
    const int above_bmp = (codepoint > 0xFFFF);
    const int buffer_too_small = (buf_size < 4);

    if (in_surrogate_range || above_bmp || buffer_too_small) {
        return 0;
    }

    /* Little-endian code unit, then the two zero bytes of the terminator. */
    const uint8_t encoded[4] = {
        (uint8_t)(codepoint & 0xFF),
        (uint8_t)((codepoint >> 8) & 0xFF),
        0x00,
        0x00
    };
    for (uint32_t pos = 0; pos < 4; pos++) {
        utf16_buf[pos] = encoded[pos];
    }

    return 2;
}
/**
 * Helper: print a labelled hex dump of a byte buffer on one line.
 *
 * Output format: "<label> (Len: <len>): XX XX ... \n", each byte as two
 * uppercase hex digits followed by a space.
 *
 * @param label text printed before the dump
 * @param data  bytes to dump; len bytes are read
 * @param len   number of bytes in data
 */
void print_hex(const char* label, const uint8_t* data, uint32_t len) {
    /* len is unsigned: print it with an unsigned conversion. unsigned long is
     * guaranteed to hold every uint32_t value, so the cast is lossless. */
    printf("%s (Len: %lu): ", label, (unsigned long)len);
    for (uint32_t i = 0; i < len; i++) {
        /* %X expects unsigned int; a bare uint8_t would be promoted to int. */
        printf("%02X ", (unsigned int)data[i]);
    }
    printf("\n");
}
// Demo driver: converts a sample string and a few single code points,
// printing every result as a hex dump.
int main() {
    // Set the console output code page to UTF-8.
    SetConsoleOutputCP(CP_UTF8);

    uint8_t out[64];
    uint32_t written;

    // --- Test 1: whole-string conversion ---
    const char* sample = "你好";
    memset(out, 0, sizeof(out));
    print_hex("UTF-8 ", (const uint8_t*)sample, strlen(sample));
    written = utf8_to_utf16_le((const uint8_t*)sample, out, sizeof(out));
    print_hex("UTF-16 LE ", out, written);
    printf("\n");

    // --- Test 2: single code point conversion ---
    // Each entry pairs a code point with the label shown in the output.
    static const struct {
        uint32_t value;
        const char* label;
    } cases[] = {
        { 0x4F60, "U+4F60 ('你')" },
        { 0x0041, "U+0041 ('A')" },
        { 0x00E9, "U+00E9 ('é')" },
    };
    const int case_count = (int)(sizeof(cases) / sizeof(cases[0]));

    for (int k = 0; k < case_count; k++) {
        memset(out, 0, sizeof(out));
        written = codepoint_to_utf16_le(cases[k].value, out, sizeof(out));
        printf("Codepoint %-16s -> ", cases[k].label);
        print_hex("UTF-16 LE", out, written);
    }

    return 0;
}