【POSIX】使用iconv库将UTF-8字符串转换为UTF-16字符串

使用<iconv.h>来进行字符串编码的转换

cpp 复制代码
#include <errno.h>
#include <fcntl.h>
#include <iconv.h>
#include <string.h>
#include <unistd.h>

#include <iostream>
#include <memory>

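// Linking: where iconv is a separate library (e.g. GNU libiconv, macOS) link
// with -liconv; glibc ships iconv inside libc, so no extra library is needed.

// `iconv -l` lists every encoding name the local implementation supports.
// Example: convert a file from UTF-16 to UTF-8 on the command line:
// iconv -f UTF-16 -t UTF-8 myfile

// Note: the byte order produced for plain "UTF-16" is implementation-defined.
// GNU libiconv writes a BOM followed by big-endian data (as in the sample
// output below); glibc writes a BOM followed by host-endian data. Request
// "UTF-16BE" or "UTF-16LE" explicitly for a fixed byte order without a BOM.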

/**
 * Prints every byte of a buffer to stdout, one per line, formatted as
 * "<index>: <value>", where <value> is the byte as an unsigned decimal (0-255).
 *
 * @param str  Pointer to the first byte. The buffer is treated as raw bytes,
 *             not as a NUL-terminated string, so embedded zeros are printed.
 * @param len  Number of bytes to print; nothing is printed when it is 0.
 */
void print_str_bytes(const char* str, size_t len) {
    // The index has the same type as `len`: no signed/unsigned comparison and
    // no overflow for buffers larger than INT_MAX.
    for (size_t i = 0; i < len; ++i) {
        // Convert through unsigned char so bytes >= 0x80 print as 128..255
        // even where plain `char` is signed.
        const unsigned value = static_cast<unsigned char>(str[i]);
        // '\n' instead of std::endl: same text, without a flush per byte.
        std::cout << i << ": " << value << '\n';
    }
}

/**
 * Converts a byte buffer from one character encoding to another with iconv.
 *
 * The output buffer starts at 2 * src_len + 4 bytes and is doubled whenever
 * iconv reports E2BIG; bytes already converted are kept, so the input is only
 * traversed once. After all input is consumed the conversion state is flushed,
 * so stateful target encodings receive their closing shift sequence.
 *
 * On success the number of bytes produced is also logged to std::cout as
 * "use bytes: N".
 *
 * @param dst            Out: on success receives a buffer allocated with
 *                       `new char[]` holding the converted bytes (not
 *                       NUL-terminated). The caller owns it and must release it
 *                       with `delete[]`. Left untouched on failure.
 * @param dst_len        Out: on success receives the number of valid bytes in
 *                       *dst. Left untouched on failure.
 * @param src            Input bytes; they are not modified.
 * @param src_len        Number of input bytes.
 * @param to_encoding    iconv name of the target encoding (e.g. "UTF-16").
 * @param from_encoding  iconv name of the source encoding (e.g. "UTF-8").
 * @return 0 on success, -1 on failure (a diagnostic is written to std::cerr).
 */
int convert_encoding(char** dst, size_t* dst_len, const char* src, size_t src_len, const char* to_encoding, const char* from_encoding) {
    iconv_t cd = iconv_open(to_encoding, from_encoding);
    if (cd == reinterpret_cast<iconv_t>(-1)) {
        const int err = errno;  // capture before stream I/O can overwrite it
        std::cerr << "iconv_open error: " << strerror(err) << std::endl;
        return -1;
    }

    // Closes the descriptor on every early exit (including a throwing `new`).
    // It is disarmed just before the explicit, checked iconv_close below.
    struct DescriptorGuard {
        iconv_t handle;
        bool armed;
        ~DescriptorGuard() {
            if (armed) {
                iconv_close(handle);
            }
        }
    } guard{cd, true};

    // iconv takes a non-const char** for historical reasons; it only reads the input.
    char* in_cursor = const_cast<char*>(src);
    size_t in_left = src_len;

    // Initial guess: two output bytes per input byte, plus room for a BOM
    // (2 bytes for UTF-16, 4 for UTF-32). Wrong guesses are fixed by growing.
    size_t capacity = 2 * src_len + 4;
    std::unique_ptr<char[]> buffer(new char[capacity]);
    size_t used = 0;

    const size_t kIconvFailed = static_cast<size_t>(-1);
    bool flushing = false;  // false: converting input; true: emitting the final shift sequence
    for (;;) {
        // iconv advances the cursors it is given, so it gets scratch copies;
        // `buffer` keeps pointing at the start of the allocation.
        char* out_cursor = buffer.get() + used;
        size_t out_left = capacity - used;

        const size_t rc = flushing
            ? iconv(cd, nullptr, nullptr, &out_cursor, &out_left)
            : iconv(cd, &in_cursor, &in_left, &out_cursor, &out_left);
        const int err = errno;
        used = capacity - out_left;

        if (rc != kIconvFailed) {
            if (flushing) {
                break;  // input converted and state flushed: done
            }
            flushing = true;
            continue;
        }

        if (err != E2BIG) {
            // EILSEQ (invalid input), EINVAL (truncated sequence), or another hard error.
            std::cerr << "iconv error: " << strerror(err) << std::endl;
            return -1;
        }

        // Output buffer full: double it, keep what was already produced, and
        // resume from the position iconv stopped at.
        const size_t grown = capacity * 2;
        std::unique_ptr<char[]> bigger(new char[grown]);
        memcpy(bigger.get(), buffer.get(), used);
        buffer.swap(bigger);
        capacity = grown;
    }

    std::cout << "use bytes: " << used << std::endl;

    guard.armed = false;
    if (iconv_close(cd) == -1) {
        const int err = errno;
        std::cerr << "iconv_close error: " << strerror(err) << std::endl;
        return -1;  // `buffer` is still owned here, so nothing leaks
    }

    // Publish the results only once every step has succeeded.
    *dst_len = used;
    *dst = buffer.release();
    return 0;
}

/**
 * Demo entry point: converts the single command-line argument from UTF-8 to
 * UTF-16, dumps the bytes of both the input and the result to stdout, and
 * writes the converted bytes to "out.txt" in the current directory.
 *
 * @return 0 on success or when the usage text is shown; -1 if the conversion
 *         or any file operation fails.
 */
int main(int argc, char* argv[]) {
    if (argc != 2) {
        std::cout << "./iconv <src-str>" << std::endl;
        return 0;
    }

    const char* str = argv[1];
    const size_t str_len = strlen(str);

    print_str_bytes(str, str_len);

    char* dst = nullptr;
    size_t dst_len = 0;

    int res = convert_encoding(&dst, &dst_len, str, str_len, "UTF-16", "UTF-8");
    if (res == -1) {
        std::cerr << "oops..." << std::endl;
        return -1;
    }
    // convert_encoding hands back a new[] buffer; own it so every path below frees it.
    std::unique_ptr<char[]> dst_owner(dst);

    std::cout << "dst_len: " << dst_len << std::endl;

    print_str_bytes(dst, dst_len);

    // Write the converted bytes to a file.
    int fd = open("out.txt", O_RDWR| O_CREAT | O_TRUNC, S_IRWXU);
    if (fd == -1) {
        const int err = errno;  // capture before stream I/O can overwrite it
        std::cerr << "open out.txt error: " << strerror(err) << std::endl;
        return -1;
    }

    // write() may be interrupted or write fewer bytes than requested, so loop
    // until the whole buffer is on disk.
    size_t written = 0;
    while (written < dst_len) {
        const ssize_t n = write(fd, dst + written, dst_len - written);
        if (n == -1) {
            const int err = errno;
            if (err == EINTR) {
                continue;  // interrupted before any byte was written: retry
            }
            std::cerr << "write out.txt error: " << strerror(err) << std::endl;
            close(fd);
            return -1;
        }
        written += static_cast<size_t>(n);
    }

    if (close(fd) == -1) {
        const int err = errno;
        std::cerr << "close out.txt error: " << strerror(err) << std::endl;
        return -1;
    }

    return 0;
}

编译:

cpp 复制代码
c++ -std=c++14 iconv.cpp -o iconv -liconv

输出:

cpp 复制代码
./iconv 你hao,世界
0: 228
1: 189
2: 160
3: 104
4: 97
5: 111
6: 239
7: 188
8: 140
9: 228
10: 184
11: 150
12: 231
13: 149
14: 140
use bytes: 16
dst_len: 16
0: 254
1: 255
2: 79
3: 96
4: 0
5: 104
6: 0
7: 97
8: 0
9: 111
10: 255
11: 12
12: 78
13: 22
14: 117
15: 76

使用iconv -l命令

cpp 复制代码
ANSI_X3.4-1968 ANSI_X3.4-1986 ASCII CP367 IBM367 ISO-IR-6 ISO646-US ISO_646.IRV:1991 US US-ASCII CSASCII
UTF-8 UTF8
UTF-8-MAC UTF8-MAC
ISO-10646-UCS-2 UCS-2 CSUNICODE
UCS-2BE UNICODE-1-1 UNICODEBIG CSUNICODE11
UCS-2LE UNICODELITTLE
ISO-10646-UCS-4 UCS-4 CSUCS4
UCS-4BE
UCS-4LE
UTF-16
UTF-16BE
UTF-16LE
UTF-32
UTF-32BE
UTF-32LE
UNICODE-1-1-UTF-7 UTF-7 CSUNICODE11UTF7
UCS-2-INTERNAL
UCS-2-SWAPPED
UCS-4-INTERNAL
UCS-4-SWAPPED
C99
JAVA
CP819 IBM819 ISO-8859-1 ISO-IR-100 ISO8859-1 ISO_8859-1 ISO_8859-1:1987 L1 LATIN1 CSISOLATIN1
ISO-8859-2 ISO-IR-101 ISO8859-2 ISO_8859-2 ISO_8859-2:1987 L2 LATIN2 CSISOLATIN2
ISO-8859-3 ISO-IR-109 ISO8859-3 ISO_8859-3 ISO_8859-3:1988 L3 LATIN3 CSISOLATIN3
ISO-8859-4 ISO-IR-110 ISO8859-4 ISO_8859-4 ISO_8859-4:1988 L4 LATIN4 CSISOLATIN4
CYRILLIC ISO-8859-5 ISO-IR-144 ISO8859-5 ISO_8859-5 ISO_8859-5:1988 CSISOLATINCYRILLIC
ARABIC ASMO-708 ECMA-114 ISO-8859-6 ISO-IR-127 ISO8859-6 ISO_8859-6 ISO_8859-6:1987 CSISOLATINARABIC
ECMA-118 ELOT_928 GREEK GREEK8 ISO-8859-7 ISO-IR-126 ISO8859-7 ISO_8859-7 ISO_8859-7:1987 ISO_8859-7:2003 CSISOLATINGREEK
HEBREW ISO-8859-8 ISO-IR-138 ISO8859-8 ISO_8859-8 ISO_8859-8:1988 CSISOLATINHEBREW
ISO-8859-9 ISO-IR-148 ISO8859-9 ISO_8859-9 ISO_8859-9:1989 L5 LATIN5 CSISOLATIN5
ISO-8859-10 ISO-IR-157 ISO8859-10 ISO_8859-10 ISO_8859-10:1992 L6 LATIN6 CSISOLATIN6
ISO-8859-11 ISO8859-11 ISO_8859-11
ISO-8859-13 ISO-IR-179 ISO8859-13 ISO_8859-13 L7 LATIN7
ISO-8859-14 ISO-CELTIC ISO-IR-199 ISO8859-14 ISO_8859-14 ISO_8859-14:1998 L8 LATIN8
ISO-8859-15 ISO-IR-203 ISO8859-15 ISO_8859-15 ISO_8859-15:1998 LATIN-9
ISO-8859-16 ISO-IR-226 ISO8859-16 ISO_8859-16 ISO_8859-16:2001 L10 LATIN10
KOI8-R CSKOI8R
KOI8-U
KOI8-RU
CP1250 MS-EE WINDOWS-1250
CP1251 MS-CYRL WINDOWS-1251
CP1252 MS-ANSI WINDOWS-1252
CP1253 MS-GREEK WINDOWS-1253
CP1254 MS-TURK WINDOWS-1254
CP1255 MS-HEBR WINDOWS-1255
CP1256 MS-ARAB WINDOWS-1256
CP1257 WINBALTRIM WINDOWS-1257
CP1258 WINDOWS-1258
850 CP850 IBM850 CSPC850MULTILINGUAL
862 CP862 IBM862 CSPC862LATINHEBREW
866 CP866 IBM866 CSIBM866
MAC MACINTOSH MACROMAN CSMACINTOSH
MACCENTRALEUROPE
MACICELAND
MACCROATIAN
MACROMANIA
MACCYRILLIC
MACUKRAINE
MACGREEK
MACTURKISH
MACHEBREW
MACARABIC
MACTHAI
HP-ROMAN8 R8 ROMAN8 CSHPROMAN8
NEXTSTEP
ARMSCII-8
GEORGIAN-ACADEMY
GEORGIAN-PS
KOI8-T
CP154 CYRILLIC-ASIAN PT154 PTCP154 CSPTCP154
MULELAO-1
CP1133 IBM-CP1133
ISO-IR-166 TIS-620 TIS620 TIS620-0 TIS620.2529-1 TIS620.2533-0 TIS620.2533-1
CP874 WINDOWS-874
VISCII VISCII1.1-1 CSVISCII
TCVN TCVN-5712 TCVN5712-1 TCVN5712-1:1993
ISO-IR-14 ISO646-JP JIS_C6220-1969-RO JP CSISO14JISC6220RO
JISX0201-1976 JIS_X0201 X0201 CSHALFWIDTHKATAKANA
ISO-IR-87 JIS0208 JIS_C6226-1983 JIS_X0208 JIS_X0208-1983 JIS_X0208-1990 X0208 CSISO87JISX0208
ISO-IR-159 JIS_X0212 JIS_X0212-1990 JIS_X0212.1990-0 X0212 CSISO159JISX02121990
CN GB_1988-80 ISO-IR-57 ISO646-CN CSISO57GB1988
CHINESE GB_2312-80 ISO-IR-58 CSISO58GB231280
CN-GB-ISOIR165 ISO-IR-165
ISO-IR-149 KOREAN KSC_5601 KS_C_5601-1987 KS_C_5601-1989 CSKSC56011987
EUC-JP EUCJP EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE CSEUCPKDFMTJAPANESE
MS_KANJI SHIFT-JIS SHIFT_JIS SJIS CSSHIFTJIS
CP932
ISO-2022-JP CSISO2022JP
ISO-2022-JP-1
ISO-2022-JP-2 CSISO2022JP2
CN-GB EUC-CN EUCCN GB2312 CSGB2312
GBK
CP936 MS936 WINDOWS-936
GB18030
ISO-2022-CN CSISO2022CN
ISO-2022-CN-EXT
HZ HZ-GB-2312
EUC-TW EUCTW CSEUCTW
BIG-5 BIG-FIVE BIG5 BIGFIVE CN-BIG5 CSBIG5
CP950
BIG5-HKSCS:1999
BIG5-HKSCS:2001
BIG5-HKSCS BIG5-HKSCS:2004 BIG5HKSCS
EUC-KR EUCKR CSEUCKR
CP949 UHC
CP1361 JOHAB
ISO-2022-KR CSISO2022KR
CP856
CP922
CP943
CP1046
CP1124
CP1129
CP1161 IBM-1161 IBM1161 CSIBM1161
CP1162 IBM-1162 IBM1162 CSIBM1162
CP1163 IBM-1163 IBM1163 CSIBM1163
DEC-KANJI
DEC-HANYU
437 CP437 IBM437 CSPC8CODEPAGE437
CP737
CP775 IBM775 CSPC775BALTIC
852 CP852 IBM852 CSPCP852
CP853
855 CP855 IBM855 CSIBM855
857 CP857 IBM857 CSIBM857
CP858
860 CP860 IBM860 CSIBM860
861 CP-IS CP861 IBM861 CSIBM861
863 CP863 IBM863 CSIBM863
CP864 IBM864 CSIBM864
865 CP865 IBM865 CSIBM865
869 CP-GR CP869 IBM869 CSIBM869
CP1125
EUC-JISX0213
SHIFT_JISX0213
ISO-2022-JP-3
BIG5-2003
ISO-IR-230 TDS565
ATARI ATARIST
RISCOS-LATIN1
相关推荐
ac-er8888几秒前
PHP弱类型安全问题
开发语言·安全·php
ac-er88881 分钟前
PHP网络爬虫常见的反爬策略
开发语言·爬虫·php
爱吃喵的鲤鱼11 分钟前
linux进程的状态之环境变量
linux·运维·服务器·开发语言·c++
DARLING Zero two♡37 分钟前
关于我、重生到500年前凭借C语言改变世界科技vlog.16——万字详解指针概念及技巧
c语言·开发语言·科技
7年老菜鸡38 分钟前
策略模式(C++)三分钟读懂
c++·qt·策略模式
Gu Gu Study39 分钟前
【用Java学习数据结构系列】泛型上界与通配符上界
java·开发语言
Ni-Guvara1 小时前
函数对象笔记
c++·算法
似霰1 小时前
安卓智能指针sp、wp、RefBase浅析
android·c++·binder
芊寻(嵌入式)1 小时前
C转C++学习笔记--基础知识摘录总结
开发语言·c++·笔记·学习
獨枭1 小时前
C++ 项目中使用 .dll 和 .def 文件的操作指南
c++