SAN 文件编码识别-对抗网络搜索—智能编程—仙盟创梦IDE

文件编码重要性

1. 字符表示与存储

计算机以二进制形式存储和处理数据，编码为每个字符分配特定的二进制数字组合，实现字符在计算机中的存储。例如 ASCII 编码用 7 位二进制数表示 128 个字符，UTF-8 则以 1 到 4 个字节的可变长度表示全球几乎所有字符，确保各种文字信息都能在计算机中有效存储。

2. 跨系统与跨平台兼容

不同操作系统、软件和设备可能默认使用不同编码。统一且标准的编码（如 UTF-8）能让文本在不同环境间准确无误地传输和显示。若编码不兼容，例如在 Linux 系统中打开以 Windows 默认编码（如 GBK）保存的文件，就可能因编码差异出现乱码。编码的标准化促进了信息的跨系统与跨平台交互。

3. 数据处理与应用运行

各类软件应用在处理文本数据时依赖正确编码。像数据库管理系统存储和检索数据，Web 应用解析和展示网页内容，都需依据准确编码。编码错误会导致数据处理出错，如数据库查询结果乱码，网站页面文字显示异常，影响应用正常功能和用户体验。

4. 国际交流与全球化

在全球化背景下，不同语言文字信息交流频繁。编码让不同语言文本能在全球范围内准确交换和理解。UTF-8 编码支持多语言混合文本，使国际商务、学术交流、网络信息传播等不受语言编码限制，促进了全球信息的共享与交流。

Qt 跨平台代码（适用于鸿蒙、Windows、macOS、Linux 及国产操作系统）

cpp 复制代码

#include "encodingdetector.h"

/**
 * @brief Detects the text encoding of a file, preferring a byte-order mark (BOM).
 *
 * The first four bytes are compared against the UTF-32BE, UTF-32LE, UTF-8,
 * UTF-16LE and UTF-16BE signatures. When none matches, the decision is
 * delegated to detectEncodingWithoutBOM().
 *
 * @param filePath     Path of the file to inspect.
 * @param defaultCodec Codec returned when the file is missing, cannot be opened,
 *                     or no encoding can be determined; UTF-8 when nullptr.
 * @return The detected codec, or @p defaultCodec as a fallback.
 */
QTextCodec* EncodingDetector::detectFileEncoding(const QString& filePath, QTextCodec* defaultCodec)
{
    if (defaultCodec == nullptr) {
        defaultCodec = QTextCodec::codecForName("UTF-8");
    }

    QFile file(filePath);
    if (!file.exists()) {
        qWarning() << "错误: 文件不存在 -" << filePath;
        return defaultCodec;
    }

    if (!file.open(QIODevice::ReadOnly)) {
        qWarning() << "无法打开文件 -" << filePath;
        return defaultCodec;
    }

    const QByteArray bom = file.read(4);

    struct BomSignature {
        const char* bytes;
        int length;
        const char* codecName;
    };
    // Four-byte signatures must be tested first: the UTF-32LE BOM (FF FE 00 00)
    // begins with the UTF-16LE BOM (FF FE) and would otherwise be misreported.
    static const BomSignature kSignatures[] = {
        {"\x00\x00\xFE\xFF", 4, "UTF-32BE"},
        {"\xFF\xFE\x00\x00", 4, "UTF-32LE"},
        {"\xEF\xBB\xBF", 3, "UTF-8"},
        {"\xFF\xFE", 2, "UTF-16LE"},
        {"\xFE\xFF", 2, "UTF-16BE"},
    };

    for (const BomSignature& signature : kSignatures) {
        if (bom.startsWith(QByteArray::fromRawData(signature.bytes, signature.length))) {
            // codecForName() yields nullptr for codecs this Qt build lacks;
            // hand back the default rather than a null pointer.
            QTextCodec* codec = QTextCodec::codecForName(signature.codecName);
            return codec != nullptr ? codec : defaultCodec;
        }
    }

    // No BOM: analyse the content on the already-open handle (the helper
    // rewinds it). QFile closes itself when it goes out of scope.
    return detectEncodingWithoutBOM(file, defaultCodec);
}

QTextCodec* EncodingDetector::detectEncodingWithoutBOM(QFile& file, QTextCodec* defaultCodec)
{
    file.seek(0);

    // 读取部分内容进行分析
    qint64 size = qMin(file.size(), (qint64)8192);
    QByteArray buffer = file.read(size);

    // 简单判断：如果前1024字节中包含0x00且位置不是偶数，则不太可能是UTF-16
    bool hasNullByteInOddPosition = false;
    for (int i = 0; i < qMin(buffer.size(), 1024); i++) {
        if (buffer[i] == 0x00 && i % 2 != 0) {
            hasNullByteInOddPosition = true;
            break;
        }
    }

    // 如果在奇数位置发现0x00，则不太可能是UTF-16
    if (hasNullByteInOddPosition) {
        // 检查是否可能是GB2312/GBK/GB18030 (中文编码)
        // 简单判断：如果存在大量0x81-0xFE范围内的字节后跟0x40-0xFE的字节，则可能是GBK
        int gbkCandidateCount = 0;
        int totalMultiByteChars = 0;

        for (int i = 0; i < buffer.size() - 1; i++) {
            uchar firstByte = (uchar)buffer[i];
            uchar secondByte = (uchar)buffer[i + 1];

            if (firstByte >= 0x81 && firstByte <= 0xFE) {
                totalMultiByteChars++;
                if (secondByte >= 0x40 && secondByte <= 0xFE) {
                    gbkCandidateCount++;
                }
            }
        }

        // 如果超过50%的多字节候选是GBK模式，则判定为GBK
        if (totalMultiByteChars > 0 && (float)gbkCandidateCount / totalMultiByteChars > 0.5) {
            QTextCodec* gbkCodec = QTextCodec::codecForName("GBK");
            if (gbkCodec) {
                return gbkCodec;
            }
        }

        return defaultCodec;
    }

    // 否则可能是UTF-16
    // 进一步判断是大端还是小端
    int littleEndianPairs = 0;
    int bigEndianPairs = 0;

    for (int i = 0; i < buffer.size() - 1; i += 2) {
        // 检查是否看起来像UTF-16LE (低字节在前)
        if (buffer[i] != 0 && buffer[i + 1] == 0) {
            littleEndianPairs++;
        }
        // 检查是否看起来像UTF-16BE (高字节在前)
        else if (buffer[i] == 0 && buffer[i + 1] != 0) {
            bigEndianPairs++;
        }
    }

    if (littleEndianPairs > bigEndianPairs * 2) {
        return QTextCodec::codecForName("UTF-16LE");
    } else if (bigEndianPairs > littleEndianPairs * 2) {
        return QTextCodec::codecForName("UTF-16BE");
    }

    // 无法确定，使用默认编码
    return defaultCodec;
}

C# 代码

cs 复制代码

        /// <summary>
        /// Detects the text encoding of a file. A byte-order mark (BOM) is checked
        /// first; when none is present the decision is delegated to
        /// <c>仙盟创梦_IDE_DetectEncodingWithoutBOM</c>.
        /// </summary>
        /// <param name="filePath">Path of the file to inspect.</param>
        /// <param name="defaultEncoding">Encoding returned when the file is missing, an error occurs, or no encoding can be determined; UTF-8 when null.</param>
        /// <returns>The detected encoding, or <paramref name="defaultEncoding"/> as a fallback.</returns>
        public static Encoding 仙盟创梦_IDE_DetectEncoding(string filePath, Encoding defaultEncoding = null)
        {
            defaultEncoding = defaultEncoding ?? Encoding.UTF8;

            if (!File.Exists(filePath))
            {
                Console.WriteLine($"错误: 文件不存在 - {filePath}");
                return defaultEncoding;
            }

            try
            {
                using (var fileStream = File.OpenRead(filePath))
                {
                    // Stream.Read may return fewer bytes than requested, so loop
                    // until four bytes are available or the file ends.
                    byte[] bom = new byte[4];
                    int bytesRead = 0;
                    while (bytesRead < bom.Length)
                    {
                        int chunk = fileStream.Read(bom, bytesRead, bom.Length - bytesRead);
                        if (chunk <= 0)
                        {
                            break;
                        }
                        bytesRead += chunk;
                    }

                    // Four-byte signatures first: the UTF-32LE BOM (FF FE 00 00)
                    // starts with the UTF-16LE BOM (FF FE).
                    if (bytesRead >= 4 && bom[0] == 0xFF && bom[1] == 0xFE && bom[2] == 0x00 && bom[3] == 0x00)
                    {
                        return Encoding.UTF32; // UTF-32LE
                    }
                    if (bytesRead >= 4 && bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == 0xFE && bom[3] == 0xFF)
                    {
                        // Encoding.UTF32 is little-endian, so the big-endian
                        // variant has to be constructed explicitly.
                        return new UTF32Encoding(bigEndian: true, byteOrderMark: true); // UTF-32BE
                    }
                    if (bytesRead >= 3 && bom[0] == 0xEF && bom[1] == 0xBB && bom[2] == 0xBF)
                    {
                        return Encoding.UTF8; // UTF-8 with BOM
                    }
                    if (bytesRead >= 2 && bom[0] == 0xFF && bom[1] == 0xFE)
                    {
                        return Encoding.Unicode; // UTF-16LE
                    }
                    if (bytesRead >= 2 && bom[0] == 0xFE && bom[1] == 0xFF)
                    {
                        return Encoding.BigEndianUnicode; // UTF-16BE
                    }

                    // No BOM: fall back to content heuristics on the same stream.
                    return 仙盟创梦_IDE_DetectEncodingWithoutBOM(fileStream, defaultEncoding);
                }
            }
            catch (Exception ex)
            {
                // Best effort: any I/O failure is reported and yields the default.
                Console.WriteLine($"检测文件编码时出错 - {filePath}: {ex.Message}");
                return defaultEncoding;
            }
        }

        /// <summary>
        /// Guesses the encoding of a file that carries no byte-order mark.
        /// Up to 8192 leading bytes are sampled. Text without NUL bytes is tested
        /// for UTF-8 structure first and GBK byte pairs second (UTF-8 CJK bytes
        /// also fall inside the GBK ranges, so UTF-8 must be tested first). Text
        /// containing NUL bytes is treated as a UTF-16 candidate and its
        /// endianness is inferred from which half of each 16-bit unit is zero.
        /// </summary>
        /// <remarks>
        /// The previous logic routed samples with a NUL at an odd offset to the GBK
        /// branch, so BOM-less UTF-16LE and ordinary GBK files were both missed.
        /// BOM-less UTF-16 text that contains no NUL byte cannot be recognised.
        /// </remarks>
        /// <param name="fileStream">Open, seekable stream; it is rewound to the beginning.</param>
        /// <param name="defaultEncoding">Encoding returned when no heuristic matches.</param>
        /// <returns>The guessed encoding, or <paramref name="defaultEncoding"/>.</returns>
        private static Encoding 仙盟创梦_IDE_DetectEncodingWithoutBOM(FileStream fileStream, Encoding defaultEncoding)
        {
            fileStream.Seek(0, SeekOrigin.Begin);

            // Fill the sample buffer; Stream.Read may return short reads.
            byte[] buffer = new byte[Math.Min(fileStream.Length, 8192)];
            int bytesRead = 0;
            while (bytesRead < buffer.Length)
            {
                int chunk = fileStream.Read(buffer, bytesRead, buffer.Length - bytesRead);
                if (chunk <= 0)
                {
                    break;
                }
                bytesRead += chunk;
            }

            if (bytesRead == 0)
            {
                return defaultEncoding;
            }

            bool containsNul = Array.IndexOf(buffer, (byte)0, 0, bytesRead) >= 0;
            if (!containsNul)
            {
                // 8-bit text: pure ASCII matches neither test and keeps the default.
                if (仙盟创梦_IDE_LooksLikeUtf8(buffer, bytesRead))
                {
                    return Encoding.UTF8;
                }

                if (仙盟创梦_IDE_LooksLikeGbk(buffer, bytesRead))
                {
                    try
                    {
                        return Encoding.GetEncoding("GBK");
                    }
                    catch (Exception ex) when (ex is ArgumentException || ex is NotSupportedException)
                    {
                        // GBK is not available on this runtime; use the default.
                    }
                }

                return defaultEncoding;
            }

            // NUL bytes present: count 16-bit units whose high half is zero.
            int littleEndianPairs = 0;
            int bigEndianPairs = 0;
            for (int i = 0; i + 1 < bytesRead; i += 2)
            {
                bool firstIsNul = buffer[i] == 0;
                bool secondIsNul = buffer[i + 1] == 0;
                if (!firstIsNul && secondIsNul)
                {
                    littleEndianPairs++; // low byte first
                }
                else if (firstIsNul && !secondIsNul)
                {
                    bigEndianPairs++; // high byte first
                }
            }

            if (littleEndianPairs > bigEndianPairs * 2)
            {
                return Encoding.Unicode; // UTF-16LE
            }
            if (bigEndianPairs > littleEndianPairs * 2)
            {
                return Encoding.BigEndianUnicode; // UTF-16BE
            }

            // Undetermined: fall back to the caller's default.
            return defaultEncoding;
        }

        /// <summary>
        /// Returns true when the first <paramref name="length"/> bytes contain at
        /// least one multi-byte sequence and all non-ASCII bytes follow the UTF-8
        /// lead/continuation structure. A sequence cut off by the end of the
        /// sample is tolerated.
        /// </summary>
        private static bool 仙盟创梦_IDE_LooksLikeUtf8(byte[] data, int length)
        {
            bool sawMultiByte = false;
            int i = 0;
            while (i < length)
            {
                byte lead = data[i];
                if (lead < 0x80)
                {
                    i++;
                    continue;
                }

                int continuationCount;
                if (lead >= 0xC2 && lead <= 0xDF)
                {
                    continuationCount = 1;
                }
                else if (lead >= 0xE0 && lead <= 0xEF)
                {
                    continuationCount = 2;
                }
                else if (lead >= 0xF0 && lead <= 0xF4)
                {
                    continuationCount = 3;
                }
                else
                {
                    return false; // stray continuation byte or invalid lead byte
                }

                for (int k = 1; k <= continuationCount; k++)
                {
                    if (i + k >= length)
                    {
                        return sawMultiByte; // sample ended inside a sequence
                    }
                    if ((data[i + k] & 0xC0) != 0x80)
                    {
                        return false;
                    }
                }

                sawMultiByte = true;
                i += continuationCount + 1;
            }
            return sawMultiByte;
        }

        /// <summary>
        /// Returns true when more than half of the GBK lead-byte candidates
        /// (0x81-0xFE) are followed by a valid trail byte (0x40-0xFE except 0x7F).
        /// </summary>
        private static bool 仙盟创梦_IDE_LooksLikeGbk(byte[] data, int length)
        {
            int leadBytes = 0;
            int validPairs = 0;
            for (int i = 0; i + 1 < length; i++)
            {
                byte lead = data[i];
                if (lead < 0x81 || lead > 0xFE)
                {
                    continue;
                }

                leadBytes++;
                byte trail = data[i + 1];
                if (trail >= 0x40 && trail <= 0xFE && trail != 0x7F)
                {
                    validPairs++;
                    i++; // the trail byte belongs to this character
                }
            }
            return leadBytes > 0 && validPairs * 2 > leadBytes;
        }

lua 跨平台代码

Lua 复制代码

-- File-encoding detection module.
-- Detection order: byte-order mark first, then content heuristics on the first
-- 8 KiB of the file (UTF-8 structure, GBK byte pairs, UTF-16 NUL patterns).
local encoding_detector = {}

-- Encoding name constants returned by the detection functions.
encoding_detector.UTF8 = "utf-8"
encoding_detector.UTF16_LE = "utf-16le"
encoding_detector.UTF16_BE = "utf-16be"
encoding_detector.UTF32_LE = "utf-32le"
encoding_detector.UTF32_BE = "utf-32be"
encoding_detector.GBK = "gbk"
encoding_detector.UNKNOWN = "unknown"

-- Maximum number of leading bytes sampled for the BOM-less heuristics.
local SAMPLE_SIZE = 8192

-- BOM signatures, longest first: the UTF-32LE BOM (FF FE 00 00) begins with the
-- UTF-16LE BOM (FF FE) and must therefore be tested before it. `key` names the
-- module constant to return so that the value is looked up at call time.
local BOM_SIGNATURES = {
    { bytes = string.char(0x00, 0x00, 0xFE, 0xFF), key = "UTF32_BE" },
    { bytes = string.char(0xFF, 0xFE, 0x00, 0x00), key = "UTF32_LE" },
    { bytes = string.char(0xEF, 0xBB, 0xBF),       key = "UTF8" },
    { bytes = string.char(0xFF, 0xFE),             key = "UTF16_LE" },
    { bytes = string.char(0xFE, 0xFF),             key = "UTF16_BE" },
}

-- Returns true when `buffer` contains at least one multi-byte sequence and all
-- non-ASCII bytes follow the UTF-8 lead/continuation structure. A sequence cut
-- off by the end of the sample is tolerated.
local function looks_like_utf8(buffer)
    local size = #buffer
    local saw_multi_byte = false
    local i = 1
    while i <= size do
        local lead = string.byte(buffer, i)
        if lead < 0x80 then
            i = i + 1
        else
            local continuation_count
            if lead >= 0xC2 and lead <= 0xDF then
                continuation_count = 1
            elseif lead >= 0xE0 and lead <= 0xEF then
                continuation_count = 2
            elseif lead >= 0xF0 and lead <= 0xF4 then
                continuation_count = 3
            else
                return false -- stray continuation byte or invalid lead byte
            end

            for k = 1, continuation_count do
                local next_byte = string.byte(buffer, i + k)
                if next_byte == nil then
                    return saw_multi_byte -- sample ended inside a sequence
                end
                if next_byte < 0x80 or next_byte > 0xBF then
                    return false
                end
            end
            saw_multi_byte = true
            i = i + continuation_count + 1
        end
    end
    return saw_multi_byte
end

-- Returns true when more than half of the GBK lead-byte candidates (0x81-0xFE)
-- are followed by a valid trail byte (0x40-0xFE except 0x7F).
local function looks_like_gbk(buffer)
    local size = #buffer
    local lead_bytes = 0
    local valid_pairs = 0
    local i = 1
    while i < size do
        local lead = string.byte(buffer, i)
        if lead >= 0x81 and lead <= 0xFE then
            lead_bytes = lead_bytes + 1
            local trail = string.byte(buffer, i + 1)
            if trail >= 0x40 and trail <= 0xFE and trail ~= 0x7F then
                valid_pairs = valid_pairs + 1
                i = i + 1 -- the trail byte belongs to this character
            end
        end
        i = i + 1
    end
    return lead_bytes > 0 and valid_pairs * 2 > lead_bytes
end

-- Detects the encoding of the file at `file_path`.
-- Returns one of the module's encoding constants, or `default_encoding`
-- (UTF-8 when omitted) if the file cannot be opened or nothing matches.
function encoding_detector.detect_file_encoding(file_path, default_encoding)
    default_encoding = default_encoding or encoding_detector.UTF8

    local file = io.open(file_path, "rb")
    if not file then
        print(string.format("错误: 无法打开文件 - %s", file_path))
        return default_encoding
    end

    -- file:read returns nil at end of file, i.e. for an empty file.
    local bom = file:read(4) or ""
    file:close()

    for _, signature in ipairs(BOM_SIGNATURES) do
        if string.sub(bom, 1, #signature.bytes) == signature.bytes then
            return encoding_detector[signature.key]
        end
    end

    -- No BOM: fall back to content heuristics.
    return encoding_detector.detect_encoding_without_bom(file_path, default_encoding)
end

-- Guesses the encoding of a file that carries no byte-order mark.
-- Text without NUL bytes is tested for UTF-8 first and GBK second (UTF-8 CJK
-- bytes also fall inside the GBK ranges). Text containing NUL bytes is treated
-- as a UTF-16 candidate and its endianness is inferred from which half of each
-- 16-bit unit is zero. BOM-less UTF-16 text without any NUL byte cannot be
-- recognised by this heuristic.
function encoding_detector.detect_encoding_without_bom(file_path, default_encoding)
    local file = io.open(file_path, "rb")
    if not file then
        print(string.format("错误: 无法打开文件 - %s", file_path))
        return default_encoding
    end

    local buffer = file:read(SAMPLE_SIZE) or ""
    file:close()

    if #buffer == 0 then
        return default_encoding
    end

    local contains_nul = false
    for i = 1, #buffer do
        if string.byte(buffer, i) == 0x00 then
            contains_nul = true
            break
        end
    end

    if not contains_nul then
        -- 8-bit text: pure ASCII matches neither test and keeps the default.
        if looks_like_utf8(buffer) then
            return encoding_detector.UTF8
        end
        if looks_like_gbk(buffer) then
            return encoding_detector.GBK
        end
        return default_encoding
    end

    -- NUL bytes present: count 16-bit units whose high half is zero.
    -- Lua strings are 1-based, so index i is the first byte of each unit.
    local little_endian_pairs = 0
    local big_endian_pairs = 0
    for i = 1, #buffer - 1, 2 do
        local first = string.byte(buffer, i)
        local second = string.byte(buffer, i + 1)
        if first ~= 0 and second == 0 then
            little_endian_pairs = little_endian_pairs + 1 -- low byte first
        elseif first == 0 and second ~= 0 then
            big_endian_pairs = big_endian_pairs + 1 -- high byte first
        end
    end

    if little_endian_pairs > big_endian_pairs * 2 then
        return encoding_detector.UTF16_LE
    elseif big_endian_pairs > little_endian_pairs * 2 then
        return encoding_detector.UTF16_BE
    end

    -- Undetermined: fall back to the caller's default.
    return default_encoding
end

return encoding_detector

php 代码

php 复制代码

<?php
/**
 * File-encoding detector.
 *
 * Detects UTF-8, UTF-16LE/BE, UTF-32LE/BE (by byte-order mark) and, for files
 * without a BOM, guesses between UTF-8, GBK and UTF-16 from the first 8 KiB.
 */
class EncodingDetector {
    const UTF8 = 'UTF-8';
    const UTF16_LE = 'UTF-16LE';
    const UTF16_BE = 'UTF-16BE';
    const UTF32_LE = 'UTF-32LE';
    const UTF32_BE = 'UTF-32BE';
    const GBK = 'GBK';
    const UNKNOWN = 'UNKNOWN';

    /** Maximum number of leading bytes sampled for the BOM-less heuristics. */
    const SAMPLE_SIZE = 8192;

    /**
     * Detects the encoding of a file, preferring a byte-order mark.
     *
     * @param string $filePath        Path of the file to inspect.
     * @param string $defaultEncoding Encoding returned when the file is missing,
     *                                cannot be opened, or nothing matches.
     * @return string One of the class encoding constants, or $defaultEncoding.
     */
    public static function detectFileEncoding($filePath, $defaultEncoding = self::UTF8) {
        if (!file_exists($filePath)) {
            trigger_error("错误: 文件不存在 - $filePath", E_USER_WARNING);
            return $defaultEncoding;
        }

        $file = fopen($filePath, 'rb');
        if (!$file) {
            trigger_error("错误: 无法打开文件 - $filePath", E_USER_WARNING);
            return $defaultEncoding;
        }

        $bom = fread($file, 4);
        fclose($file);
        if ($bom === false) {
            $bom = ''; // read failure: treat as "no BOM"
        }

        // Longest signatures first: the UTF-32LE BOM (FF FE 00 00) begins with
        // the UTF-16LE BOM (FF FE) and must be tested before it.
        $signatures = array(
            array("\x00\x00\xFE\xFF", self::UTF32_BE),
            array("\xFF\xFE\x00\x00", self::UTF32_LE),
            array("\xEF\xBB\xBF", self::UTF8),
            array("\xFF\xFE", self::UTF16_LE),
            array("\xFE\xFF", self::UTF16_BE),
        );
        foreach ($signatures as $signature) {
            if (substr($bom, 0, strlen($signature[0])) === $signature[0]) {
                return $signature[1];
            }
        }

        // No BOM: fall back to content heuristics.
        return self::detectEncodingWithoutBOM($filePath, $defaultEncoding);
    }

    /**
     * Guesses the encoding of a file that carries no byte-order mark.
     *
     * Text without NUL bytes is tested for UTF-8 first and GBK second (UTF-8 CJK
     * bytes also fall inside the GBK ranges). Text containing NUL bytes is
     * treated as a UTF-16 candidate and its endianness is inferred from which
     * half of each 16-bit unit is zero. BOM-less UTF-16 text without any NUL
     * byte cannot be recognised by this heuristic.
     *
     * @param string $filePath        Path of the file to inspect.
     * @param string $defaultEncoding Encoding returned when nothing matches.
     * @return string The guessed encoding, or $defaultEncoding.
     */
    private static function detectEncodingWithoutBOM($filePath, $defaultEncoding) {
        $file = fopen($filePath, 'rb');
        if (!$file) {
            trigger_error("错误: 无法打开文件 - $filePath", E_USER_WARNING);
            return $defaultEncoding;
        }

        $buffer = fread($file, self::SAMPLE_SIZE);
        fclose($file);

        if ($buffer === false || $buffer === '') {
            return $defaultEncoding;
        }

        if (strpos($buffer, "\0") === false) {
            // 8-bit text: pure ASCII matches neither test and keeps the default.
            if (self::looksLikeUtf8($buffer)) {
                return self::UTF8;
            }
            if (self::looksLikeGbk($buffer)) {
                return self::GBK;
            }
            return $defaultEncoding;
        }

        // NUL bytes present: count 16-bit units whose high half is zero.
        $littleEndianPairs = 0;
        $bigEndianPairs = 0;
        $length = strlen($buffer);
        for ($i = 0; $i + 1 < $length; $i += 2) {
            $firstIsNul = $buffer[$i] === "\0";
            $secondIsNul = $buffer[$i + 1] === "\0";
            if (!$firstIsNul && $secondIsNul) {
                $littleEndianPairs++; // low byte first
            } elseif ($firstIsNul && !$secondIsNul) {
                $bigEndianPairs++; // high byte first
            }
        }

        if ($littleEndianPairs > $bigEndianPairs * 2) {
            return self::UTF16_LE;
        }
        if ($bigEndianPairs > $littleEndianPairs * 2) {
            return self::UTF16_BE;
        }

        // Undetermined: fall back to the caller's default.
        return $defaultEncoding;
    }

    /**
     * Returns true when $buffer contains at least one multi-byte sequence and
     * all non-ASCII bytes follow the UTF-8 lead/continuation structure. A
     * sequence cut off by the end of the sample is tolerated.
     *
     * @param string $buffer Raw bytes.
     * @return bool
     */
    private static function looksLikeUtf8($buffer) {
        $length = strlen($buffer);
        $sawMultiByte = false;
        $i = 0;
        while ($i < $length) {
            $lead = ord($buffer[$i]);
            if ($lead < 0x80) {
                $i++;
                continue;
            }

            if ($lead >= 0xC2 && $lead <= 0xDF) {
                $continuationCount = 1;
            } elseif ($lead >= 0xE0 && $lead <= 0xEF) {
                $continuationCount = 2;
            } elseif ($lead >= 0xF0 && $lead <= 0xF4) {
                $continuationCount = 3;
            } else {
                return false; // stray continuation byte or invalid lead byte
            }

            for ($k = 1; $k <= $continuationCount; $k++) {
                if ($i + $k >= $length) {
                    return $sawMultiByte; // sample ended inside a sequence
                }
                if ((ord($buffer[$i + $k]) & 0xC0) !== 0x80) {
                    return false;
                }
            }
            $sawMultiByte = true;
            $i += $continuationCount + 1;
        }
        return $sawMultiByte;
    }

    /**
     * Returns true when more than half of the GBK lead-byte candidates
     * (0x81-0xFE) are followed by a valid trail byte (0x40-0xFE except 0x7F).
     *
     * @param string $buffer Raw bytes.
     * @return bool
     */
    private static function looksLikeGbk($buffer) {
        $length = strlen($buffer);
        $leadBytes = 0;
        $validPairs = 0;
        for ($i = 0; $i + 1 < $length; $i++) {
            $lead = ord($buffer[$i]);
            if ($lead < 0x81 || $lead > 0xFE) {
                continue;
            }
            $leadBytes++;
            $trail = ord($buffer[$i + 1]);
            if ($trail >= 0x40 && $trail <= 0xFE && $trail !== 0x7F) {
                $validPairs++;
                $i++; // the trail byte belongs to this character
            }
        }
        return $leadBytes > 0 && $validPairs * 2 > $leadBytes;
    }
}
?>

aspx 代码

cs 复制代码

using System;
using System.IO;
using System.Text;
using System.Web.UI;

namespace YourNamespace
{
    /// <summary>
    /// Web Forms page that detects the encoding of a server-side file and shows
    /// the result plus a preview of the decoded content. It relies on the
    /// <c>filePath</c>, <c>resultLabel</c> and <c>fileContent</c> controls, which
    /// are declared outside this file (markup/designer part of the partial class).
    /// </summary>
    public partial class EncodingDetector : System.Web.UI.Page
    {
        /// <summary>Maximum number of leading bytes sampled for BOM-less detection.</summary>
        private const int SampleSize = 8192;

        /// <summary>
        /// Click handler: maps the entered virtual path to a physical file,
        /// detects its encoding and displays up to 5000 characters of content.
        /// </summary>
        protected void DetectButton_Click(object sender, EventArgs e)
        {
            // The local must not be called "filePath": that name would hide the
            // page's input control inside its own initializer and fail to compile.
            string requestedPath = this.filePath.Text.Trim();

            if (string.IsNullOrEmpty(requestedPath))
            {
                resultLabel.Text = "错误: 文件不存在!";
                fileContent.Text = "";
                return;
            }

            try
            {
                // NOTE(security): requestedPath is user input. Server.MapPath keeps
                // it inside the application root, but any file of the site
                // (configuration included) stays reachable - restrict to an
                // allow-listed directory before exposing this page.
                string physicalPath = Server.MapPath(requestedPath);
                if (!File.Exists(physicalPath))
                {
                    resultLabel.Text = "错误: 文件不存在!";
                    fileContent.Text = "";
                    return;
                }

                Encoding encoding = DetectFileEncoding(physicalPath);
                resultLabel.Text = $"检测到的编码: {encoding.EncodingName} ({encoding.WebName})";

                // Decode with the detected encoding and cap the preview length.
                string content = File.ReadAllText(physicalPath, encoding);
                fileContent.Text = content.Length > 5000 ? content.Substring(0, 5000) + "..." : content;
            }
            catch (Exception ex)
            {
                resultLabel.Text = $"错误: {ex.Message}";
                fileContent.Text = "";
            }
        }

        /// <summary>
        /// Detects the encoding of a file, preferring a byte-order mark.
        /// Returns UTF-8 when detection fails or an error occurs.
        /// </summary>
        /// <param name="filePath">Physical path of the file to inspect.</param>
        private Encoding DetectFileEncoding(string filePath)
        {
            try
            {
                byte[] bom = new byte[4];
                int bytesRead;
                using (FileStream fs = File.OpenRead(filePath))
                {
                    bytesRead = ReadFully(fs, bom, bom.Length);
                }

                // Four-byte signatures first: the UTF-32LE BOM (FF FE 00 00)
                // starts with the UTF-16LE BOM (FF FE).
                if (StartsWith(bom, bytesRead, 0xFF, 0xFE, 0x00, 0x00))
                {
                    return Encoding.UTF32; // UTF-32LE
                }
                if (StartsWith(bom, bytesRead, 0x00, 0x00, 0xFE, 0xFF))
                {
                    // Encoding.UTF32 is little-endian; build the big-endian variant.
                    return new UTF32Encoding(bigEndian: true, byteOrderMark: true); // UTF-32BE
                }
                if (StartsWith(bom, bytesRead, 0xEF, 0xBB, 0xBF))
                {
                    return Encoding.UTF8; // UTF-8 with BOM
                }
                if (StartsWith(bom, bytesRead, 0xFF, 0xFE))
                {
                    return Encoding.Unicode; // UTF-16LE
                }
                if (StartsWith(bom, bytesRead, 0xFE, 0xFF))
                {
                    return Encoding.BigEndianUnicode; // UTF-16BE
                }

                // No BOM: fall back to content heuristics.
                return DetectEncodingWithoutBOM(filePath);
            }
            catch (Exception ex)
            {
                // Exception text may contain markup characters; encode before writing.
                Response.Write($"警告: 检测编码时出错 - {Server.HtmlEncode(ex.Message)}<br/>");
                return Encoding.UTF8;
            }
        }

        /// <summary>
        /// Guesses the encoding of a file without a byte-order mark. Text without
        /// NUL bytes is tested for UTF-8 first and GBK second; text with NUL bytes
        /// is treated as a UTF-16 candidate whose endianness is inferred from
        /// which half of each 16-bit unit is zero. Returns UTF-8 when undecided.
        /// </summary>
        /// <param name="filePath">Physical path of the file to inspect.</param>
        private Encoding DetectEncodingWithoutBOM(string filePath)
        {
            try
            {
                byte[] buffer;
                int bytesRead;
                using (FileStream fs = File.OpenRead(filePath))
                {
                    buffer = new byte[Math.Min(fs.Length, SampleSize)];
                    bytesRead = ReadFully(fs, buffer, buffer.Length);
                }

                if (bytesRead == 0)
                {
                    return Encoding.UTF8;
                }

                bool containsNul = Array.IndexOf(buffer, (byte)0, 0, bytesRead) >= 0;
                if (!containsNul)
                {
                    // UTF-8 is tested first because UTF-8 CJK bytes also fall
                    // inside the GBK lead/trail ranges.
                    if (LooksLikeUtf8(buffer, bytesRead))
                    {
                        return Encoding.UTF8;
                    }

                    if (LooksLikeGbk(buffer, bytesRead))
                    {
                        try
                        {
                            return Encoding.GetEncoding("GBK");
                        }
                        catch (Exception ex) when (ex is ArgumentException || ex is NotSupportedException)
                        {
                            // GBK is not available on this system; use the default.
                        }
                    }

                    return Encoding.UTF8;
                }

                // NUL bytes present: count 16-bit units whose high half is zero.
                int littleEndianPairs = 0;
                int bigEndianPairs = 0;
                for (int i = 0; i + 1 < bytesRead; i += 2)
                {
                    bool firstIsNul = buffer[i] == 0;
                    bool secondIsNul = buffer[i + 1] == 0;
                    if (!firstIsNul && secondIsNul)
                    {
                        littleEndianPairs++; // low byte first
                    }
                    else if (firstIsNul && !secondIsNul)
                    {
                        bigEndianPairs++; // high byte first
                    }
                }

                if (littleEndianPairs > bigEndianPairs * 2)
                {
                    return Encoding.Unicode; // UTF-16LE
                }
                if (bigEndianPairs > littleEndianPairs * 2)
                {
                    return Encoding.BigEndianUnicode; // UTF-16BE
                }

                // Undetermined: use the default encoding.
                return Encoding.UTF8;
            }
            catch (Exception ex)
            {
                // Exception text may contain markup characters; encode before writing.
                Response.Write($"警告: 检测编码时出错 - {Server.HtmlEncode(ex.Message)}<br/>");
                return Encoding.UTF8;
            }
        }

        /// <summary>
        /// Reads up to <paramref name="count"/> bytes into <paramref name="buffer"/>,
        /// looping over short reads, and returns the number of bytes obtained.
        /// </summary>
        private static int ReadFully(Stream stream, byte[] buffer, int count)
        {
            int total = 0;
            while (total < count)
            {
                int read = stream.Read(buffer, total, count - total);
                if (read <= 0)
                {
                    break;
                }
                total += read;
            }
            return total;
        }

        /// <summary>
        /// Returns true when the first <paramref name="length"/> bytes of
        /// <paramref name="data"/> begin with <paramref name="signature"/>.
        /// </summary>
        private static bool StartsWith(byte[] data, int length, params byte[] signature)
        {
            if (length < signature.Length)
            {
                return false;
            }
            for (int i = 0; i < signature.Length; i++)
            {
                if (data[i] != signature[i])
                {
                    return false;
                }
            }
            return true;
        }

        /// <summary>
        /// Returns true when the sample contains at least one multi-byte sequence
        /// and all non-ASCII bytes follow the UTF-8 lead/continuation structure.
        /// A sequence cut off by the end of the sample is tolerated.
        /// </summary>
        private static bool LooksLikeUtf8(byte[] data, int length)
        {
            bool sawMultiByte = false;
            int i = 0;
            while (i < length)
            {
                byte lead = data[i];
                if (lead < 0x80)
                {
                    i++;
                    continue;
                }

                int continuationCount;
                if (lead >= 0xC2 && lead <= 0xDF)
                {
                    continuationCount = 1;
                }
                else if (lead >= 0xE0 && lead <= 0xEF)
                {
                    continuationCount = 2;
                }
                else if (lead >= 0xF0 && lead <= 0xF4)
                {
                    continuationCount = 3;
                }
                else
                {
                    return false; // stray continuation byte or invalid lead byte
                }

                for (int k = 1; k <= continuationCount; k++)
                {
                    if (i + k >= length)
                    {
                        return sawMultiByte; // sample ended inside a sequence
                    }
                    if ((data[i + k] & 0xC0) != 0x80)
                    {
                        return false;
                    }
                }

                sawMultiByte = true;
                i += continuationCount + 1;
            }
            return sawMultiByte;
        }

        /// <summary>
        /// Returns true when more than half of the GBK lead-byte candidates
        /// (0x81-0xFE) are followed by a valid trail byte (0x40-0xFE except 0x7F).
        /// </summary>
        private static bool LooksLikeGbk(byte[] data, int length)
        {
            int leadBytes = 0;
            int validPairs = 0;
            for (int i = 0; i + 1 < length; i++)
            {
                byte lead = data[i];
                if (lead < 0x81 || lead > 0xFE)
                {
                    continue;
                }

                leadBytes++;
                byte trail = data[i + 1];
                if (trail >= 0x40 && trail <= 0xFE && trail != 0x7F)
                {
                    validPairs++;
                    i++; // the trail byte belongs to this character
                }
            }
            return leadBytes > 0 && validPairs * 2 > leadBytes;
        }
    }
}

python 代码

python 复制代码

import chardet

def detect_file_encoding(file_path, default_encoding='utf-8'):
    """
    Detect the text encoding of a file.

    A byte-order mark (BOM) is checked first; without one, ``chardet`` analyses
    the first 8 KiB and its answer is used only when its confidence exceeds 0.9.

    Note: for UTF-16/UTF-32 BOMs the explicit-endian codec name is returned
    (e.g. ``'utf-16le'``); those codecs keep the BOM as a leading U+FEFF when
    decoding, unlike ``'utf-8-sig'``.

    :param file_path: path of the file to inspect
    :param default_encoding: encoding returned when detection fails or errors
    :return: lower-case encoding name
    """
    # Longest signatures first: the UTF-32LE BOM (FF FE 00 00) begins with the
    # UTF-16LE BOM (FF FE) and would otherwise be reported as UTF-16LE.
    bom_signatures = (
        (b'\x00\x00\xfe\xff', 'utf-32be'),
        (b'\xff\xfe\x00\x00', 'utf-32le'),
        (b'\xef\xbb\xbf', 'utf-8-sig'),
        (b'\xff\xfe', 'utf-16le'),
        (b'\xfe\xff', 'utf-16be'),
    )

    try:
        # One read serves both the BOM check and the chardet sample.
        with open(file_path, 'rb') as f:
            raw_data = f.read(8192)

        for signature, name in bom_signatures:
            if raw_data.startswith(signature):
                return name

        if not raw_data:
            return default_encoding

        result = chardet.detect(raw_data)
        encoding = result.get('encoding')
        confidence = result.get('confidence') or 0.0

        # Trust chardet only when it is confident; otherwise use the default.
        if encoding and confidence > 0.9:
            return encoding.lower()
        return default_encoding
    except Exception as e:
        print(f"检测文件编码时出错: {e}")
        return default_encoding

def read_file_with_encoding(file_path, encoding=None):
    """
    Read a text file, falling back to common encodings when decoding fails.

    :param file_path: path of the file to read
    :param encoding: encoding to try first; detected automatically when None
    :return: decoded file content, or None when the file cannot be read
    """
    if encoding is None:
        encoding = detect_file_encoding(file_path)

    try:
        with open(file_path, 'r', encoding=encoding) as f:
            return f.read()
    except (UnicodeDecodeError, LookupError):
        # LookupError covers encoding names Python does not know (a detector may
        # report such names); they deserve the same fallbacks as a decode error.
        print(f"使用检测到的编码 {encoding} 读取文件失败，尝试使用其他编码...")
        for fallback_encoding in ('utf-8', 'gbk', 'latin-1'):
            if fallback_encoding == encoding:
                continue
            try:
                with open(file_path, 'r', encoding=fallback_encoding) as f:
                    content = f.read()
            except Exception:
                continue
            # Report success only after the whole file has actually decoded.
            print(f"使用 {fallback_encoding} 成功读取文件")
            return content
        print("无法确定正确的编码")
        return None
    except Exception as e:
        print(f"读取文件时出错: {e}")
        return None

# Usage example: detect the encoding of a sample file, then print a preview.
if __name__ == "__main__":
    sample_path = "example.txt"

    detected = detect_file_encoding(sample_path)
    print(f"检测到的编码: {detected}")

    text = read_file_with_encoding(sample_path, detected)
    if text:
        preview = text[:100]
        print(f"文件内容 (前100个字符): {preview}...")