[Wit]CnOCR模型训练全流程简化记录（包括排除BUG）

stepfile:

bash 复制代码

step 00 创建数据集
目录结构
yourproject
-data
--myset
---images #存放训练图片
---dev.tsv #验证集标签 tsv格式：图片文件名\t内容
---train.tsv #训练标签 tsv格式 图片文件名\t内容
-train_config.json
-train_config_gpu.json
-fix_cnocr_encoding.py
step 01 创建虚拟环境
python -m venv .venv
.venv\Scripts\activate
step 02 安装开发包进行模型训练
pip install cnocr[dev]
step 03 将pyarrow安装为18.0.0版本，解决“当前pyarrow版本中没有PyExtensionType属性”的报错
pip install --upgrade pyarrow==18.0.0
step 04 安装albumentations==1.3.1版本，解决报错“compression_type 参数必须是 'jpeg' 或 'webp' 字符串”的问题
pip install albumentations==1.3.1
step 05 修复gbk编码问题
python -m fix_cnocr_encoding
开始训练：
cnocr train -m densenet_lite_136-gru --index-dir data/myset --train-config-fp train_config.json
step 06 可视化训练结果
wandb API key: dca541a51e980eea9bb52866363926f5ea6617edwt(请使用自己的wandb API key；测试发现powershell锁死，无法键入API key，结果可视化不可用)

train_config.json

javascript 复制代码

{
    "vocab_fp": ".venv/label_cn.txt",
    "img_folder": "data/myset/images",

    "devices": 1,
    "accelerator": "cpu",
    "epochs": 20,
    "batch_size": 4,
    "num_workers": 0,
    "pin_memory": false,
    "optimizer": "adam",
    "learning_rate": 1e-3,
    "weight_decay": 0,
    "metrics": {
        "complete_match": {},
        "cer": {}
    },
    "lr_scheduler": {
        "name": "cos_warmup",
        "min_lr_mult_factor": 0.01,
        "warmup_epochs": 0.2
    },
    "precision": 32,
    "limit_train_batches": 1.0,
    "limit_val_batches": 1.0,
    "pl_checkpoint_monitor": "val-complete_match-epoch",
    "pl_checkpoint_mode": "max"
}

train_config_gpu.json

javascript 复制代码

{
    "vocab_fp": "cnocr/label_cn.txt",
    "img_folder": "/data/jinlong/ocr_data",

    "devices": 1,
    "accelerator": "gpu",
    "epochs": 30,
    "batch_size": 32,
    "num_workers": 8,
    "pin_memory": true,
    "optimizer": "adam",
    "learning_rate": 3e-4,
    "weight_decay": 0,
    "train_bucket_size": null,
    "metrics": {
        "complete_match": {},
        "cer": {}
    },
    "lr_scheduler": {
        "name": "cos_warmup",
        "min_lr_mult_factor": 0.01,
        "warmup_epochs": 0.2,
        "milestones": [5, 10, 16, 22, 30],
        "gamma": 0.5
    },
    "precision": 16,
    "log_every_n_steps": 200,
    "limit_train_batches": 1.0,
    "limit_val_batches": 1.0,
    "pl_checkpoint_monitor": "val-complete_match-epoch",
    "pl_checkpoint_mode": "max"
}

fix_cnocr_encoding.py

python 复制代码

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Patch the installed CnOCR package so its TSV reader opens files as UTF-8.

The script rewrites every ``with open(fp)`` statement in ``cnocr/utils.py``
(the file that defines ``read_tsv_file``) to
``with open(fp, encoding="utf-8")``. Without an explicit encoding, ``open``
uses the platform's locale encoding, which is what this fix works around.

Usage (from the project directory, with the virtual environment active):

    python -m fix_cnocr_encoding

A one-time backup of the original file is stored next to it as
``utils.py.backup``. The process exits with status 0 when the file is patched
(or was already patched) and 1 when nothing could be patched.
"""
import importlib.util
import os
import shutil
import sys

# The statement to look for and the statement that replaces it.
UNPATCHED_OPEN = 'with open(fp)'
PATCHED_OPEN = 'with open(fp, encoding="utf-8")'


def find_cnocr_utils():
    """Return the path where ``cnocr/utils.py`` is expected to live.

    The Windows ``.venv`` layout next to this script is tried first, which is
    the location the script has always used. If it is missing, the ``cnocr``
    package visible to the running interpreter is located instead, so other
    venv names and non-Windows layouts work too. When nothing is found the
    ``.venv`` path is returned so the caller can report it.
    """
    default_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        '.venv', 'Lib', 'site-packages', 'cnocr', 'utils.py'
    )
    if os.path.exists(default_path):
        return default_path

    # find_spec on a top-level name locates the package without importing it.
    try:
        spec = importlib.util.find_spec('cnocr')
    except (ImportError, ValueError):
        spec = None
    if spec is not None and spec.submodule_search_locations:
        for location in spec.submodule_search_locations:
            candidate = os.path.join(location, 'utils.py')
            if os.path.exists(candidate):
                return candidate

    return default_path


def add_utf8_encoding(source):
    """Add ``encoding="utf-8"`` to bare ``with open(fp)`` statements.

    Args:
        source: Full text of the module to patch.

    Returns:
        A ``(patched_text, changed)`` tuple, where ``changed`` is the number
        of lines that were rewritten. Lines that already mention
        ``encoding=`` are left alone, and line endings are preserved.
    """
    patched_lines = []
    changed = 0
    for line in source.splitlines(keepends=True):
        if UNPATCHED_OPEN in line and 'encoding=' not in line:
            line = line.replace(UNPATCHED_OPEN, PATCHED_OPEN)
            changed += 1
        patched_lines.append(line)
    return ''.join(patched_lines), changed


def main():
    """Locate, back up and patch ``cnocr/utils.py``; return the exit status."""
    cnocr_utils_path = find_cnocr_utils()

    # Make sure the target file exists.
    if not os.path.exists(cnocr_utils_path):
        print(f"错误：找不到cnocr的utils.py文件: {cnocr_utils_path}")
        print("请检查cnocr是否正确安装在虚拟环境中")
        return 1

    print(f"找到cnocr的utils.py文件: {cnocr_utils_path}")

    # Create the backup only once so a pristine copy is never overwritten.
    backup_path = cnocr_utils_path + '.backup'
    if not os.path.exists(backup_path):
        shutil.copy2(cnocr_utils_path, backup_path)
        print(f"已创建备份文件: {backup_path}")
    else:
        print(f"备份文件已存在: {backup_path}")

    # newline='' keeps the file's original line endings intact.
    with open(cnocr_utils_path, 'r', encoding='utf-8', newline='') as f:
        content = f.read()

    patched, changed = add_utf8_encoding(content)

    # Nothing left to rewrite and the patched form is present: already done.
    if changed == 0 and PATCHED_OPEN in content:
        print("cnocr库的编码问题已经被修复过了！")
        return 0

    # Sanity check that this is the expected version of utils.py.
    if 'def read_tsv_file(' not in content:
        print("错误：在utils.py文件中找不到read_tsv_file函数")
        print("cnocr库的版本可能与预期不同")
        return 1

    # Do not claim success when there was nothing to patch.
    if changed == 0:
        print("错误：在utils.py文件中找不到需要修复的 with open(fp) 语句")
        print("cnocr库的版本可能与预期不同")
        return 1

    print("正在修复read_tsv_file函数中的编码问题...")

    with open(cnocr_utils_path, 'w', encoding='utf-8', newline='') as f:
        f.write(patched)

    print("\n修复完成！")
    print("已在read_tsv_file函数中添加了encoding='utf-8'参数")
    print(f"原始文件已备份到: {backup_path}")
    print("现在您可以尝试使用cnocr train命令了。")
    return 0


if __name__ == '__main__':
    sys.exit(main())

step 00 创建数据集

目录结构

-data

--myset

---images #存放训练图片

---dev.tsv #验证集标签 tsv格式：图片文件名\t内容

---train.tsv #训练标签 tsv格式图片文件名\t内容

step 01 创建虚拟环境

python -m venv .venv
.venv\Scripts\activate

step 02 安装开发包进行模型训练

pip install cnocr[dev]

step 03 将pyarrow安装为18.0.0版本，解决“当前pyarrow版本中没有PyExtensionType属性”的报错

pip install --upgrade pyarrow==18.0.0

step 04 安装albumentations==1.3.1版本，解决报错“compression_type 参数必须是 'jpeg' 或 'webp' 字符串”的问题

pip install albumentations==1.3.1

step 05 修复gbk编码问题

python -m fix_cnocr_encoding

开始训练：

cnocr train -m densenet_lite_136-gru --index-dir data/myset --train-config-fp train_config.json

step 06 可视化训练结果

wandb API key: dca541a51e980eea9bb52866363926f5ea6617edwt