一、概述
本节我们进行语音合成前端中的第二步:把中文文本转换为拼音。我们分别用 Python 和 C++ 两种语言实现:Python 可以直接调用 pypinyin 库;C++ 实现是本节的重点,先根据词典进行分词,再把分词得到的每个词通过词典映射为拼音,最终拼接成拼音串。更多资料和代码可以访问 https://t.zsxq.com/qgmoN ,同时欢迎大家提出宝贵的建议,共同探讨学习。
二、python实现
import logging
import logging.config
import logging.handlers
from pypinyin import pinyin, lazy_pinyin, Style
import re
def to_pinyin(text):
    """Return the pinyin of *text* as one space-separated string.

    Syllables come from ``lazy_pinyin`` using ``Style.TONE3`` with
    ``neutral_tone_with_five=True``. After joining, the sequence
    "chong2 neng2" is rewritten to "zhong4 neng2".
    """
    syllables = lazy_pinyin(
        text,
        style=Style.TONE3,
        neutral_tone_with_five=True,
    )
    joined = " ".join(map(str, syllables))
    # Hand-written override applied on the joined string.
    return joined.replace("chong2 neng2", "zhong4 neng2")
def main():
    """Print each sample sentence followed by its pinyin conversion."""
    samples = (
        '据华尔街日报报道,问题的核心在于百事可乐和可口可乐生产其秘方浓缩糖浆的地点存在差异。',
        '这些浓缩糖浆是汽水的精华所在,通常在专门的设施中生产,然后运往装瓶厂,在那里与水、二氧化碳和甜味剂混合制成最终的饮料产品。',
        '百事可乐50多年前就开始在爱尔兰生产浓缩糖浆,选址理由是爱尔兰的低企业税率。',
    )
    for sample in samples:
        print("origin:{}".format(sample))
        print("convert:{}".format(to_pinyin(sample)))


# Run the demo only when executed as a script.
if __name__ == '__main__':
    main()
三、c++工程化
3.1 工程结构
mouse-pinyin
├── CMakeLists.txt
├── bin
│ ├── CMakeLists.txt
│ └── test-pinyin-convert.cc
├── build
│ ├── CMakeCache.txt
│ ├── CMakeFiles
│ │ ├── 3.22.1
│ │ │ ├── CMakeCCompiler.cmake
│ │ │ ├── CMakeCXXCompiler.cmake
│ │ │ ├── CMakeDetermineCompilerABI_C.bin
│ │ │ ├── CMakeDetermineCompilerABI_CXX.bin
│ │ │ ├── CMakeSystem.cmake
│ │ │ ├── CompilerIdC
│ │ │ │ ├── CMakeCCompilerId.c
│ │ │ │ ├── a.out
│ │ │ │ └── tmp
│ │ │ └── CompilerIdCXX
│ │ │ ├── CMakeCXXCompilerId.cpp
│ │ │ ├── a.out
│ │ │ └── tmp
│ │ ├── CMakeDirectoryInformation.cmake
│ │ ├── CMakeOutput.log
│ │ ├── CMakeRuleHashes.txt
│ │ ├── CMakeTmp
│ │ ├── Makefile.cmake
│ │ ├── Makefile2
│ │ ├── TargetDirectories.txt
│ │ ├── cmake.check_cache
│ │ ├── libmouse-pinyin.a.dir
│ │ │ ├── DependInfo.cmake
│ │ │ ├── build.make
│ │ │ ├── cmake_clean.cmake
│ │ │ ├── compiler_depend.make
│ │ │ ├── compiler_depend.ts
│ │ │ └── progress.make
│ │ └── progress.marks
│ ├── Makefile
│ ├── bin
│ │ ├── CMakeFiles
│ │ │ ├── CMakeDirectoryInformation.cmake
│ │ │ ├── progress.marks
│ │ │ └── test-pinyin-convert.dir
│ │ │ ├── DependInfo.cmake
│ │ │ ├── build.make
│ │ │ ├── cmake_clean.cmake
│ │ │ ├── compiler_depend.internal
│ │ │ ├── compiler_depend.make
│ │ │ ├── compiler_depend.ts
│ │ │ ├── depend.make
│ │ │ ├── flags.make
│ │ │ ├── link.txt
│ │ │ ├── progress.make
│ │ │ ├── test-pinyin-convert.cc.o
│ │ │ └── test-pinyin-convert.cc.o.d
│ │ ├── Makefile
│ │ ├── cmake_install.cmake
│ │ └── test-pinyin-convert
│ ├── cmake_install.cmake
│ ├── libmouse-pinyin.a
│ └── src
│ ├── CMakeFiles
│ │ ├── CMakeDirectoryInformation.cmake
│ │ ├── mouse-pinyin.dir
│ │ │ ├── DependInfo.cmake
│ │ │ ├── build.make
│ │ │ ├── cmake_clean.cmake
│ │ │ ├── cmake_clean_target.cmake
│ │ │ ├── compiler_depend.internal
│ │ │ ├── compiler_depend.make
│ │ │ ├── compiler_depend.ts
│ │ │ ├── depend.make
│ │ │ ├── flags.make
│ │ │ ├── link.txt
│ │ │ ├── mouse-pinyin-alphabet.cc.o
│ │ │ ├── mouse-pinyin-alphabet.cc.o.d
│ │ │ ├── mouse-pinyin-api.cc.o
│ │ │ ├── mouse-pinyin-api.cc.o.d
│ │ │ ├── mouse-pinyin-handle.cc.o
│ │ │ ├── mouse-pinyin-handle.cc.o.d
│ │ │ ├── mouse-pinyin-log.cc.o
│ │ │ ├── mouse-pinyin-log.cc.o.d
│ │ │ ├── mouse-pinyin-model.cc.o
│ │ │ ├── mouse-pinyin-model.cc.o.d
│ │ │ ├── mouse-pinyin-pinyin.cc.o
│ │ │ ├── mouse-pinyin-pinyin.cc.o.d
│ │ │ ├── mouse-pinyin-split.cc.o
│ │ │ ├── mouse-pinyin-split.cc.o.d
│ │ │ └── progress.make
│ │ └── progress.marks
│ ├── Makefile
│ ├── cmake_install.cmake
│ └── libmouse-pinyin.a
├── src
│ ├── CMakeLists.txt
│ ├── mouse-pinyin-alphabet.cc
│ ├── mouse-pinyin-alphabet.h
│ ├── mouse-pinyin-api.cc
│ ├── mouse-pinyin-api.h
│ ├── mouse-pinyin-handle.cc
│ ├── mouse-pinyin-handle.h
│ ├── mouse-pinyin-log.cc
│ ├── mouse-pinyin-log.h
│ ├── mouse-pinyin-model.cc
│ ├── mouse-pinyin-model.h
│ ├── mouse-pinyin-pinyin.cc
│ ├── mouse-pinyin-pinyin.h
│ ├── mouse-pinyin-split.cc
│ └── mouse-pinyin-split.h
└── test
├── lexicon.txt
├── print_char.py
├── test-pinyin-convert -> ../build/bin/test-pinyin-convert
└── test.txt
3.2 核心代码
(1) mouse-pinyin-api.h
/*
*@author : aflyingwolf
*@date : 2025.4.20
*@file : mouse-pinyin-api.h
* */
#ifndef __MOUSE_PINYIN_API_H__
#define __MOUSE_PINYIN_API_H__
#ifdef __cplusplus
extern "C"
{
#endif
/* Fixed-size output record filled in by api_process_mouse_pinyin(). */
typedef struct _pinyin_result {
    /* Pinyin text. The producer zero-fills the buffer and copies at most
     * 2047 bytes into it, so it is always NUL-terminated. */
    char result[2048];
    /* Number of bytes copied into `result`. */
    int len;
} pinyin_result;
/* Create a model resource and load it from `path`; returns an opaque pointer
 * that must be released with api_destroy_mouse_pinyin_res(). */
void *api_create_mouse_pinyin_res(const char *path);
/* Delete a resource created by api_create_mouse_pinyin_res(). */
void api_destroy_mouse_pinyin_res(void *res);
/* Create a processing handle bound to the resource `res`; returns an opaque
 * pointer that must be released with api_destroy_mouse_pinyin_handle(). */
void *api_create_mouse_pinyin_handle(void *res);
/* Convert `text` to pinyin and store it in `*res`. The text is truncated to
 * fit the fixed result buffer. */
int api_process_mouse_pinyin(void *handle, const char *text, pinyin_result *res);
/* Delete a handle created by api_create_mouse_pinyin_handle(). */
void api_destroy_mouse_pinyin_handle(void *handle);
#ifdef __cplusplus
}
#endif
#endif
/*
*@author : aflyingwolf
*@date : 2025.4.20
*@file : mouse-pinyin-api.cc
* */
#include "mouse-pinyin-api.h"
#include "mouse-pinyin-handle.h"
#include <string.h>
using namespace mouse_pinyin;
void *api_create_mouse_pinyin_res(const char *path) {
PinyinModel *model = new PinyinModel();
model->Load(path);
return (void*)model;
}
void api_destroy_mouse_pinyin_res(void *res) {
PinyinModel *res_ = (PinyinModel*)res;
delete res_;
}
void *api_create_mouse_pinyin_handle(void *res) {
PinyinModel *res_ = (PinyinModel*)res;
PinyinHandle *handle = new PinyinHandle(res_);
return (void*)handle;
}
void api_destroy_mouse_pinyin_handle(void *handle) {
PinyinHandle *handle_ = (PinyinHandle*)handle;
delete handle_;
}
int api_process_mouse_pinyin(void *handle, const char *text, pinyin_result *res) {
PinyinHandle *handle_ = (PinyinHandle*)handle;
std::string ret = handle_->Process(text);
memset(res->result, 0, 2048);
if (ret.size() < 2048) {
memcpy(res->result, ret.c_str(), ret.size());
res->len = ret.size();
} else {
memcpy(res->result, ret.c_str(), 2047);
res->len = 2047;
}
return 0;
}
(3) mouse-pinyin-handle.h
/*
*@author : aflyingwolf
*@date : 2025.4.20
*@file : mouse-pinyin-handle.h
* */
#ifndef __MOUSE_PINYIN_HANDLE_H__
#define __MOUSE_PINYIN_HANDLE_H__
#include <stdio.h>
#include <map>
#include <string>
#include <vector>
#include "mouse-pinyin-model.h"
namespace mouse_pinyin {
// Per-caller front end that turns text into a pinyin string using the
// segmenter and lexicon held by a PinyinModel.
class PinyinHandle {
public:
    // Stores `res` without taking ownership; the destructor does not delete it.
    PinyinHandle(PinyinModel *res);
    ~PinyinHandle();
    // Segments `text` into words, looks up the pinyin of each word, and
    // returns the pinyin joined by single spaces.
    std::string Process(std::string text);
private:
    // Splits `str` at every character contained in `pattern`, dropping empty
    // pieces. Returns an empty vector when `pattern` is empty.
    std::vector<std::string> Split(const std::string str, const std::string pattern);
private:
    PinyinModel *res_;  // shared model, not owned by this handle
}; // end class
}
#endif
/*
*@author : aflyingwolf
*@date : 2025.4.20
*@file : mouse-pinyin-handle.cc
* */
#include "mouse-pinyin-handle.h"
#include "mouse-pinyin-log.h"
#include <string.h>
#include <stdlib.h>
namespace mouse_pinyin {
// Remember the shared model; the handle never takes ownership of it.
PinyinHandle::PinyinHandle(PinyinModel *res)
    : res_(res) {
}

// Nothing to release: the model is owned by whoever created it.
PinyinHandle::~PinyinHandle() {
}
// Convert `text` to a pinyin string.
//
// The text is first segmented into space-separated words by the model's
// splitter. Each word is then looked up in the pinyin lexicon, and the
// non-empty results are joined with single spaces.
//
// Words without a lexicon entry contribute nothing. The separator is emitted
// only between two non-empty results, so the output never starts with a
// space even when the first word is unknown.
std::string PinyinHandle::Process(std::string text) {
    std::string split_res = res_->split_handle_->Split(text.c_str());
    LOG_INFO("split_res:%s", split_res.c_str());

    std::vector<std::string> split_vec = this->Split(split_res, " ");
    std::string result;
    for (size_t i = 0; i < split_vec.size(); i++) {
        LOG_INFO("words:%s", split_vec[i].c_str());
        std::string local_result = res_->pinyin_handle_->GetPinyin(split_vec[i].c_str());
        if (local_result.empty()) {
            continue;
        }
        // Key the separator on what has been written so far, not on the index.
        if (!result.empty()) {
            result += " ";
        }
        result += local_result;
    }
    LOG_INFO("pinyin_res:%s", result.c_str());
    return result;
}
// Split `str` at every character that appears in `pattern`.
//
// Empty pieces (consecutive, leading or trailing delimiters) are dropped.
// Returns an empty vector when `pattern` is empty.
//
// Positions are kept as std::string::size_type so the comparison with npos
// is exact and does not depend on narrowing the result to int.
std::vector<std::string> PinyinHandle::Split(const std::string str, const std::string pattern) {
    std::vector<std::string> ret;
    if (pattern.empty())
        return ret;

    std::string::size_type start = 0;
    std::string::size_type index = str.find_first_of(pattern, start);
    while (index != std::string::npos) {
        if (index != start) {
            ret.push_back(str.substr(start, index - start));
        }
        start = index + 1;
        index = str.find_first_of(pattern, start);
    }
    // Remainder after the last delimiter, if any.
    if (start < str.size()) {
        ret.push_back(str.substr(start));
    }
    return ret;
}
}
/*
*@author : aflyingwolf
*@date : 2025.4.20
*@file : mouse-pinyin-pinyin.cc
* */
#include "mouse-pinyin-pinyin.h"
#include "mouse-pinyin-log.h"
#include <string.h>
#include <stdlib.h>
#include <vector>
#include <string>
namespace mouse_pinyin {
// The lexicon map starts out empty; LoadSyms() fills it.
PinyinPinyin::PinyinPinyin() {
}

// Delegates teardown to Destroy().
PinyinPinyin::~PinyinPinyin() {
    this->Destroy();
}
int PinyinPinyin::LoadSyms(const char *filename) {
#define MAX_LINE_LEN 512
LOG_INFO("pinyin load lexicon start.");
char line[MAX_LINE_LEN];
FILE *fp = fopen(filename, "r");
if (fp == NULL) {
LOG_ERROR("open file failed.");
return -1;
}
memset(line, 0, MAX_LINE_LEN);
while (fgets(line, MAX_LINE_LEN, fp)) {
/*char syms[MAX_LINE_LEN];
memset(syms, 0, MAX_LINE_LEN);
char pinyin[MAX_LINE_LEN];
memset(pinyin, 0, MAX_LINE_LEN);
if (sscanf(line, "%s\t%s\n", syms, pinyin) != 2) {
LOG_ERROR("line:%s error\n", line);
return -1;
}*/
if (line[strlen(line)-1] == '\n') {
line[strlen(line)-1] = 0;
}
std::vector<std::string> element = Split(line, "\t");
std::pair<std::string, std::string> elem(element[0], element[1]);
str2pinyin_.insert(elem);
memset(line, 0, MAX_LINE_LEN);
}
fclose(fp);
LOG_INFO("pinyin load lexicon finish.");
return 0;
}
// Look up the pinyin stored for `words`.
// Returns the mapped string, or an empty string (after logging) when the
// lexicon has no entry for it.
std::string PinyinPinyin::GetPinyin(const char *words) {
    std::map<std::string, std::string>::iterator hit = str2pinyin_.find(words);
    if (hit != str2pinyin_.end()) {
        return hit->second;
    }
    LOG_ERROR("not find %s", words);
    return "";
}
// Teardown hook invoked by the destructor. Currently a no-op: the lexicon
// map releases its own storage.
void PinyinPinyin::Destroy() {
}
// Split `str` at every character that appears in `pattern`.
//
// Empty pieces (consecutive, leading or trailing delimiters) are dropped.
// Returns an empty vector when `pattern` is empty.
//
// Positions are kept as std::string::size_type so the comparison with npos
// is exact and does not depend on narrowing the result to int.
std::vector<std::string> PinyinPinyin::Split(const std::string str, const std::string pattern) {
    std::vector<std::string> ret;
    if (pattern.empty())
        return ret;

    std::string::size_type start = 0;
    std::string::size_type index = str.find_first_of(pattern, start);
    while (index != std::string::npos) {
        if (index != start) {
            ret.push_back(str.substr(start, index - start));
        }
        start = index + 1;
        index = str.find_first_of(pattern, start);
    }
    // Remainder after the last delimiter, if any.
    if (start < str.size()) {
        ret.push_back(str.substr(start));
    }
    return ret;
}
}
(6) mouse-pinyin-pinyin.h
/*
*@author : aflyingwolf
*@date : 2025.4.20
*@file : mouse-pinyin-pinyin.h
* */
#ifndef __MOUSE_PINYIN_PINYIN_H__
#define __MOUSE_PINYIN_PINYIN_H__
#include <stdio.h>
#include <map>
#include <string>
#include <vector>
namespace mouse_pinyin {
// Word -> pinyin lexicon backed by a std::map.
class PinyinPinyin {
public:
    PinyinPinyin();
    ~PinyinPinyin();
    // Reads tab-separated "<word>\t<pinyin>" lines from `filename` into the
    // map. Returns 0 on success, -1 if the file cannot be opened.
    int LoadSyms(const char *filename);
    // Returns the pinyin stored for `words`, or "" when there is no entry.
    std::string GetPinyin(const char *words);
private:
    // Called by the destructor.
    void Destroy();
    // Splits `str` at every character contained in `pattern`, dropping empty
    // pieces.
    std::vector<std::string> Split(const std::string str, const std::string pattern);
private:
    std::map<std::string, std::string> str2pinyin_;  // word -> pinyin
}; // end class
}
#endif
/*
*@author : aflyingwolf
*@date : 2025.4.20
*@file : mouse-pinyin-split.cc
* */
#include "mouse-pinyin-split.h"
#include "mouse-pinyin-log.h"
#include <string.h>
#include <stdlib.h>
namespace mouse_pinyin {
// Both trie roots start out unallocated; Init() creates them on demand.
PinyinSplit::PinyinSplit()
    : m_forward_roots(NULL),
      m_backward_roots(NULL) {
}

// Frees both tries through Destroy().
PinyinSplit::~PinyinSplit() {
    this->Destroy();
}
// Allocate whichever trie root does not exist yet. Always returns 0.
int PinyinSplit::Init() {
    if (!m_forward_roots) {
        m_forward_roots = new TrieLexicon();
    }
    if (!m_backward_roots) {
        m_backward_roots = new TrieLexicon();
    }
    return 0;
}
// Build the forward and backward tries from the lexicon file.
//
// Only the first whitespace-delimited field of each line (the word) is used.
// The word is cut into UTF-8 characters and inserted into both tries.
//
// Returns 0 on success, -1 when the file cannot be opened.
//
// Fixes over the previous version:
//  * blank lines are skipped: an empty word used to make `strlen(line) - 1`
//    wrap around and overrun the token array;
//  * the whole word is tokenised: the old `strlen(line) - 1` bound dropped the
//    last character whenever it was a single-byte character;
//  * a truncated multi-byte sequence at the end of the word is clamped to the
//    bytes that actually exist.
//
// NOTE: lines longer than 1023 bytes are read by fgets() in several pieces,
// and each piece is parsed as its own line.
int PinyinSplit::Load(const char *lexicon_filename){
    LOG_INFO("split load res start.");
    Init();
    FILE *fp = (lexicon_filename != NULL) ? fopen(lexicon_filename, "r") : NULL;
    if(fp == NULL){
        LOG_ERROR("fp == null.\n");
        return -1;
    }

    char raw[1024];
    char word[1024];
    // One row per UTF-8 character. A word is at most 1023 bytes, so it never
    // needs more than 1023 rows.
    char array[1024][128];
    while(fgets(raw, sizeof(raw), fp)){
        memset(word, 0, sizeof(word));
        // The width limit keeps the copy inside `word`.
        if (sscanf(raw, "%1023s", word) != 1) {
            continue;  // blank or whitespace-only line
        }

        const int len = static_cast<int>(strlen(word));
        int word_num = 0;
        for (int i = 0; i < len;) {
            int nbytes = GetNbytes(word + i);
            if (nbytes > len - i) {
                nbytes = len - i;  // malformed tail: take what is left
            }
            memset(array[word_num], 0, sizeof(array[word_num]));
            memcpy(array[word_num], word + i, nbytes);
            ++word_num;
            i += nbytes;
        }
        if (word_num == 0) {
            continue;  // never flag the trie root itself as a word
        }
        InsertForwardTree(array, word_num);
        InsertBackwardTree(array, word_num);
    }
    fclose(fp);
    LOG_INFO("split load res finish.");
    return 0;
}
int PinyinSplit::InsertForwardTree(char array[][128], int num){
TrieLexicon *pos = this->m_forward_roots;
for(int i = 0; i < num; i++){
auto is_find = pos->map.find(array[i]);
if(is_find == pos->map.end()){
std::pair<std::string, TrieLexicon*> elem(array[i], new TrieLexicon);
elem.second->parent = pos;
elem.second->value = std::string(array[i]);
pos->map.insert(elem);
is_find = pos->map.find(array[i]);
}
pos = is_find->second;
}
pos->flag = true;
return 0;
}
int PinyinSplit::InsertBackwardTree(char array[][128], int num){
TrieLexicon *pos = this->m_backward_roots;
for(int i = num - 1; i >= 0; i--){
auto is_find = pos->map.find(array[i]);
if(is_find == pos->map.end()){
std::pair<std::string, TrieLexicon*> elem(array[i], new TrieLexicon);
elem.second->parent = pos;
elem.second->value = std::string(array[i]);
pos->map.insert(elem);
is_find = pos->map.find(array[i]);
}
pos = is_find->second;
}
pos->flag = true;
return 0;
}
std::string PinyinSplit::Split(const char *line){
char array[1024][128];
int word_num = 0;
char tmpBuffer[128];
bzero(tmpBuffer, sizeof(tmpBuffer));
tmpBuffer[0] = '\n';
memcpy(array[word_num++], tmpBuffer, sizeof(tmpBuffer));
for(int i = 0; i < strlen(line);){
int nbytes = GetNbytes(line + i);
bzero(tmpBuffer, sizeof(tmpBuffer));
for(int j = 0; j < nbytes; j++){
tmpBuffer[j] = line[i+j];
}
i += nbytes;
memcpy(array[word_num++], tmpBuffer, sizeof(tmpBuffer));
}
bzero(tmpBuffer, sizeof(tmpBuffer));
tmpBuffer[0] = '\n';
memcpy(array[word_num++], tmpBuffer, sizeof(tmpBuffer));
int forward_num = ForwardSplit(array + 1, word_num -1);
int backward_num = BackwardSplit(array, word_num - 1);
if (forward_num > 10000 && backward_num > 10000) {
return "";
} else if(forward_num <= backward_num){
return m_forward_line;
}else{
return m_backward_line;
}
return "";
}
// Segment tokens array[0..num) from left to right using the forward trie.
//
// The loop descends the trie for as long as consecutive tokens match. On the
// first mismatch it walks back up the parent chain to the deepest node
// flagged as a word end, emits that word, and rewinds `i` by one token for
// every unflagged node it passed on the way up. Emitted words are appended
// to m_forward_line, each followed by a space.
//
// Split() passes a trailing "\n" sentinel token. A token starting with '\n'
// that is seen at the root is routed to the back-off path rather than the
// plain "unknown token" skip.
//
// Returns the number of words emitted, or 100000 when the iteration guard
// (max_loop_size) trips.
int PinyinSplit::ForwardSplit(char array[][128], int num){
    auto pos = m_forward_roots;   // current trie node; the root means "between words"
    int word_num = 0;
    m_forward_line = "";
    int loop_size = 0;            // total iterations of both loops
    int max_loop_size = 100000;   // guard against an endless rewind/advance cycle
    for(int i = 0; i < num;){
        loop_size++;
        if (loop_size > max_loop_size) {
            return 100000;
        }
        auto is_find = pos->map.find(array[i]);
        if(is_find != pos->map.end()){
            // Token continues the current path: descend and consume it.
            pos = is_find->second;
            i++;
        }else{
            if(pos == m_forward_roots && array[i][0] != '\n'){
                // No word starts with this token: log it and skip it.
                LOG_ERROR("%s not exit.", array[i]);
                i++;
                continue;
            }
            // Mismatch: back off from `pos` toward the root.
            TrieLexicon *tmp = pos;
            bool tmp_flag = false;   // becomes true once a word-end node is reached
            std::string str;
            while(tmp){
                loop_size++;
                if (loop_size > max_loop_size) {
                    return 100000;
                }
                if(tmp_flag || tmp->flag){// || tmp == m_forward_roots){
                    tmp_flag = true;
                }else if (tmp == m_forward_roots) {
                    ;
                }else{
                    // Node below the root that is not part of a word: give its token back.
                    i--;
                }
                if(tmp_flag == true){
                    // Walking upward, so prepend to keep reading order.
                    str = tmp->value + str;
                }
                tmp = tmp->parent;
            }
            if(tmp_flag == false){
                // No prefix of the matched path is a word; `i` is back at the
                // token where this attempt started, so log it and skip it.
                LOG_ERROR("%s not exit.", array[i]);
                i++;
            } else {
                m_forward_line += str + " ";
                word_num++;
            }
            pos = m_forward_roots;
            if(i >= num - 1){
                // Only the last token (or nothing) is left: stop.
                break;
            }
        }
    }
    return word_num;
}
// Segment tokens array[0..num) from right to left using the backward trie.
//
// This mirrors ForwardSplit(). The loop descends the backward trie while
// consecutive tokens, read right to left, match. On a mismatch it walks back
// up the parent chain to the deepest node flagged as a word end, emits that
// word, and moves `i` forward by one token for every unflagged node it
// passed. Emitted words are prepended to m_backward_line, each followed by a
// space, so the line ends up in reading order.
//
// Split() passes a leading "\n" sentinel token as array[0]. A token starting
// with '\n' that is seen at the root is routed to the back-off path rather
// than the plain "unknown token" skip.
//
// Returns the number of words emitted, or 100000 when the iteration guard
// (max_loop_size) trips.
int PinyinSplit::BackwardSplit(char array[][128], int num){
    auto pos = m_backward_roots;  // current trie node; the root means "between words"
    int word_num = 0;
    m_backward_line = "";
    int loop_size = 0;            // total iterations of both loops
    int max_loop_size = 100000;   // guard against an endless rewind/advance cycle
    for(int i = num - 1; i >= 0;){
        loop_size++;
        if (loop_size > max_loop_size) {
            return 100000;
        }
        auto is_find = pos->map.find(array[i]);
        if(is_find != pos->map.end()){
            // Token continues the current path: descend and consume it.
            pos = is_find->second;
            i--;
        }else{
            if(pos == m_backward_roots && array[i][0] != '\n'){
                // No word ends with this token: log it and skip it.
                LOG_ERROR("%s not exit.", array[i]);
                i--;
                continue;
            }
            // Mismatch: back off from `pos` toward the root.
            TrieLexicon *tmp = pos;
            bool tmp_flag = false;   // becomes true once a word-end node is reached
            std::string str;
            while(tmp){
                loop_size++;
                if (loop_size > max_loop_size) {
                    return 100000;
                }
                if(tmp_flag || tmp->flag){// || tmp == m_backward_roots){
                    tmp_flag = true;
                }else if(tmp == m_backward_roots){
                    ;
                }else{
                    // Node below the root that is not part of a word: give its token back.
                    i++;
                }
                if(tmp_flag == true){
                    // The deepest node holds the leftmost token, so appending
                    // while walking upward yields reading order.
                    str = str + tmp->value;
                }
                tmp = tmp->parent;
            }
            if (tmp_flag == false) {
                // No suffix of the matched path is a word; `i` is back at the
                // token where this attempt started, so log it and skip it.
                LOG_ERROR("%s not exit.", array[i]);
                i--;
            } else {
                m_backward_line = str + " " + m_backward_line;
                word_num++;
            }
            pos = m_backward_roots;
            if(i <= 0){
                // Only the first token (or nothing) is left: stop.
                break;
            }
        }
    }
    return word_num;
}
// Free both tries and reset the roots.
//
// Destroy() is public and is also called by the destructor. The roots are
// therefore cleared after deletion, so that an explicit Destroy() followed
// by destruction (or a second Destroy()) does not delete the nodes twice.
void PinyinSplit::Destroy(){
    Delete(m_forward_roots);
    m_forward_roots = NULL;
    Delete(m_backward_roots);
    m_backward_roots = NULL;
}
// Recursively free `node` and every node reachable through its child map.
// A NULL `node` is ignored.
void PinyinSplit::Delete(TrieLexicon *node) {
    if (node != NULL) {
        for (auto child = node->map.begin(); child != node->map.end(); ++child) {
            Delete(child->second);
        }
        delete node;
    }
}
// Return the byte length (1 to 4) implied by the leading byte at `buf`.
//   top bit clear                   -> 1
//   top bit set, third bit clear    -> 2
//   third bit set, fourth bit clear -> 3
//   third and fourth bits set       -> 4
// The second bit is not examined.
int PinyinSplit::GetNbytes(const char *buf){
    const unsigned char lead = static_cast<unsigned char>(buf[0]);
    if (!(lead & kFirstBitMask)) {
        return 1;
    }
    if (!(lead & kThirdBitMask)) {
        return 2;
    }
    return (lead & kFourthBitMask) ? 4 : 3;
}
}
(8) mouse-pinyin-split.h
/*
*@author : aflyingwolf
*@date : 2025.4.20
*@file : mouse-pinyin-split.h
* */
#ifndef __MOUSE_PINYIN_SPLIT_H__
#define __MOUSE_PINYIN_SPLIT_H__
#include <stdio.h>
#include <map>
#include <string>
namespace mouse_pinyin {
// One node of the segmentation trie. Each edge is labelled with a single
// token (one character, stored as a string).
typedef struct _TrieLexicon{
    // A new node is not a word end and has no parent.
    _TrieLexicon(){
        flag = false;
        parent = NULL;
    }
    std::map<std::string, struct _TrieLexicon*> map;  // children, keyed by token
    bool flag;                                        // true when a lexicon word ends at this node
    std::string value;                                // token on the edge leading to this node (empty for a root)
    struct _TrieLexicon *parent;                      // link used to walk back toward the root; NULL for a root
} TrieLexicon;
// Single-bit masks, counted from the most significant bit of a byte.
const unsigned char kFirstBitMask = 128; // 0b10000000
const unsigned char kSecondBitMask = 64; // 0b01000000
const unsigned char kThirdBitMask = 32; // 0b00100000
const unsigned char kFourthBitMask = 16; // 0b00010000
const unsigned char kFifthBitMask = 8; // 0b00001000
// Dictionary-based word segmenter. It keeps a forward and a backward trie
// built from the same lexicon and segments text with both.
class PinyinSplit{
public:
    PinyinSplit();
    ~PinyinSplit();
    // Builds both tries from the first field of every line in the lexicon
    // file. Returns 0 on success, -1 if the file cannot be opened.
    int Load(const char *lexicon_filename);
    // Segments `line` and returns the words separated by spaces. The result
    // with fewer words is chosen; ties go to the forward result.
    // NOTE(review): writes m_forward_line / m_backward_line, so concurrent
    // calls on one object presumably race. Confirm how callers share it.
    std::string Split(const char *line);
    // Frees both tries. Also called by the destructor.
    void Destroy();
private:
    // Allocates the trie roots if they do not exist yet.
    int Init();
    // Byte length of the UTF-8 character whose leading byte is buf[0].
    int GetNbytes(const char *buf);
    // Left-to-right segmentation into m_forward_line; returns the word count.
    int ForwardSplit(char array[][128], int num);
    // Inserts one word (as tokens) into the backward trie, last token first.
    int InsertBackwardTree(char array[][128], int num);
    // Inserts one word (as tokens) into the forward trie, first token first.
    int InsertForwardTree(char array[][128], int num);
    // Right-to-left segmentation into m_backward_line; returns the word count.
    int BackwardSplit(char array[][128], int num);
    // Recursively deletes `node` and its subtree.
    void Delete(TrieLexicon *node);
private:
    TrieLexicon *m_forward_roots;   // root of the forward trie
    TrieLexicon *m_backward_roots;  // root of the backward trie
    std::string m_forward_line;     // output of the last ForwardSplit()
    std::string m_backward_line;    // output of the last BackwardSplit()
};
}
#endif
(9) CMakeLists.txt
# Top-level build script: builds the static library in src/ and the demo
# binary in bin/, then copies the archive to the top of the build directory.
cmake_minimum_required(VERSION 3.22)
project("mouse-pinyin")
set(NAME "mouse-pinyin")
set(LIBNAME "lib${NAME}.a")
set(OUTPUT_LIB "lib${NAME}.a")
# NOTE(review): the build type is forced here, which overrides any
# -DCMAKE_BUILD_TYPE given on the command line.
#SET(CMAKE_BUILD_TYPE "DEBUG")
SET(CMAKE_BUILD_TYPE "RELEASE")
SET(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -fPIC -g -Wno-deprecated")
SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -fPIC -O3 -Wall -Wno-deprecated")
SET(CMAKE_C_FLAGS_RELEASE "$ENV{CFLAGS} -fPIC -O3 -Wall -Wno-deprecated")
MESSAGE(STATUS "CMAKE_CXX_FLAGS_RELEASE: ${CMAKE_CXX_FLAGS_RELEASE}")
MESSAGE(STATUS "CMAKE_C_FLAGS_RELEASE: ${CMAKE_C_FLAGS_RELEASE}")
# The C++ standard is passed as a raw compiler option for all targets.
add_compile_options(-std=c++11)
SET(BUILD_PATH ${CMAKE_BINARY_DIR})
SET(SRC_PATH ${CMAKE_SOURCE_DIR})
# NOTE(review): LIB_PATH is not referenced again in this file.
SET(LIB_PATH ${CMAKE_SOURCE_DIR}/lib)
include_directories(${SRC_PATH}/src ${SRC_PATH}/include)
link_directories(${BUILD_PATH})
add_subdirectory(src)
add_subdirectory(bin)
# Post-build step: copy src/lib<name>.a to the build root, then run it
# through an `ar -M` (MRI script) pass.
# NOTE(review): the fourth echo uses `>` rather than `>>`, so the
# CREATE/SAVE/END lines written just before it are overwritten. The script
# that actually runs is only OPEN/SAVE/END. Confirm which was intended.
ADD_CUSTOM_TARGET(${LIBNAME} ALL
COMMAND cp src/${LIBNAME} ${OUTPUT_LIB}
COMMAND echo CREATE ${OUTPUT_LIB} > ar.mac
COMMAND echo SAVE >> ar.mac
COMMAND echo END >> ar.mac
COMMAND echo OPEN ${OUTPUT_LIB} > ar.mac
COMMAND echo SAVE >> ar.mac
COMMAND echo END >> ar.mac
COMMAND ar -M < ar.mac
COMMAND rm ar.mac
WORKING_DIRECTORY ${BUILD_PATH})
# The copy step must run after the library target from src/ has been built.
ADD_DEPENDENCIES(${LIBNAME} ${NAME})
3.3 demo
#include <stdio.h>
#include <string.h>
#include "mouse-pinyin-api.h"
/*
 * Demo driver: argv[1] is the lexicon file, argv[2] is a text file with one
 * sentence per line. Lines starting with '#' are skipped; every other line
 * is converted and printed next to its pinyin.
 *
 * Returns 0 on success, -1 on bad usage or when the test file cannot be
 * opened.
 */
int main(int argc, char *argv[]) {
    if (argc < 3) {
        printf("%s input lexicon.txt and test-file\n", argv[0]);
        return -1;
    }
    void *res = api_create_mouse_pinyin_res(argv[1]);
    void *handle = api_create_mouse_pinyin_handle(res);

    FILE *fp = fopen(argv[2], "r");
    if (fp == NULL) {
        /* Without this check a missing file would hand NULL to fgets(). */
        printf("open %s failed\n", argv[2]);
        api_destroy_mouse_pinyin_handle(handle);
        api_destroy_mouse_pinyin_res(res);
        return -1;
    }

    char line[1024];
    memset(line, 0, sizeof(line));
    while (fgets(line, sizeof(line), fp) != NULL) {
        if (line[0] == '#') {
            continue;
        }
        /* Strip the trailing newline without ever indexing line[-1]. */
        size_t len = strlen(line);
        if (len > 0 && line[len - 1] == '\n') {
            line[len - 1] = 0;
        }
        pinyin_result result;
        if (api_process_mouse_pinyin(handle, line, &result) != 0) {
            printf("process failed:%s\n", line);
            continue;
        }
        printf("origin:%s\n", line);
        printf("convet:%s\n\n\n", result.result);
    }
    fclose(fp);
    api_destroy_mouse_pinyin_handle(handle);
    api_destroy_mouse_pinyin_res(res);
    return 0;
}
四、结果演示
4.1 python demo1
origin:据华尔街日报报道,问题的核心在于百事可乐和可口可乐生产其秘方浓缩糖浆的地点存在差异。
convert:ju4 hua2 er3 jie1 ri4 bao4 bao4 dao4 , wen4 ti2 de5 he2 xin1 zai4 yu2 bai3 shi4 ke3 le4 he2 ke3 kou3 ke3 le4 sheng1 chan3 qi2 mi4 fang1 nong2 suo1 tang2 jiang1 de5 di4 dian3 cun2 zai4 cha1 yi4 。
origin:这些浓缩糖浆是汽水的精华所在,通常在专门的设施中生产,然后运往装瓶厂,在那里与水、二氧化碳和甜味剂混合制成最终的饮料产品。
convert:zhe4 xie1 nong2 suo1 tang2 jiang1 shi4 qi4 shui3 de5 jing1 hua2 suo3 zai4 , tong1 chang2 zai4 zhuan1 men2 de5 she4 shi1 zhong1 sheng1 chan3 , ran2 hou4 yun4 wang3 zhuang1 ping2 chang3 , zai4 na4 li3 yu3 shui3 、 er4 yang3 hua4 tan4 he2 tian2 wei4 ji4 hun4 he2 zhi4 cheng2 zui4 zhong1 de5 yin3 liao4 chan3 pin3 。
origin:百事可乐50多年前就开始在爱尔兰生产浓缩糖浆,选址理由是爱尔兰的低企业税率。
convert:bai3 shi4 ke3 le4 50 duo1 nian2 qian2 jiu4 kai1 shi3 zai4 ai4 er3 lan2 sheng1 chan3 nong2 suo1 tang2 jiang1 , xuan3 zhi3 li3 you2 shi4 ai4 er3 lan2 de5 di1 qi3 ye4 shui4 lv4 。
4.2 c++ demo2
origin:今年的花卉长势很好
convet:jin1 nian2 de5 hua1 hui4 zhang3 shi4 hen3 hao3
origin:你太唠叨了
convet:ni3 tai4 lao2 dao1 le5
origin:如果是棵小草,即使在最好的企业里,你也长不成大树。果真如此,不如历经风雨,把自己培养成名贵花卉。
convet:ru2 guo3 shi4 ke1 xiao3 cao3 ji2 shi3 zai4 zui4 hao3 de5 qi3 ye4 li3 ni3 ye3 zhang3 bu4 cheng2 da4 shu4 guo3 zhen1 ru2 ci3 bu4 ru2 li4 jing1 feng1 yu3 ba3 zi4 ji3 pei2 yang3 cheng2 ming2 gui4 hua1 hui4
origin:"总"要为爱人着想,"经"得起爱人唠叨,"理"应对爱人谦让,男人应当"总经理"。
convet: zong3 yao4 wei4 ai4 ren5 zhuo2 xiang3 jing1 de2 qi3 ai4 ren5 lao2 dao1 li3 ying4 dui4 ai4 ren5 qian1 rang4 nan2 ren2 ying1 dang1 zong3 jing1 li3
五、总结
本节我们把中文转换为拼音,并完成了算法的工程化。通过词典映射可以解决大部分多音字问题。通常情况下,我们还需要做韵律预测,传统方法对韵律预测的依赖性尤其强。我们使用端到端模型,使韵律信息可以和声学模型一起训练,从而弱化了对独立韵律模型的依赖,这也是端到端模型的主要优势;当然,加上韵律模型后效果会更好,可控性也更强。