文章目录
pinyin4j
- 添加Maven依赖
xml
<dependency>
<groupId>com.belerweb</groupId>
<artifactId>pinyin4j</artifactId>
<version>2.5.0</version>
</dependency>
- 获取文本拼音
java
/**
 * Converts text to upper-case pinyin without tones using pinyin4j.
 *
 * <p>The text is trimmed and then converted one {@code char} at a time. When
 * pinyin4j returns several readings for a character, only the first one is used.
 *
 * @param context        the text to convert
 * @param existNotPinyin whether characters for which pinyin4j returns no reading
 *                       (non-Chinese characters) are kept in the result
 * @return the upper-cased pinyin string, or {@code null} when {@code context}
 *         is {@code null} or blank
 * @throws IllegalStateException if pinyin4j rejects the configured output format
 */
private String pinyinTest(String context, boolean existNotPinyin) {
    if (context == null) {
        return null;
    }
    String trimmed = context.trim();
    if (trimmed.isEmpty()) {
        return null;
    }

    // Tone options: WITHOUT_TONE (no tone), WITH_TONE_NUMBER (numeric tone),
    // WITH_TONE_MARK (tone diacritics).
    HanyuPinyinOutputFormat outputFormat = new HanyuPinyinOutputFormat();
    outputFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE);

    StringBuilder builder = new StringBuilder(trimmed.length());
    try {
        for (char aChar : trimmed.toCharArray()) {
            String[] pinyin = PinyinHelper.toHanyuPinyinStringArray(aChar, outputFormat);
            // A null/empty result means the character is not a Chinese character.
            if (pinyin == null || pinyin.length == 0) {
                if (existNotPinyin) {
                    builder.append(aChar);
                }
                continue;
            }
            // For characters with several readings take the first (all could be used instead).
            builder.append(pinyin[0]);
        }
    } catch (BadHanyuPinyinOutputFormatCombination e) {
        // Fail loudly instead of printing the trace and returning a partial result.
        throw new IllegalStateException("Invalid pinyin output format combination", e);
    }
    // Locale.ROOT keeps the upper-casing independent of the JVM default locale
    // (e.g. a Turkish locale would turn "i" into a dotted capital I).
    return builder.toString().toUpperCase(java.util.Locale.ROOT);
}
- 测试用例
java
/**
 * Demo call of {@code pinyinTest} with non-Chinese characters excluded.
 */
@Test
public void test() {
    final String sample = "我爱罗52";
    // Expected value: WOAILUO
    String pinyin = pinyinTest(sample, false);
}
hanlp
- 添加Maven依赖
xml
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.8.4</version>
</dependency>
- 获取文本拼音
java
/**
 * Converts text to upper-case pinyin without tones using HanLP.
 *
 * @param content        the text to convert
 * @param existNotPinyin whether characters without a pinyin reading are kept in
 *                       the result; {@code null} is treated as {@code false}
 * @return the upper-cased pinyin string, or {@code null} when {@code content}
 *         is {@code null} or blank
 */
private String hanLpTest(String content, Boolean existNotPinyin) {
    // The guard must check the actual parameter, which is named "content".
    if (content == null || content.trim().isEmpty()) {
        return null;
    }
    // Boolean.TRUE.equals avoids a NullPointerException when the boxed flag is null.
    if (Boolean.TRUE.equals(existNotPinyin)) {
        // NOTE(review): with the last argument false, HanLP is expected to keep characters
        // that have no pinyin as-is (the demo prints WOAILUO52) - confirm against HanLP docs.
        return HanLP.convertToPinyinString(content, "", false).toUpperCase(java.util.Locale.ROOT);
    }

    StringBuilder builder = new StringBuilder();
    for (Pinyin pinyin : HanLP.convertToPinyinList(content)) {
        // Skip entries that carry no pinyin (null or the none5 placeholder).
        if (pinyin == null || Pinyin.none5.equals(pinyin)) {
            continue;
        }
        builder.append(pinyin.getPinyinWithoutTone());
    }
    // Locale.ROOT keeps the upper-casing independent of the JVM default locale.
    return builder.toString().toUpperCase(java.util.Locale.ROOT);
}
- 测试用例
java
/**
 * Demo call of {@code hanLpTest} with non-Chinese characters kept.
 */
@Test
public void test() {
    final String sample = "我爱罗52";
    String pinyin = hanLpTest(sample, true);
    // Expected output: WOAILUO52
    System.out.println(pinyin);
}
关键字分词
- 正则表达式
java
/**
 * Regular expression used to split a keyword into segments.
 *
 * <p>Every part of the pattern is optional, in this order: one character other
 * than {@code a o e i u v}, then {@code h}, then one of {@code i u v}, then one
 * of the listed endings ({@code ai}, {@code ei}, {@code ao}, {@code ou},
 * {@code er}, {@code an(g)}, {@code en(g)}, {@code ong}, {@code a}, {@code o},
 * {@code e}, {@code i}, {@code u}, {@code ng}, {@code n}).
 * NOTE(review): this looks like initial + medial + final of one pinyin syllable - confirm.
 */
private final String SPLIT_WORD_REG_EX = "[^aoeiuv]?h?[iuv]?(ai|ei|ao|ou|er|ang?|eng?|ong|a|o|e|i|u|ng|n)?";
- 获取分词结果
java
/**
 * Splits a keyword into consecutive segments matched by {@code SPLIT_WORD_REG_EX}.
 *
 * <p>The keyword is scanned from left to right and each successive match becomes
 * one segment. The keyword itself is not trimmed, so whitespace inside it takes
 * part in the matching.
 *
 * @param keyword the keyword to split
 * @return the list of segments in order of appearance; an empty list when
 *         {@code keyword} is {@code null} or blank
 */
private List<String> splitTest(String keyword) {
    // The guard must check the actual parameter, which is named "keyword".
    if (keyword == null || keyword.trim().isEmpty()) {
        return Collections.emptyList();
    }
    List<String> keywordList = new ArrayList<>();
    // One matcher over the whole keyword replaces the per-segment substring + new Matcher.
    Matcher matcher = Pattern.compile(SPLIT_WORD_REG_EX).matcher(keyword);
    while (matcher.find()) {
        // Every part of the pattern is optional, so an empty match means nothing is left
        // to consume; stop here instead of looping on empty matches.
        if (matcher.end() == matcher.start()) {
            break;
        }
        keywordList.add(matcher.group());
    }
    return keywordList;
}
- 测试用例
java
/**
 * Demo call of {@code splitTest} that prints the resulting segments.
 */
@Test
public void test() {
    final String sample = "我爱罗52";
    List<String> segments = splitTest(sample);
    // Expected output: [我, 爱, 罗, 5, 2]
    System.out.println(segments);
}