This is a Java implementation of Chinese word segmentation based on n-gram + CRF + HMM.
Segmentation speed reaches roughly 2 million characters per second (measured on a MacBook Air), with accuracy above 96%.
Features implemented so far include Chinese word segmentation, Chinese name recognition, user-defined dictionaries, keyword extraction, automatic summarization, and keyword tagging.
It can be applied to natural language processing tasks and suits any project with high demands on segmentation quality.

To use it, add the Maven dependency:
<dependency>
    <groupId>org.ansj</groupId>
    <artifactId>ansj_seg</artifactId>
    <version>5.1.1</version>
</dependency>
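Once the dependency is on the classpath, segmentation is a one-liner. A minimal quick-start sketch: ToAnalysis and Result are the types used in the full program below, but getTerms(), getName() and getNatureStr() are my reading of the ansj_seg 5.x accessors, so treat those as assumptions:

import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.ToAnalysis;

public class QuickStart {
    public static void main(String[] args) {
        // Segment a sentence; each Term carries the word and its part-of-speech tag.
        Result result = ToAnalysis.parse("欢迎使用ansj_seg,(ansj中文分词)");
        for (Term term : result.getTerms()) { // assumed accessor, see lead-in
            System.out.println(term.getName() + "\t" + term.getNatureStr());
        }
    }
}

The full example below reads a corpus file, segments it, and prints the most frequent one-, two- and three-character words.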
package com.example.fenci.OnlyFile;

import org.ansj.domain.Result;
import org.ansj.splitWord.analysis.ToAnalysis;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;

public class OnlyFile {

    public static void main(String[] args) throws IOException {
        OnlyFile onlyFile = new OnlyFile();
        String text = onlyFile.fileToString("D:\\train.jsonl");
        // String text = "欢迎使用ansj_seg,(ansj中文分词)在这里如果你遇到什么问题都可以联系我.我一定尽我所能.帮助大家.ansj_seg更快,更准,更自由!";

        // Keep only CJK characters; whitespace, punctuation and JSON syntax are noise here.
        text = text.replaceAll("[^\\u4e00-\\u9fa5]", "");

        Result parse = ToAnalysis.parse(text);
        // Result.toString() joins terms as "word/pos,word/pos,..."; split it back apart.
        String[] split = parse.toString().split(",");

        // Frequency maps for 1-, 2- and 3-character words.
        Map<String, Integer> map1 = new HashMap<>();
        Map<String, Integer> map2 = new HashMap<>();
        Map<String, Integer> map3 = new HashMap<>();
        for (String s : split) {
            s = s.replaceAll("[^\\u4e00-\\u9fa5]", ""); // strip the "/pos" tag
            // The original used `map.get(s) == null ? 0 : map.get(s) + 1`,
            // which undercounts every word by one; getOrDefault fixes that.
            if (s.length() == 1) map1.put(s, map1.getOrDefault(s, 0) + 1);
            if (s.length() == 2) map2.put(s, map2.getOrDefault(s, 0) + 1);
            if (s.length() == 3) map3.put(s, map3.getOrDefault(s, 0) + 1);
        }

        printTop("Top 100 single-character words:", map1, 100);
        printTop("Top 100 two-character words:", map2, 100);
        printTop("Top 100 three-character words:", map3, 100);
    }

    /** Print the n most frequent entries of a frequency map, highest count first. */
    private static void printTop(String title, Map<String, Integer> map, int n) {
        System.out.println(title);
        map.entrySet().stream()
                .sorted(Map.Entry.<String, Integer>comparingByValue(Comparator.reverseOrder()))
                .limit(n)
                .forEach(e -> System.out.println(e.getKey() + ":" + e.getValue()));
    }

    /** Read a whole file into one string, one line at a time. */
    public String fileToString(String filePath) throws IOException {
        StringBuilder result = new StringBuilder();
        try (BufferedReader br = new BufferedReader(new FileReader(filePath))) {
            String s;
            while ((s = br.readLine()) != null) {
                result.append('\n').append(s);
            }
        }
        return result.toString();
    }

    /**
     * Match a string of `type` characters containing at least one non-alphanumeric
     * (i.e. Chinese) character. Unused by main(); kept from the original, with the
     * JavaScript-style /.../ regex delimiters removed so it can actually match in Java.
     */
    public boolean strMatch(String str, int type) {
        return str.matches("^(?![A-Za-z0-9]+$)[\\u4e00-\\u9fa5A-Za-z0-9]{" + type + "}$");
    }
}
Sample output (first entries of each list):

Top 100 single-character words:
的:12036 不:6367 在:4818 人:4469 我:3885 有:3738 是:3418 上:3348 来:3088 无:2892

Top 100 two-character words:
春风:645 不知:571 万里:565 明月:533 没有:527 何处:513 秋风:390 一样:371 已经:366 江南:350 一片:346 今日:341 桃花:323

Top 100 three-character words:
为什么:69 高高的:47 洛阳城:41 洞庭湖:36 三千里:36 长安城:35 玉门关:34 岳阳楼:27 二十年:25 三十年:25 终南山:24 白帝城:23 人世间:23 淡淡的:23 老朋友:23
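The count-then-sort logic in the program above can also be expressed directly with Java streams. A standalone sketch of the same idea (plain JDK, no ansj dependency; the inline array stands in for the words produced by parse.toString().split(",")):

import java.util.Arrays;
import java.util.Comparator;
import java.util.Map;
import java.util.stream.Collectors;

public class FreqSketch {
    public static void main(String[] args) {
        // Stand-in for the segmented words; in the real program these come from the parse output.
        String[] terms = {"春风", "不知", "春风", "万里", "春风"};
        // Group equal words and count occurrences in one pass.
        Map<String, Long> freq = Arrays.stream(terms)
                .filter(s -> s.length() == 2)
                .collect(Collectors.groupingBy(s -> s, Collectors.counting()));
        // Print entries with the highest counts first.
        freq.entrySet().stream()
                .sorted(Map.Entry.<String, Long>comparingByValue(Comparator.reverseOrder()))
                .limit(100)
                .forEach(e -> System.out.println(e.getKey() + ":" + e.getValue()));
    }
}

Collectors.groupingBy with Collectors.counting() replaces the hand-rolled HashMap bookkeeping and sidesteps the off-by-one bug entirely.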
Reference: https://blog.csdn.net/u011136197/article/details/78921752