赞
踩
本文代码依赖 Apache OpenNLP 的 jar 包,可从官方下载页获取:https://opennlp.apache.org/cgi-bin/download.cgi
其余用到的类均来自 Java 标准库,无需额外引入 jar 包。
- package com.npl.demo.utils;
-
- import java.io.IOException;
- import java.io.StreamTokenizer;
- import java.io.StringReader;
- import java.text.BreakIterator;
- import java.util.ArrayList;
- import java.util.Arrays;
- import java.util.List;
- import java.util.Locale;
- import java.util.Scanner;
- import java.util.StringTokenizer;
- import java.util.regex.Pattern;
-
- import opennlp.tools.tokenize.SimpleTokenizer;
- import opennlp.tools.tokenize.WhitespaceTokenizer;
-
-
- /**
- * Filename: NlpTokenization.java
- * Description:
- * Copyright: Copyright (c) 2019 All Rights Reserved.
- * @author: wangk
- * @version: 1.0
- * Create at: 2019年5月5日 下午4:28:56
- *
- * Modification History:
- * Date Author Version Description
- * ------------------------------------------------------------------
- * 2019年5月5日 wangk 1.0 1.0 Version
- *
- */
- public class NlpTokenization {
- static String paragraph = "Let's The first sentence. The second sentence. Let's ";
- static String[] sentences = {
- "Tim was agood neighbor. Perhaps not as good Bob "+
- "Haywood, but stille pretty good. Of course Mr. Adam "+
- "took the cake!"
- };
-
- static String chineseLanguage = "时代的碰撞|中国古典民乐与流行的相遇"; //中文可以进行正则匹配每隔字中间加一个空格,就可以进行分词了
- //代码如下
- /*String regex = "(.{1})";
- text = text.replaceAll (regex, "$1 ");*/
-
-
-
- public static void main(String[] args) {
- NlpTokenization to = new NlpTokenization();
- //to.scanner(paragraph);
- //to.split(chineseLanguage);
- //to.breakIterator(paragraph);
- //to.streamTokenizer(paragraph);
- //to.stringTokenizer(chineseLanguage);
-
- //to.textSplit(); //测试分词 性能
- to.openNlpSimpleTokenizer(chineseLanguage);
- }
-
-
- /**
- * @Description: /默认使用空格作为分隔符 java类 Scanner方法
- * @author wangk
- * @param text
- * @return
- * @date: 2019年5月5日 下午1:51:38
- */
- public List scanner(String text) {
- Scanner scanner = new Scanner(text);
- scanner.useDelimiter("[ ,.]");//设置基于字符串或模式的分隔符 --设置分隔符为空格,逗号,句号 使用正则设置
- //scanner.reset();//分隔符复位为空格
- List<String> list = new ArrayList<>();
- while(scanner.hasNext()) {
- String token = scanner.next();
- list.add(token);
- }
-
- for(String token : list) {
- System.out.println(token);
- }
- return null;
- }
-
- /**
- * @Description: 文本分词 java类 split方法
- * @author wangk
- * @param text
- * @return
- * @date: 2019年5月5日 下午1:51:30
- */
- public List split(String text) {
- String tokens[] = text.split("\\s+");
- for (String token : tokens) {
- System.out.println(token);
- }
- return null;
-
- }
-
- /**
- * @Description: 文本分词 java类 BreakIterator方法 该类可以获取各种边界
- * @author wangk
- * @param text
- * @return
- * @date: 2019年5月5日 下午1:51:19
- */
- public List breakIterator(String text) {
-
- BreakIterator wordIterator = BreakIterator.getWordInstance();
- wordIterator.setText(text);
- int boundary = wordIterator.first();
-
- while(boundary != BreakIterator.DONE) {//done为最后一个边界
- int begin = boundary;
- System.out.print(boundary+"-");
- boundary = wordIterator.next();
- int end = boundary;
- if(end == BreakIterator.DONE) break;
- System.out.println(boundary+"["+ text.substring(begin, end)+"]");
- }
- return null;
- }
-
-
- /**
- * @Description: 文本分词 java类 StreamTokenizer方法 通常基于一个文件创建,对文件中的文本分词
- * @author wangk
- * @param text
- * @return
- * @date: 2019年5月5日 下午1:50:37
- */
- public List streamTokenizer(String text) {
- StreamTokenizer tokenizer = new StreamTokenizer(new StringReader(text));
- //对于分词器会将单引号字符和双引号字符表示引用文本,由于没有对应的引号,故字符串的其他部分被忽略了
- //使用ordinaryChar方法制定那些字符串应为普通字符
- tokenizer.ordinaryChar('\'');
- tokenizer.ordinaryChar(',');
-
- boolean isEOF = false; //用来终止循环
- while(!isEOF) {
- try {
- int token = tokenizer.nextToken(); //返回词项的类型
- switch(token) {
- case StreamTokenizer.TT_EOF: //static int 流结束的一个常数
- isEOF = true;
- break;
- case StreamTokenizer.TT_EOL: //static int 行结束的一个常数
- break;
- case StreamTokenizer.TT_NUMBER: //static int 读取词项的数量
- System.out.println(tokenizer.nval); //double 如果当前词项是一个单词则存有一个数字
- break;
- case StreamTokenizer.TT_WORD: //static int 指明一个单词词项的常数
- System.out.println(tokenizer.sval); //String 如果当前词项是一个单词则存有这个词项
- break;
- default:
- System.out.println((char) token);
- }
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- return null;
-
- }
-
- /**
- * @Description: 文本分词 java类 stringTokenizer方法 可以处理热恩和来源的字符串
- * @author wangk
- * @param text
- * @return
- * @date: 2019年5月5日 下午3:05:36
- */
- public List stringTokenizer(String text) {
- StringTokenizer st = new StringTokenizer(text);
- while(st.hasMoreTokens()) {
- System.out.println(st.nextToken());
- }
- return null;
- }
-
- /**
- * @Description: 测试分词 性能
- * @author wangk
- * @date: 2019年5月5日 下午4:28:59
- */
- public void textSplit() {
- StringBuilder sb = new StringBuilder();
- for (int i = 100000; i < 100000 + 60; i++)
- sb.append(i).append(' ');
- String sample = sb.toString();
-
- int runs = 100000;
- for (int i = 0; i < 5; i++) {
- {
- long start = System.nanoTime();
- for (int r = 0; r < runs; r++) {
- StringTokenizer st = new StringTokenizer(sample);
- List<String> list = new ArrayList<String>();
- while (st.hasMoreTokens())
- list.add(st.nextToken());
- }
- long time = System.nanoTime() - start;
- System.out.printf("StringTokenizer took an average of %.1f us%n", time / runs / 1000.0);
- }
- {
- long start = System.nanoTime();
- Pattern spacePattern = Pattern.compile(" ");
- for (int r = 0; r < runs; r++) {
- List<String> list = Arrays.asList(spacePattern.split(sample, 0));
- }
- long time = System.nanoTime() - start;
- System.out.printf("Pattern.split took an average of %.1f us%n", time / runs / 1000.0);
- }
- {
- long start = System.nanoTime();
- for (int r = 0; r < runs; r++) {
- List<String> list = new ArrayList<String>();
- int pos = 0, end;
- while ((end = sample.indexOf(' ', pos)) >= 0) {
- list.add(sample.substring(pos, end));
- pos = end + 1;
- }
- }
- long time = System.nanoTime() - start;
- System.out.printf("indexOf loop took an average of %.1f us%n", time / runs / 1000.0);
- }
- }
- }
-
- /**
- * @Description: 英文标点也被作为单独项 openNlp 方法SimpleTokenizer
- * @author wangk
- * @param text
- * @return
- * @date: 2019年5月6日 上午10:36:38
- */
- public List openNlpSimpleTokenizer(String text) {
- SimpleTokenizer simpleTokenizer = SimpleTokenizer.INSTANCE;
- String regex = "(.{1})";
- text = text.replaceAll (regex, "$1 ");
- String tokens[] = simpleTokenizer.tokenize(text);
- for(String token : tokens) {
- System.out.println(token);
- }
-
- return null;
-
- }
-
-
- /**
- * @Description: 空格作为分隔符 openNlp 方法 WhitespaceTokenizer
- * @author wangk
- * @param text
- * @return
- * @date: 2019年5月6日 上午10:36:38
- */
- public List openNlpWhitespaceTokenizer(String text) {
- WhitespaceTokenizer simpleTokenizer = WhitespaceTokenizer.INSTANCE;
- String tokens[] = simpleTokenizer.tokenize(text);
- for(String token : tokens) {
- System.out.println(token);
- }
- return null;
- }
-
- }
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。