赞
踩
参考网上的实现并修改后，代码如下：
创建 StackTest 类，用于存放提取到的关键词：
- package org.jeecg.modules.bim.test;
-
/**
 * A minimal growable LIFO stack backed by an {@code Object[]}.
 *
 * <p>Used to hold the keywords that have been extracted from the input text.
 * Not thread-safe.
 *
 * @author wangdongqi
 */
public class StackTest {
    /** Backing storage; slots at index {@code >= size} are always {@code null}. */
    private Object[] stack;
    /** Number of elements currently stored. */
    private int size;

    /** Creates a stack with the default initial capacity of 10. */
    public StackTest() {
        this(10);
    }

    /**
     * Creates a stack with the given initial capacity.
     *
     * @param len initial capacity; {@code 0} is allowed because the stack grows on demand
     * @throws NegativeArraySizeException if {@code len} is negative
     */
    StackTest(int len) {
        stack = new Object[len];
    }

    /**
     * Returns the number of elements currently on the stack.
     *
     * @return the element count
     */
    public int size() {
        return size;
    }

    /** Returns the length of the backing array, i.e. the current capacity. */
    private int capacity() {
        return stack.length;
    }

    /**
     * Grows the backing array when it is full so that at least one more
     * element can be stored. The {@code + 1} guarantees growth even when the
     * current capacity is 0.
     */
    private void ensureCapacity() {
        if (size == capacity()) {
            Object[] newStack = new Object[size * 3 / 2 + 1];
            System.arraycopy(stack, 0, newStack, 0, size);
            stack = newStack;
        }
    }

    /**
     * Pushes an element onto the top of the stack.
     *
     * @param o the element to push; may be {@code null}
     */
    public void push(Object o) {
        // Make room first and only then bump the size, so a zero-capacity
        // stack grows correctly and a failed allocation leaves size untouched.
        ensureCapacity();
        stack[size++] = o;
    }

    /**
     * Tells whether the stack holds no elements.
     *
     * @return {@code true} if the stack is empty
     */
    public boolean isEmpty() {
        return size == 0;
    }

    /**
     * Removes and returns the element on top of the stack.
     *
     * @return the most recently pushed element
     * @throws ArrayIndexOutOfBoundsException if the stack is empty
     */
    public Object pop() {
        if (isEmpty()) {
            throw new ArrayIndexOutOfBoundsException("不能为空");
        }
        Object o = stack[--size];
        // Clear the slot so the popped element can be garbage-collected.
        stack[size] = null;
        return o;
    }
}
创建 Split 类，用于分词：
- package org.jeecg.modules.bim.test;
-
- import org.apache.commons.io.IOUtils;
- import org.springframework.core.io.ClassPathResource;
-
- import java.io.IOException;
- import java.io.InputStream;
- import java.nio.charset.StandardCharsets;
- import java.util.ArrayList;
- import java.util.List;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
-
- /**
- * 建立Split类
- * 设置词典内容
- * @author wangdongqi
- *
- */
- class Split {
-
- private List<String> stackOut = new ArrayList<>();
- private ClassPathResource classPathResource = new ClassPathResource("keyWord");
- private InputStream inputStream =classPathResource.getInputStream();
- private String result = IOUtils.toString(inputStream, String.valueOf(StandardCharsets.UTF_8));
- // 去除空格 回车 换行
- private Pattern pattern = Pattern.compile("\\s*|\t|\r|\n");
- private Matcher m = pattern.matcher(result);
-
- private String[] dictionary = m.replaceAll("").split(","); //取得关键词词典
- private String input = null;
-
- Split(String input) throws IOException {
- this.input = input;
- }
- //分词
- public List<String> start() {
- String temp = null;
- StackTest stack = new StackTest(20);
- for(int i=0;i<this.input.length();i++) {
- temp = this.input.substring(i);
- // 每次从字符串开头截取一个字,并存到temp中
- // 如果该词在词典中, 则删除该词并在原始字符串中截取该词
- if(this.isInDictionary(temp)) {
- stack.push(temp); //入栈
- this.input = this.input.replace(temp, "");
- i = -1; // i=-1是因为要重新查找, 而要先执行循环中的i++
- }
- }
-
- // 当前循环完毕,词的末尾截去一个字,继续循环, 直到词变为空
- if(!"".equals(this.input)) {
- this.input = this.input.substring(0,this.input.length()-1);
- this.start();
- }
-
- //出栈
- while (!stack.isEmpty()) {
- stackOut.add(stack.pop().toString());
- }
- return stackOut;
- }
-
- //判断当前词是否在词典中
- private boolean isInDictionary(String temp) {
- for (String s : this.dictionary) {
- if (temp.equals(s)) {
- return true;
- }
- }
- return false;
- }
- }
keyWord.txt 为关键词词库文件，放在项目 resources 文件夹下，关键词之间用英文逗号分隔。注意：代码中是按 "keyWord" 这个名称从类路径加载资源的，文件名需要与代码中的名称保持一致。测试代码如下：
public static void main(String[] args) throws IOException { String input = "我是大连理工大学一名一年级的学生,今年夏天就进入二年级了!"; // 要匹配的字符串 List<String> stack = new Split(input).start(); for (String s:stack ) { System.out.println(s); } }
结果 :
大连
理工大学
一名
学生
二年级
Disconnected from the target VM, address: '127.0.0.1:63885', transport: 'socket'
Process finished with exit code 0
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。