赞
踩
目录
命名查找器(Name Finder)可以检测文本中的命名实体和数字。要检测实体,命名查找器需要一个模型,而模型取决于训练它时所用的语言和实体类型。OpenNLP项目提供了许多预训练的命名查找模型,这些模型是在各种免费语料库上训练得到的,可以在OpenNLP的模型下载页面下载。要在原始文本中查找名称,必须先将文本切分成句子,再将句子切分成词元(token)。
默认情况下,训练数据每行一个句子,句子内是经过tokenizer分词、以空格分隔的词语。命名实体用 <START:类型> … <END> 标签标出其范围(span);输入中的空行表示一篇文档结束。官方建议训练一个模型至少需要15000个句子。例如:
<START:person> Pierre Vinken <END> , 61 years old , will join the board as a nonexecutive director Nov. 29 .
Mr . <START:person> Vinken <END> is chairman of Elsevier N.V. , the Dutch publishing group .
OpenNLP定义了一个默认的特征生成器,在没有指定自定义特征生成器时使用;用户也可以通过API自定义特征生成器。
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.namefind.NameSampleDataStream;
import opennlp.tools.namefind.TokenNameFinderEvaluator;
import opennlp.tools.namefind.TokenNameFinderFactory;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.eval.FMeasure;
-
- public class NameFinderTrain {
-
- public static void main(String[] args) throws IOException {
-
- // TODO Auto-generated method stub
-
- String rootDir = System.getProperty("user.dir") + File.separator;
-
- String fileResourcesDir = rootDir + "resources" + File.separator;
-
- String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;
-
- //训练数据的路径
-
- String filePath = fileResourcesDir + "naneFinder.txt";
-
- //训练后模型的保存路径
-
- String modelPath = modelResourcesDir + "en-ner-person-my.bin";
-
-
- //按行读取数据
-
- InputStreamFactory inputStreamFactory = new MarkableFileInputStreamFactory(new File(filePath));
-
- ObjectStream<String> lineStream = new PlainTextByLineStream(inputStreamFactory, StandardCharsets.UTF_8);
-
- //按行读取数据
-
- ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream);
-
- TokenNameFinderFactory factory =new TokenNameFinderFactory();
-
- //训练模型
-
- TokenNameFinderModel model =NameFinderME.train("en","person", sampleStream, TrainingParameters.defaultParams(), factory);
-
- //保存模型
-
- FileOutputStream fos=new FileOutputStream(new File(modelPath));
-
- OutputStream modelOut = new BufferedOutputStream(fos);
-
- model.serialize(modelOut);
-
-
- //评估模型
-
- TokenNameFinderEvaluator evaluator = new TokenNameFinderEvaluator(new NameFinderME(model));
-
- evaluator.evaluate(sampleStream);
-
-
-
- FMeasure result = evaluator.getFMeasure();
-
- System.out.println(result.toString());
-
- }
-
- }
- import java.io.File;
-
- import java.io.FileInputStream;
-
- import java.io.IOException;
-
- import java.io.InputStream;
-
- import opennlp.tools.namefind.NameFinderME;
-
- import opennlp.tools.namefind.TokenNameFinderModel;
-
- import opennlp.tools.util.Span;
-
-
-
- public class NameFinderPredit {
-
-
-
- public static void main(String[] args) throws IOException {
-
- // TODO Auto-generated method stub
-
- String rootDir = System.getProperty("user.dir") + File.separator;
-
-
-
- String fileResourcesDir = rootDir + "resources" + File.separator;
-
- String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;
-
-
-
- //String filePath = fileResourcesDir + "sentenceDetector.txt";
-
- String modelPath = modelResourcesDir + "en-ner-person.bin";
-
-
-
- InputStream modelIn = new FileInputStream(modelPath) ;
-
- //加载模型
-
- TokenNameFinderModel model = new TokenNameFinderModel(modelIn);
-
- //实例化模型
-
- NameFinderME nameFinder = new NameFinderME(model);
-
-
-
- String tokens[] = new String[]{
-
- "Vinken",
-
- "is",
-
- "61",
-
- "years",
-
- "old",
-
- "Pierre",
-
- ".",
-
- "Pierre",
-
- };
-
- //命名检测
-
- //Span 保存表示命名实体在tokens中的位置
-
- Span[] nameFinds= nameFinder.find(tokens);
-
-
-
- for(Span str:nameFinds){
-
- System.out.println("type:"+str.getType()+";Tostring:"+str.toString()+";length:"+nameFinds.length+"start:"+str.getStart()+";end:"+str.getEnd()+";name:"+tokens[str.getStart()]);
-
- }
-
- }
-
- }
输出:
type:person;Tostring:[5..6) person;length:2start:5;end:6;name:Pierre
type:person;Tostring:[7..8) person;length:2start:7;end:8;name:Pierre
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。