当前位置:   article > 正文

[NLP]OpenNLP命名实体识别(NameFinder)的使用_tokennamefindermodel

tokennamefindermodel

目录

 

Name Finder

模型训练

命名识别


Name Finder

命名查找器(Name Finder)可以检测文本中的命名实体和数字。要检测实体,命名查找器需要一个模型;模型取决于训练时所用的语言和实体类型。OpenNLP项目提供了许多预训练的命名查找模型,这些模型是在各种可免费获取的语料库上训练得到的,可以在OpenNLP的模型下载页面下载。要在原始文本中查找名称,必须先将文本切分为句子,再将句子切分为词元(token)。

默认情况下,训练数据每行一个句子,句子内是经过tokenizer分词、以空格分隔的词语。命名实体使用<START:类型>和<END>标签进行标注;输入中的空行表示一篇文档结束。官方建议训练一个模型至少需要15000个句子。如:

<START:person> Pierre Vinken <END> , 61 years old , will join the board as a nonexecutive director Nov. 29 .

Mr . <START:person> Vinken <END> is chairman of Elsevier N.V. , the Dutch publishing group .

OpenNLP定义了一个默认的特征生成器,在没有指定自定义特征生成器时使用;用户也可以通过API自定义特征生成器。

 

模型训练

  import java.io.BufferedOutputStream;
  import java.io.File;
  import java.io.FileOutputStream;
  import java.io.IOException;
  import java.io.OutputStream;
  import java.nio.charset.StandardCharsets;
  import opennlp.tools.namefind.NameFinderME;
  import opennlp.tools.namefind.NameSample;
  import opennlp.tools.namefind.NameSampleDataStream;
  import opennlp.tools.namefind.TokenNameFinderEvaluator;
  import opennlp.tools.namefind.TokenNameFinderFactory;
  import opennlp.tools.namefind.TokenNameFinderModel;
  import opennlp.tools.util.InputStreamFactory;
  import opennlp.tools.util.MarkableFileInputStreamFactory;
  import opennlp.tools.util.ObjectStream;
  import opennlp.tools.util.PlainTextByLineStream;
  import opennlp.tools.util.TrainingParameters;
  import opennlp.tools.util.eval.FMeasure;
  17. public class NameFinderTrain {
  18.     public static void main(String[] args) throws IOException {
  19.        // TODO Auto-generated method stub
  20.        String rootDir = System.getProperty("user.dir") + File.separator;
  21.        String fileResourcesDir = rootDir + "resources" + File.separator;
  22.        String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;      
  23.        //训练数据的路径
  24.         String filePath = fileResourcesDir + "naneFinder.txt";
  25.        //训练后模型的保存路径
  26.         String modelPath = modelResourcesDir + "en-ner-person-my.bin";
  27.          
  28.        //按行读取数据
  29.        InputStreamFactory inputStreamFactory = new MarkableFileInputStreamFactory(new File(filePath));
  30.        ObjectStream<String> lineStream = new PlainTextByLineStream(inputStreamFactory, StandardCharsets.UTF_8);      
  31.        //按行读取数据
  32.         ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream);
  33.          TokenNameFinderFactory factory =new TokenNameFinderFactory();
  34.        //训练模型
  35.        TokenNameFinderModel model =NameFinderME.train("en","person", sampleStream,  TrainingParameters.defaultParams(),  factory);        
  36.        //保存模型
  37.        FileOutputStream fos=new FileOutputStream(new File(modelPath));
  38.         OutputStream modelOut = new BufferedOutputStream(fos);
  39.         model.serialize(modelOut);
  40. //评估模型
  41.         TokenNameFinderEvaluator evaluator = new TokenNameFinderEvaluator(new NameFinderME(model));
  42.         evaluator.evaluate(sampleStream);
  43.         FMeasure result = evaluator.getFMeasure();
  44.         System.out.println(result.toString());
  45.     }
  46. }

 

命名识别

  1. import java.io.File;
  2. import java.io.FileInputStream;
  3. import java.io.IOException;
  4. import java.io.InputStream;
  5. import opennlp.tools.namefind.NameFinderME;
  6. import opennlp.tools.namefind.TokenNameFinderModel;
  7. import opennlp.tools.util.Span;
  8. public class NameFinderPredit {
  9.     public static void main(String[] args) throws IOException {
  10.        // TODO Auto-generated method stub
  11.        String rootDir = System.getProperty("user.dir") + File.separator;
  12.       
  13.        String fileResourcesDir = rootDir + "resources" + File.separator;
  14.        String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;
  15.       
  16.        //String filePath = fileResourcesDir + "sentenceDetector.txt";
  17.        String modelPath = modelResourcesDir + "en-ner-person.bin";
  18.        InputStream modelIn = new FileInputStream(modelPath) ;
  19.        //加载模型
  20.        TokenNameFinderModel model = new TokenNameFinderModel(modelIn);
  21.        //实例化模型
  22.        NameFinderME nameFinder  = new NameFinderME(model);
  23.       
  24.        String tokens[] = new String[]{
  25.                "Vinken",
  26.                "is",
  27.                "61",
  28.                "years",
  29.                "old",
  30.                "Pierre",
  31.                ".",
  32.                "Pierre",
  33.                };
  34.        //命名检测
  35. //Span 保存表示命名实体在tokens中的位置
  36.        Span[] nameFinds= nameFinder.find(tokens);
  37.       
  38.        for(Span str:nameFinds){
  39.        System.out.println("type:"+str.getType()+";Tostring:"+str.toString()+";length:"+nameFinds.length+"start:"+str.getStart()+";end:"+str.getEnd()+";name:"+tokens[str.getStart()]);
  40.        }
  41.     }
  42. }

输出:

type:person;Tostring:[5..6) person;length:2start:5;end:6;name:Pierre

type:person;Tostring:[7..8) person;length:2start:7;end:8;name:Pierre

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/羊村懒王/article/detail/358473
推荐阅读
相关标签
  

闽ICP备14008679号