赞
踩
词形还原(lemmatization):根据词性标注(POS tag),把单词还原为它的词元(lemma,即词典中的原形)。例如:
输入:
Rockwell_NNP International_NNP Corp._NNP 's_POS Tulsa_NNP unit_NN said_VBD it_PRP
或者,输入三列,第一列为原词,第二列为词性标注,第三列为lemma的词形
He PRP he
reckons VBZ reckon
the DT the
current JJ current
accounts NNS account
deficit NN deficit
will MD will
narrow VB narrow
to TO to
only RB only
# # #
1.8 CD 1.8
millions CD million
in IN in
September NNP september
. . O
输出:
Rockwell NNP rockwell
International NNP international
Corp. NNP corp.
's POS 's
Tulsa NNP tulsa
unit NN unit
said VBD say
it PRP it
```java import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.nio.charset.StandardCharsets; import opennlp.tools.lemmatizer.LemmaSample; import opennlp.tools.lemmatizer.LemmaSampleStream; import opennlp.tools.lemmatizer.LemmatizerEvaluator; import opennlp.tools.lemmatizer.LemmatizerFactory; import opennlp.tools.lemmatizer.LemmatizerME; import opennlp.tools.lemmatizer.LemmatizerModel; import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.MarkableFileInputStreamFactory; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.PlainTextByLineStream; import opennlp.tools.util.TrainingParameters; public class LemmatizerTrain { public static void main(String[] args) throws IOException { // TODO Auto-generated method stub String rootDir = System.getProperty("user.dir") + File.separator; String fileResourcesDir = rootDir + "resources" + File.separator; String modelResourcesDir = rootDir + "opennlpmodel" + File.separator; //训练数据的路径 String filePath = fileResourcesDir + "lemmatizer.txt"; //训练后模型的保存路径 String modelPath = modelResourcesDir + "lemmatizer-my.bin"; //按行读取数据 InputStreamFactory inputStreamFactory = new MarkableFileInputStreamFactory(new File(filePath)); ObjectStream<String> lineStream = new PlainTextByLineStream(inputStreamFactory, StandardCharsets.UTF_8); //按行读取数据 ObjectStream<LemmaSample> sampleStream = new LemmaSampleStream(lineStream); LemmatizerFactory factory =new LemmatizerFactory(); //训练模型 LemmatizerModel model =LemmatizerME.train("en",sampleStream, TrainingParameters.defaultParams(),factory); //保存模型 FileOutputStream fos=new FileOutputStream(new File(modelPath)); OutputStream modelOut = new BufferedOutputStream(fos); model.serialize(modelOut); //评估模型 LemmatizerEvaluator evaluator=new LemmatizerEvaluator(new LemmatizerME(model)); evaluator.evaluate(sampleStream); System.out.println("正确的词数:"+ evaluator.getWordAccuracy()); } }
```java import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import opennlp.tools.lemmatizer.LemmatizerME; import opennlp.tools.lemmatizer.LemmatizerModel; public class LemmatizerPredit { public static void main(String[] args) throws IOException { // TODO Auto-generated method stub String rootDir = System.getProperty("user.dir") + File.separator; String fileResourcesDir = rootDir + "resources" + File.separator; String modelResourcesDir = rootDir + "opennlpmodel" + File.separator; //String filePath = fileResourcesDir + "sentenceDetector.txt"; String modelPath = modelResourcesDir + "lemmatizer-my.bin"; InputStream modelIn = new FileInputStream(modelPath) ; //加载模型 LemmatizerModel model = new LemmatizerModel(modelIn); //实例化模型 LemmatizerME lemmatizer = new LemmatizerME(model); //词形还原 String[] tokens = new String[] { "Rockwell", "International", "Corp.", "'s", "Tulsa", "unit", "said", "it", "signed", "a", "tentative", "agreement", "extending", "its", "contract", "with", "Boeing", "Co.", "to", "provide", "structural", "parts", "for", "Boeing", "'s", "747", "jetliners", "." }; String[] postags = new String[] { "NNP", "NNP", "NNP", "POS", "NNP", "NN", "VBD", "PRP", "VBD", "DT", "JJ", "NN", "VBG", "PRP$", "NN", "IN", "NNP", "NNP", "TO", "VB", "JJ", "NNS", "IN", "NNP", "POS", "CD", "NNS", "." }; String[] lemmas =lemmatizer.lemmatize(tokens, postags); for(String str:lemmas){ System.out.println(str); } } }
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。