HanLP is a natural language processing library; I won't introduce it at length here. Since you may need to put the resource files HanLP provides (the dictionaries) under a Spring Boot project's resources directory, I'm recording the approach that worked for me.
I won't cover adding the dependency or unzipping the dictionary files. First, take a look at the configuration file that ships with HanLP.
The file is named hanlp.properties:
# Bigram (2-gram) dictionary path
BiGramDictionaryPath=data/dictionary/CoreNatureDictionary.ngram.txt
# Stopword dictionary path
CoreStopWordDictionaryPath=data/dictionary/stopwords.txt
# Synonym dictionary path
CoreSynonymDictionaryDictionaryPath=data/dictionary/synonym/CoreSynonym.txt
# Person-name dictionary path
PersonDictionaryPath=data/dictionary/person/nr.txt
# Person-name transition-matrix path
PersonDictionaryTrPath=data/dictionary/person/nr.tr.txt
# Traditional/Simplified Chinese dictionary root
tcDictionaryRoot=data/dictionary/tc
# Custom dictionary paths, separated by ";". An entry starting with a space lives in the same directory
# as the previous entry; the form "filename nature" makes that nature the dictionary's default part of speech.
# Priority decreases from left to right.
# Note: data/dictionary/custom/CustomDictionary.txt is a high-quality dictionary, please do not delete it.
# All dictionaries must be UTF-8 encoded.
CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; 现代汉语补充词库.txt; 全国地名大全.txt ns; 人名词典.txt; 机构名词典.txt; user_dic.txt; 上海地名.txt ns;data/dictionary/person/nrf.txt nrf;
# CRF segmentation model path
CRFSegmentModelPath=data/model/segment/CRFSegmentModel.txt
# HMM segmentation model
HMMSegmentModelPath=data/model/segment/HMMSegmentModel.bin
# Whether segmentation results show the part of speech
ShowTermNature=true
# IO adapter: implement com.hankcs.hanlp.corpus.io.IIOAdapter to run HanLP on other platforms (Hadoop, Redis, etc.)
# The default IO adapter below is based on the ordinary file system.
#IOAdapter=com.hankcs.hanlp.corpus.io.FileIOAdapter
# Perceptron lexical analyzer
PerceptronCWSModelPath=data/model/perceptron/pku199801/cws.bin
PerceptronPOSModelPath=data/model/perceptron/pku199801/pos.bin
PerceptronNERModelPath=data/model/perceptron/pku199801/ner.bin
# CRF lexical analyzer
CRFCWSModelPath=data/model/crf/pku199801/cws.bin
CRFPOSModelPath=data/model/crf/pku199801/pos.bin
CRFNERModelPath=data/model/crf/pku199801/ner.bin
# For more options, see https://github.com/hankcs/HanLP/blob/master/src/main/java/com/hankcs/hanlp/HanLP.java#L59 and add them yourself
Next, there is one place that deserves attention.
Inside the HanLP class (more precisely, its nested Config class):
public static void init(String rootPath) {
    Properties p = new Properties();
    String prePath;
    int i;
    int lastSplash;
    try {
        ClassLoader loader = Thread.currentThread().getContextClassLoader();
        if (loader == null) {
            loader = HanLP.Config.class.getClassLoader();
        }
        p.load(new InputStreamReader((InputStream)(Predefine.HANLP_PROPERTIES_PATH == null
                ? loader.getResourceAsStream("hanlp.properties")
                : new FileInputStream(Predefine.HANLP_PROPERTIES_PATH)), "UTF-8"));
        String root = p.getProperty("root", "");
        if (StringUtil.isEmpty(root)) {
            root = StringUtil.getString(rootPath);
        }
        System.out.println(root);
        root = root.replaceAll("\\\\", "/");
        if (root.length() > 0 && !root.endsWith("/")) {
            root = root + "/";
        }
        CoreDictionaryPath = root + p.getProperty("CoreDictionaryPath", CoreDictionaryPath);
        CoreDictionaryTransformMatrixDictionaryPath = root + p.getProperty("CoreDictionaryTransformMatrixDictionaryPath", CoreDictionaryTransformMatrixDictionaryPath);
        BiGramDictionaryPath = root + p.getProperty("BiGramDictionaryPath", BiGramDictionaryPath);
        CoreStopWordDictionaryPath = root + p.getProperty("CoreStopWordDictionaryPath", CoreStopWordDictionaryPath);
        CoreSynonymDictionaryDictionaryPath = root + p.getProperty("CoreSynonymDictionaryDictionaryPath", CoreSynonymDictionaryDictionaryPath);
        PersonDictionaryPath = root + p.getProperty("PersonDictionaryPath", PersonDictionaryPath);
        PersonDictionaryTrPath = root + p.getProperty("PersonDictionaryTrPath", PersonDictionaryTrPath);
        String[] pathArray = p.getProperty("CustomDictionaryPath", "data/dictionary/custom/CustomDictionary.txt").split(";");
        prePath = root;
        for (i = 0; i < pathArray.length; ++i) {
            if (pathArray[i].startsWith(" ")) {
                pathArray[i] = prePath + pathArray[i].trim();
            } else {
                pathArray[i] = root + pathArray[i];
                lastSplash = pathArray[i].lastIndexOf(47);
                if (lastSplash != -1) {
                    prePath = pathArray[i].substring(0, lastSplash + 1);
                }
            }
        }
        CustomDictionaryPath = pathArray;
        tcDictionaryRoot = root + p.getProperty("tcDictionaryRoot", tcDictionaryRoot);
        if (!tcDictionaryRoot.endsWith("/")) {
            tcDictionaryRoot = tcDictionaryRoot + '/';
        }
        PinyinDictionaryPath = root + p.getProperty("PinyinDictionaryPath", PinyinDictionaryPath);
        TranslatedPersonDictionaryPath = root + p.getProperty("TranslatedPersonDictionaryPath", TranslatedPersonDictionaryPath);
        JapanesePersonDictionaryPath = root + p.getProperty("JapanesePersonDictionaryPath", JapanesePersonDictionaryPath);
        PlaceDictionaryPath = root + p.getProperty("PlaceDictionaryPath", PlaceDictionaryPath);
        PlaceDictionaryTrPath = root + p.getProperty("PlaceDictionaryTrPath", PlaceDictionaryTrPath);
        OrganizationDictionaryPath = root + p.getProperty("OrganizationDictionaryPath", OrganizationDictionaryPath);
        OrganizationDictionaryTrPath = root + p.getProperty("OrganizationDictionaryTrPath", OrganizationDictionaryTrPath);
        CharTypePath = root + p.getProperty("CharTypePath", CharTypePath);
        CharTablePath = root + p.getProperty("CharTablePath", CharTablePath);
        PartOfSpeechTagDictionary = root + p.getProperty("PartOfSpeechTagDictionary", PartOfSpeechTagDictionary);
        WordNatureModelPath = root + p.getProperty("WordNatureModelPath", WordNatureModelPath);
        MaxEntModelPath = root + p.getProperty("MaxEntModelPath", MaxEntModelPath);
        NNParserModelPath = root + p.getProperty("NNParserModelPath", NNParserModelPath);
        CRFSegmentModelPath = root + p.getProperty("CRFSegmentModelPath", CRFSegmentModelPath);
        CRFDependencyModelPath = root + p.getProperty("CRFDependencyModelPath", CRFDependencyModelPath);
        HMMSegmentModelPath = root + p.getProperty("HMMSegmentModelPath", HMMSegmentModelPath);
        CRFCWSModelPath = root + p.getProperty("CRFCWSModelPath", CRFCWSModelPath);
        CRFPOSModelPath = root + p.getProperty("CRFPOSModelPath", CRFPOSModelPath);
        CRFNERModelPath = root + p.getProperty("CRFNERModelPath", CRFNERModelPath);
        PerceptronCWSModelPath = root + p.getProperty("PerceptronCWSModelPath", PerceptronCWSModelPath);
        PerceptronPOSModelPath = root + p.getProperty("PerceptronPOSModelPath", PerceptronPOSModelPath);
        PerceptronNERModelPath = root + p.getProperty("PerceptronNERModelPath", PerceptronNERModelPath);
        ShowTermNature = "true".equals(p.getProperty("ShowTermNature", "true"));
        Normalization = "true".equals(p.getProperty("Normalization", "false"));
        String ioAdapterClassName = p.getProperty("IOAdapter");
        if (ioAdapterClassName != null) {
            try {
                Class<?> clazz = Class.forName(ioAdapterClassName);
                Constructor<?> ctor = clazz.getConstructor();
                Object instance = ctor.newInstance();
                if (instance != null) {
                    IOAdapter = (IIOAdapter) instance;
                }
            } catch (ClassNotFoundException var10) {
                Predefine.logger.warning(String.format("找不到IO适配器类: %s ,请检查第三方插件jar包", ioAdapterClassName));
            } catch (NoSuchMethodException var11) {
                Predefine.logger.warning(String.format("工厂类[%s]没有默认构造方法,不符合要求", ioAdapterClassName));
            } catch (SecurityException var12) {
                Predefine.logger.warning(String.format("工厂类[%s]默认构造方法无法访问,不符合要求", ioAdapterClassName));
            } catch (Exception var13) {
                Predefine.logger.warning(String.format("工厂类[%s]构造失败:%s\n", ioAdapterClassName, TextUtility.exceptionToString(var13)));
            }
        }
    } catch (Exception var14) {
        StringBuilder sbInfo = new StringBuilder("========Tips========\n请将hanlp.properties放在下列目录:\n");
        String classPath = (String) System.getProperties().get("java.class.path");
        if (classPath != null) {
            String[] var8;
            lastSplash = (var8 = classPath.split(File.pathSeparator)).length;
            for (i = 0; i < lastSplash; ++i) {
                prePath = var8[i];
                if ((new File(prePath)).isDirectory()) {
                    sbInfo.append(prePath).append('\n');
                }
            }
        }
        sbInfo.append("Web项目则请放到下列目录:\nWebapp/WEB-INF/lib\nWebapp/WEB-INF/classes\nAppserver/lib\nJRE/lib\n");
        sbInfo.append("并且编辑root=PARENT/path/to/your/data\n");
        sbInfo.append("现在HanLP将尝试从").append(System.getProperties().get("user.dir")).append("读取data……");
        Predefine.logger.severe("没有找到hanlp.properties,可能会导致找不到data\n" + sbInfo);
    }
}
This function takes a rootPath argument, i.e. the absolute path of the folder that holds the dictionaries. The root key in hanlp.properties takes precedence; only when it is empty does HanLP fall back to the rootPath argument. When both are empty, every dictionary path stays relative, which is exactly what we need for resolving the files under resources.
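To make the effect concrete, here is a tiny illustration (my own, not library code) of the concatenation init performs for each entry:

// root= is left empty in hanlp.properties and we call HanLP.Config.init("")
String root = "";
String coreDictionaryPath = root + "data/dictionary/CoreNatureDictionary.txt";
// -> "data/dictionary/CoreNatureDictionary.txt": a relative path that is later handed to
//    IOAdapter.open(path), so a classpath-based adapter can resolve it under src/main/resources.
//    With root=D:/hanlp/ it would instead become an absolute file-system path.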
The Config class inside HanLP holds the configurable paths; these defaults are used whenever a key is missing from hanlp.properties:
public static final class Config {
    public static boolean DEBUG = false;
    public static String CoreDictionaryPath = "data/dictionary/CoreNatureDictionary.txt";
    public static String CoreDictionaryTransformMatrixDictionaryPath = "data/dictionary/CoreNatureDictionary.tr.txt";
    public static String[] CustomDictionaryPath = new String[]{"data/dictionary/custom/CustomDictionary.txt"};
    public static String BiGramDictionaryPath = "data/dictionary/CoreNatureDictionary.ngram.txt";
    public static String CoreStopWordDictionaryPath = "data/dictionary/stopwords.txt";
    public static String CoreSynonymDictionaryDictionaryPath = "data/dictionary/synonym/CoreSynonym.txt";
    public static String PersonDictionaryPath = "data/dictionary/person/nr.txt";
    public static String PersonDictionaryTrPath = "data/dictionary/person/nr.tr.txt";
    public static String PlaceDictionaryPath = "data/dictionary/place/ns.txt";
    public static String PlaceDictionaryTrPath = "data/dictionary/place/ns.tr.txt";
    public static String OrganizationDictionaryPath = "data/dictionary/organization/nt.txt";
    public static String OrganizationDictionaryTrPath = "data/dictionary/organization/nt.tr.txt";
    public static String tcDictionaryRoot = "data/dictionary/tc/";
    public static String PinyinDictionaryPath = "data/dictionary/pinyin/pinyin.txt";
    public static String TranslatedPersonDictionaryPath = "data/dictionary/person/nrf.txt";
    public static String JapanesePersonDictionaryPath = "data/dictionary/person/nrj.txt";
    public static String CharTypePath = "data/dictionary/other/CharType.bin";
    public static String CharTablePath = "data/dictionary/other/CharTable.txt";
    public static String PartOfSpeechTagDictionary = "data/dictionary/other/TagPKU98.csv";
    public static String WordNatureModelPath = "data/model/dependency/WordNature.txt";
    public static String MaxEntModelPath = "data/model/dependency/MaxEntModel.txt";
    public static String NNParserModelPath = "data/model/dependency/NNParserModel.txt";
    public static String CRFSegmentModelPath = "data/model/segment/CRFSegmentModel.txt";
    public static String HMMSegmentModelPath = "data/model/segment/HMMSegmentModel.bin";
    /** @deprecated */
    public static String CRFDependencyModelPath = "data/model/dependency/CRFDependencyModelMini.txt";
    public static String CRFCWSModelPath = "data/model/crf/pku199801/cws.bin";
    public static String CRFPOSModelPath = "data/model/crf/pku199801/pos.bin";
    public static String CRFNERModelPath = "data/model/crf/pku199801/ner.bin";
    public static String PerceptronCWSModelPath = "data/model/perceptron/pku199801/cws.bin";
    public static String PerceptronPOSModelPath = "data/model/perceptron/pku199801/pos.bin";
    public static String PerceptronNERModelPath = "data/model/perceptron/pku199801/ner.bin";
    public static boolean ShowTermNature = true;
    public static boolean Normalization = false;
    public static IIOAdapter IOAdapter;

    public Config() {
    }
}
With this in mind, we need to implement the IIOAdapter interface and override its two methods:
import com.hankcs.hanlp.corpus.io.IIOAdapter;
import org.springframework.core.io.ClassPathResource;

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

/**
 * @Description: Reads HanLP's static resources (dictionaries) from the classpath
 * @author: Amethyst
 * @date: 2019-11-28 11:12
 */
public class HanLPResourcesAdapter implements IIOAdapter {

    @Override
    public InputStream open(String path) throws IOException {
        // Resolve the relative dictionary path against the classpath (src/main/resources)
        ClassPathResource resource = new ClassPathResource(path);
        return new FileInputStream(resource.getFile());
    }

    @Override
    public OutputStream create(String path) throws IOException {
        // HanLP uses this to write its binary caches next to the dictionaries
        ClassPathResource resource = new ClassPathResource(path);
        return new FileOutputStream(resource.getFile());
    }
}
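Note that ClassPathResource.getFile() only works while the resources exist as real files on disk (for example when running from the IDE or an exploded build); inside a packaged fat jar it throws FileNotFoundException. If that becomes a problem, a variant along the following lines can be used instead. This is only a sketch, not the author's tested code; the class name HanLPJarSafeAdapter and the temp-directory choice are my own:

import com.hankcs.hanlp.corpus.io.IIOAdapter;
import org.springframework.core.io.ClassPathResource;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

public class HanLPJarSafeAdapter implements IIOAdapter {

    @Override
    public InputStream open(String path) throws IOException {
        // getInputStream() works both for exploded resources and for entries packed inside the jar
        return new ClassPathResource(path).getInputStream();
    }

    @Override
    public OutputStream create(String path) throws IOException {
        // Resources inside a jar are read-only, so write HanLP's binary caches to a temp directory
        return new FileOutputStream(new File(System.getProperty("java.io.tmpdir"), new File(path).getName()));
    }
}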
Besides overriding these two methods, we also need to make two changes to hanlp.properties:
# Set the root path to empty
root=
# Register our own adapter class as the IOAdapter
IOAdapter=x.x.HanLPResourcesAdapter
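For reference, assuming the data package was unzipped straight into src/main/resources, the project ends up with roughly the following layout (the file names below are illustrative, taken from the default paths above):

src/main/resources/
    hanlp.properties
    data/
        dictionary/
            CoreNatureDictionary.txt
            CoreNatureDictionary.ngram.txt
            stopwords.txt
            custom/CustomDictionary.txt
            ...
        model/
            ...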
Finally, before using HanLP we need to initialize it once by calling the init function with an empty root path:
// Call this once when the application loads so the dictionaries are read and initialized
HanLP.Config.init("");
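For example, the initialization can be hooked into application startup. A minimal sketch, assuming a Spring Boot component (the class name HanLPInitializer and the sample sentence are placeholders of mine):

import com.hankcs.hanlp.HanLP;
import org.springframework.boot.CommandLineRunner;
import org.springframework.stereotype.Component;

@Component
public class HanLPInitializer implements CommandLineRunner {

    @Override
    public void run(String... args) {
        // Empty root: dictionary paths stay relative and are resolved by our HanLPResourcesAdapter
        HanLP.Config.init("");
        // Quick smoke test: segment a sentence and print the terms
        System.out.println(HanLP.segment("商品和服务"));
    }
}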