GitHub: https://github.com/Zhifa-Liu/EmotionClassDemo
Download links for two of the datasets, the ChnSentiCorp Chinese sentiment corpus (Tan Songbo) and the mini version of the Sogou text classification corpus, can be found at the address below; the remaining one can be found with a quick Baidu search:
https://github.com/hankcs/HanLP/wiki/%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E4%B8%8E%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90#%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90
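Neither code listing shows the dataset on disk. Judging from the loaders below (Alink reads the file with schema "label int, review string" and skips the first line, while the HanLP branch takes the first character of each line as the label), the Weibo comment CSV is assumed to look like this; the two data rows are made up for illustration:

    label,review
    1,哇哦今年的春夏季衣服不错诶
    0,酒店太差了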
Weibo comment sentiment classification implemented with Alink's NaiveBayesTextClassifier / LogisticRegression pipelines:

package cn.edu.neu.alink;

import cn.edu.neu.alink.cons.ClassifierConstant;
import com.alibaba.alink.operator.batch.BatchOperator;
import com.alibaba.alink.operator.batch.source.CsvSourceBatchOp;
import com.alibaba.alink.operator.batch.source.TextSourceBatchOp;
import com.alibaba.alink.pipeline.LocalPredictor;
import com.alibaba.alink.pipeline.Pipeline;
import com.alibaba.alink.pipeline.PipelineModel;
import com.alibaba.alink.pipeline.classification.LogisticRegression;
import com.alibaba.alink.pipeline.classification.NaiveBayesTextClassifier;
import com.alibaba.alink.pipeline.dataproc.Imputer;
import com.alibaba.alink.pipeline.nlp.DocCountVectorizer;
import com.alibaba.alink.pipeline.nlp.Segment;
import com.alibaba.alink.pipeline.nlp.StopWordsRemover;
import org.apache.flink.types.Row;

import java.io.File;
import java.util.List;

/**
 * @author 32098
 */
public class CommentClassifier {
    private static PipelineModel pipelineModel;

    public static void initNaiveBayesModel() {
        // Load a previously saved naive Bayes model if one exists
        pipelineModel = PipelineModel.load(ClassifierConstant.WEIBO_NB_MODEL_PATH);
        if (pipelineModel == null) {
            System.err.println("Failed to load model...");
            System.out.println("Building model...");
            BatchOperator<?> sourceBatchOp = getCommentSourceOp();
            Pipeline pipeline = new Pipeline(
                    // Fill missing values with the string "null"
                    new Imputer().setSelectedCols("review").setOutputCols("featureText").setStrategy("value").setFillValue("null"),
                    // Chinese word segmentation
                    new Segment().setSelectedCol("featureText"),
                    // Remove stop words
                    new StopWordsRemover().setSelectedCol("featureText"),
                    /*
                     * TF, Term Frequency: the type of feature vector to generate
                     * https://www.yuque.com/pinshu/alink_doc/7a529b8564228c01c31f2fa58c43f782
                     */
                    new DocCountVectorizer().setFeatureType("TF").setSelectedCol("featureText").setOutputCol("featureVector"),
                    new NaiveBayesTextClassifier().setVectorCol("featureVector").setLabelCol("label").setPredictionCol("pred")
            );
            pipelineModel = pipeline.fit(sourceBatchOp);
            pipelineModel.save(ClassifierConstant.WEIBO_NB_MODEL_PATH);
            try {
                // save() only attaches the model to a sink; BatchOperator.execute() actually writes it out
                BatchOperator.execute();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        System.out.println("Model ready!");
    }

    public static void initLogisticRegressionModel() {
        pipelineModel = PipelineModel.load(ClassifierConstant.WEIBO_LR_MODEL_PATH);
        if (pipelineModel == null) {
            System.err.println("Failed to load model...");
            System.out.println("Building model...");
            BatchOperator<?> sourceBatchOp = getCommentSourceOp();
            Pipeline pipeline = new Pipeline(
                    // Same preprocessing as above, with logistic regression as the classifier
                    new Imputer().setSelectedCols("review").setOutputCols("featureText").setStrategy("value").setFillValue("null"),
                    new Segment().setSelectedCol("featureText"),
                    new StopWordsRemover().setSelectedCol("featureText"),
                    new DocCountVectorizer().setFeatureType("TF").setSelectedCol("featureText").setOutputCol("featureVector"),
                    new LogisticRegression().setVectorCol("featureVector").setLabelCol("label").setPredictionCol("pred")
            );
            pipelineModel = pipeline.fit(sourceBatchOp);
            pipelineModel.save(ClassifierConstant.WEIBO_LR_MODEL_PATH);
            try {
                // save() only attaches the model to a sink; BatchOperator.execute() actually writes it out
                BatchOperator.execute();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        System.out.println("Model ready!");
    }

    private static BatchOperator<?> getCommentSourceOp() {
        return new CsvSourceBatchOp()
                .setFilePath(ClassifierConstant.DATASET_WEIBO_PATH)
                .setSchemaStr("label int, review string")
                .setIgnoreFirstLine(true);
    }

    public static String getClassification(String text) {
        if (pipelineModel == null) {
            System.err.println("getClassification(String text) was called before initNaiveBayesModel() or initLogisticRegressionModel();\n" +
                    "calling initNaiveBayesModel() to initialize the inner pipelineModel before classifying your text");
            initNaiveBayesModel();
        }
        try {
            // https://blog.csdn.net/Alink1024/article/details/107813310
            LocalPredictor localPredictor = pipelineModel.collectLocalPredictor("review string");
            // System.out.print(localPredictor.getOutputSchema());
            Row row = Row.of(text);
            // Output row is (review, featureText, featureVector, pred); field 3 is the predicted label
            return String.valueOf(localPredictor.map(row).getField(3));
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    public static void main(String[] args) throws Exception {
        // Batch-transform alternative, kept for reference; we use LocalPredictor instead:
        // initNaiveBayesModel();
        // System.out.println("------------------------------");
        // TextSourceBatchOp textSourceBatchOp1 = new TextSourceBatchOp()
        //         .setFilePath(System.getProperty("user.dir")+"/src/main/java/cn/edu/neu/zoom/data/neg.txt".replace("/", File.separator))
        //         .setTextCol("review");
        // pipelineModel.transform(textSourceBatchOp1).select(new String[]{"label", "pred", "review"}).sampleWithSize(20).print();
        //
        // initLogisticRegressionModel();
        // System.out.println("------------------------------");
        // TextSourceBatchOp textSourceBatchOp2 = new TextSourceBatchOp()
        //         .setFilePath(System.getProperty("user.dir")+"/src/main/java/cn/edu/neu/zoom/data/pos.txt".replace("/", File.separator))
        //         .setTextCol("review");
        // pipelineModel.transform(textSourceBatchOp2).select(new String[]{"label", "pred", "review"}).sampleWithSize(20).print();

        System.out.println(getClassification("你真好"));
        System.out.println(getClassification("哇哦今年的春夏季衣服不错诶"));

        TextSourceBatchOp textSourceBatchOp1 = new TextSourceBatchOp()
                .setFilePath(System.getProperty("user.dir") + "/src/main/java/cn/edu/neu/zoom/data/neg.txt".replace("/", File.separator))
                .setTextCol("review");
        TextSourceBatchOp textSourceBatchOp2 = new TextSourceBatchOp()
                .setFilePath(System.getProperty("user.dir") + "/src/main/java/cn/edu/neu/zoom/data/pos.txt".replace("/", File.separator))
                .setTextCol("review");
        List<Row> negRows = textSourceBatchOp1.getDataSet().collect();
        List<Row> posRows = textSourceBatchOp2.getDataSet().collect();
        int acc = 0;
        for (Row negRow : negRows) {
            // expected to be 0
            String text = getClassification((String) negRow.getField(0));
            System.out.println(text);
            if ("0".equals(text)) {
                acc += 1;
            }
        }
        for (Row posRow : posRows) {
            // expected to be 1
            String text = getClassification((String) posRow.getField(0));
            System.out.println(text);
            if ("1".equals(text)) {
                acc += 1;
            }
        }
        System.out.println("Acc: " + (double) acc / (negRows.size() + posRows.size()));
    }
}
This classification feels a bit slow!
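A likely reason, judging from the code above rather than from profiling: getClassification() calls pipelineModel.collectLocalPredictor(...) on every invocation, so the local predictor is rebuilt for each sentence. A minimal sketch of caching it in a field instead; the field and method names here are mine, not from the repository:

    private static LocalPredictor cachedPredictor;

    public static String getClassificationFast(String text) {
        try {
            if (cachedPredictor == null) {
                // Build the local predictor once and reuse it for every later call
                cachedPredictor = pipelineModel.collectLocalPredictor("review string");
            }
            // Field 3 of the output row (review, featureText, featureVector, pred) is the predicted label
            return String.valueOf(cachedPredictor.map(Row.of(text)).getField(3));
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }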
HanLP GitHub: https://github.com/hankcs/HanLP/tree/doc-zh
HanLP Chinese sentiment analysis: https://github.com/hankcs/HanLP/wiki/%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E4%B8%8E%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90#%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90
Weibo comment sentiment analysis, hotel review sentiment analysis, and text classification implemented with HanLP's NaiveBayesClassifier and HanLPTokenizer:
package cn.edu.neu.hanlp;

import cn.edu.neu.hanlp.cons.ClassifierConstant;
import com.hankcs.hanlp.classification.classifiers.AbstractClassifier;
import com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier;
import com.hankcs.hanlp.classification.corpus.FileDataSet;
import com.hankcs.hanlp.classification.corpus.IDataSet;
import com.hankcs.hanlp.classification.models.AbstractModel;
import com.hankcs.hanlp.classification.models.NaiveBayesModel;
import com.hankcs.hanlp.classification.tokenizers.HanLPTokenizer;

import java.io.*;
import java.util.Map;

/**
 * @author 32098
 */
public class HanLpClassifier {
    private static AbstractClassifier classifier = null;

    /**
     * @param dataPath  path to the training data (a CSV file or a directory of category folders)
     * @param modelPath path the serialized model is loaded from and saved to
     */
    public static void initClassifier(String dataPath, String modelPath) {
        AbstractModel model = loadModel(modelPath);
        if (model == null) {
            System.out.println("No model found, begin training model!");
            IDataSet dataSet = null;
            try {
                System.out.println(dataPath);
                File f = new File(dataPath);
                if (f.isFile()) {
                    // A single CSV file: each line is "label,review"
                    BufferedReader reader = new BufferedReader(new FileReader(dataPath));
                    String str;
                    dataSet = new FileDataSet().setTokenizer(new HanLPTokenizer());
                    System.out.println("Preparing dataset!");
                    // skip the header line
                    str = reader.readLine();
                    while ((str = reader.readLine()) != null) {
                        dataSet.add(str.substring(0, 1), str.substring(2));
                    }
                } else {
                    // A directory: one sub-folder per category
                    dataSet = new FileDataSet().setTokenizer(new HanLPTokenizer()).load(dataPath, "UTF-8");
                }
                System.out.println("Dataset prepared!");
            } catch (IOException e) {
                e.printStackTrace();
            }
            classifier = new NaiveBayesClassifier();
            classifier.train(dataSet);
            model = classifier.getModel();
            saveModel(modelPath, model);
        } else {
            System.out.println("NaiveBayesModel init succeeded!");
            classifier = new NaiveBayesClassifier((NaiveBayesModel) model);
        }
    }

    private static void saveModel(String modelPath, AbstractModel model) {
        try (ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(modelPath))) {
            oos.writeObject(model);
            System.out.println("Save NaiveBayesModel succeeded!");
        } catch (Exception e) {
            System.err.println("Save NaiveBayesModel failed!");
            System.err.println(e.getMessage());
        }
    }

    private static AbstractModel loadModel(String modelPath) {
        try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(modelPath))) {
            Object o = ois.readObject();
            return (AbstractModel) o;
        } catch (FileNotFoundException e) {
            System.err.println("Load NaiveBayesModel failed (NaiveBayesModel file " + modelPath + " not found!)");
        } catch (Exception e) {
            System.err.println(e.getMessage());
        }
        return null;
    }

    public static Double getScoreOfWeiboComment(String sentence) {
        if (classifier == null) {
            System.err.println("Classifier is null, using the Weibo comment data to init the classifier by default");
            System.out.println("To init the classifier with different data, call initClassifier first");
            initClassifier(ClassifierConstant.DATASET_WEIBO_PATH, ClassifierConstant.WEIBO_MODEL_PATH);
        }
        // predict() returns a score per category; the difference gives a signed sentiment score
        Map<String, Double> map = classifier.predict(sentence);
        return map.get("1") - map.get("0");
    }

    public static String getClassification(String sentence) {
        if (classifier == null) {
            System.err.println("Classifier is null, using the Weibo comment data to init the classifier by default");
            System.out.println("To init the classifier with different data, call initClassifier first");
            initClassifier(ClassifierConstant.DATASET_WEIBO_PATH, ClassifierConstant.WEIBO_MODEL_PATH);
        }
        // System.out.println(classifier.predict(sentence));
        return classifier.classify(sentence);
    }
}
package cn.edu.neu.hanlp;

import cn.edu.neu.hanlp.cons.ClassifierConstant;

/**
 * @author 32098
 *
 * Sentiment classification and Chinese text classification
 */
public class Test {
    public static void main(String[] args) {
        // Weibo comment sentiment
        HanLpClassifier.initClassifier(ClassifierConstant.DATASET_WEIBO_PATH, ClassifierConstant.WEIBO_MODEL_PATH);
        System.out.println(HanLpClassifier.getClassification("天安门"));
        System.out.println(HanLpClassifier.getClassification("哇哦今年的春夏季衣服不错诶"));
        System.out.println(HanLpClassifier.getClassification("去死吧"));
        System.out.println(HanLpClassifier.getClassification("加油"));
        System.out.println(HanLpClassifier.getClassification("你真好"));
        System.out.println(HanLpClassifier.getScoreOfWeiboComment("你真好"));

        // Hotel review sentiment (ChnSentiCorp)
        HanLpClassifier.initClassifier(ClassifierConstant.DATASET_HOTEL_PATH, ClassifierConstant.HOTEL_MODEL_PATH);
        System.out.println(HanLpClassifier.getClassification("酒店太差了"));

        // Text classification (Sogou mini corpus)
        HanLpClassifier.initClassifier(ClassifierConstant.DATASET_SOUGOU_PATH, ClassifierConstant.SOUGOU_MODEL_PATH);
        System.out.println(HanLpClassifier.getClassification("篮球、羽毛球"));
    }
}
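The ClassifierConstant classes imported above are not shown in this excerpt. A minimal sketch of the HanLP-side one so the listings are self-contained; every path value below is a hypothetical placeholder, and the actual values live in the GitHub repository linked at the top:

    package cn.edu.neu.hanlp.cons;

    /**
     * Path constants used by HanLpClassifier and Test.
     * The values here are illustrative placeholders, not the repository's real paths.
     */
    public class ClassifierConstant {
        public static final String DATASET_WEIBO_PATH = "data/weibo_senti_100k.csv";
        public static final String WEIBO_MODEL_PATH = "model/weibo-naive-bayes.model";
        public static final String DATASET_HOTEL_PATH = "data/ChnSentiCorp";
        public static final String HOTEL_MODEL_PATH = "model/hotel-naive-bayes.model";
        public static final String DATASET_SOUGOU_PATH = "data/sogou-mini";
        public static final String SOUGOU_MODEL_PATH = "model/sogou-naive-bayes.model";
    }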
Run results:
Omitted for now.