Chinese Sentiment Classification (Text Classification) in Java: Alink, HanLP, and SparkML (TBD)

Text sentiment analysis experiments based on the Chinese sentiment corpus ChnSentiCorp (Tan Songbo)

1. Project Layout and Repository

GitHub: https://github.com/Zhifa-Liu/EmotionClassDemo

  • cn.edu.neu.alink: Alink Chinese sentiment analysis
  • cn.edu.neu.bayes: a naive Bayes sentiment classifier, lightly adapted from https://github.com/marwincn/pubsenti-finder; its results were unconvincing, so it is not covered here
  • cn.edu.neu.hanlp: HanLP Chinese sentiment analysis
  • cn.edu.neu.sparkml: SparkML Chinese sentiment analysis (TBD)
  • cn.edu.neu.zoom.data: datasets used for sentiment analysis (text classification)
    • ChnSentiCorp, the Chinese sentiment corpus by Tan Songbo
    • a mini version of the Sogou text classification corpus
    • the Weibo comment sentiment dataset: weibo_senti_100k.csv
  • cn.edu.neu.zoom.model: saved sentiment analysis models

Download links for ChnSentiCorp (Tan Songbo) and the mini Sogou corpus can be found at the address below; the Weibo dataset is easy to find with a quick web search:
https://github.com/hankcs/HanLP/wiki/%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E4%B8%8E%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90#%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90
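
The code in the next sections reads dataset and model paths from a small ClassifierConstant class in each package (cn.edu.neu.alink.cons and cn.edu.neu.hanlp.cons), which is not shown in this post. A minimal sketch of the Alink one follows; the path values are illustrative assumptions and may differ from the repository:

package cn.edu.neu.alink.cons;

import java.io.File;

/**
 * Path constants used by CommentClassifier. The values below are assumptions
 * for illustration; check the repository for the actual paths.
 */
public final class ClassifierConstant {
    private static final String BASE =
            System.getProperty("user.dir") + "/src/main/java/cn/edu/neu/zoom";

    public static final String DATASET_WEIBO_PATH =
            (BASE + "/data/weibo_senti_100k.csv").replace("/", File.separator);
    public static final String WEIBO_NB_MODEL_PATH =
            (BASE + "/model/weibo_nb_model.ak").replace("/", File.separator);
    public static final String WEIBO_LR_MODEL_PATH =
            (BASE + "/model/weibo_lr_model.ak").replace("/", File.separator);

    private ClassifierConstant() { }
}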

2. Alink Chinese Sentiment Analysis: Weibo Comment Sentiment

package cn.edu.neu.alink;

import cn.edu.neu.alink.cons.ClassifierConstant;
import com.alibaba.alink.operator.batch.BatchOperator;
import com.alibaba.alink.operator.batch.source.CsvSourceBatchOp;
import com.alibaba.alink.operator.batch.source.TextSourceBatchOp;
import com.alibaba.alink.pipeline.LocalPredictor;
import com.alibaba.alink.pipeline.Pipeline;
import com.alibaba.alink.pipeline.PipelineModel;
import com.alibaba.alink.pipeline.classification.LogisticRegression;
import com.alibaba.alink.pipeline.classification.NaiveBayesTextClassifier;
import com.alibaba.alink.pipeline.dataproc.Imputer;
import com.alibaba.alink.pipeline.nlp.DocCountVectorizer;
import com.alibaba.alink.pipeline.nlp.Segment;
import com.alibaba.alink.pipeline.nlp.StopWordsRemover;
import org.apache.flink.types.Row;

import java.io.File;
import java.util.List;

/**
 * @author 32098
 */
public class CommentClassifier {
    private static PipelineModel pipelineModel;

    public static void initNaiveBayesModel(){
        // Load a previously saved naive Bayes model, if any (note: the NB model path)
        pipelineModel = PipelineModel.load(ClassifierConstant.WEIBO_NB_MODEL_PATH);
        if(pipelineModel==null){
            System.err.println("Failed to load model...");
            System.out.println("Building model...");
            BatchOperator<?> sourceBatchOp = getCommentSourceOp();
            Pipeline pipeline = new Pipeline(
                    // Fill missing values in "review" with the literal string "null"
                    new Imputer().setSelectedCols("review").setOutputCols("featureText").setStrategy("value").setFillValue("null"),
                    // Chinese word segmentation
                    new Segment().setSelectedCol("featureText"),
                    // Remove stop words
                    new StopWordsRemover().setSelectedCol("featureText"),
                    /*
                     * TF (term frequency) is the feature-vector type used here, see:
                     * https://www.yuque.com/pinshu/alink_doc/7a529b8564228c01c31f2fa58c43f782
                     */
                    new DocCountVectorizer().setFeatureType("TF").setSelectedCol("featureText").setOutputCol("featureVector"),
                    new NaiveBayesTextClassifier().setVectorCol("featureVector").setLabelCol("label").setPredictionCol("pred")
            );
            pipelineModel = pipeline.fit(sourceBatchOp);
            pipelineModel.save(ClassifierConstant.WEIBO_NB_MODEL_PATH);
            try {
                // save() only attaches the model to a sink; the model is not actually
                // written out until BatchOperator.execute() runs.
                BatchOperator.execute();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        System.out.println("模型构建成功!");
    }

    public static void initLogisticRegressionModel(){
        pipelineModel = PipelineModel.load(ClassifierConstant.WEIBO_LR_MODEL_PATH);
        if(pipelineModel==null){
            System.err.println("载入模型失败...");
            System.out.println("开始构建模型...");
            BatchOperator<?> sourceBatchOp = getCommentSourceOp();
            Pipeline pipeline = new Pipeline(
                    // Fill missing values in "review" with the literal string "null"
                    new Imputer().setSelectedCols("review").setOutputCols("featureText").setStrategy("value").setFillValue("null"),
                    // Chinese word segmentation
                    new Segment().setSelectedCol("featureText"),
                    // Remove stop words
                    new StopWordsRemover().setSelectedCol("featureText"),
                    /*
                     * TF (term frequency) is the feature-vector type used here, see:
                     * https://www.yuque.com/pinshu/alink_doc/7a529b8564228c01c31f2fa58c43f782
                     */
                    new DocCountVectorizer().setFeatureType("TF").setSelectedCol("featureText").setOutputCol("featureVector"),
                    new LogisticRegression().setVectorCol("featureVector").setLabelCol("label").setPredictionCol("pred")
            );
            pipelineModel = pipeline.fit(sourceBatchOp);
            pipelineModel.save(ClassifierConstant.WEIBO_LR_MODEL_PATH);
            try {
                // save() only attaches the model to a sink; the model is not actually
                // written out until BatchOperator.execute() runs.
                BatchOperator.execute();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        System.out.println("模型构建成功!");
    }

    private static BatchOperator<?> getCommentSourceOp(){
        return new CsvSourceBatchOp()
                .setFilePath(ClassifierConstant.DATASET_WEIBO_PATH)
                .setSchemaStr("label int, review string")
                .setIgnoreFirstLine(true);
    }

    public static String getClassification(String text){
        if(pipelineModel==null){
            System.err.println("As you didn't call initNaiveBayesModel() or initLogisticRegressionModel() before using getClassification(String text),\n" +
                    "we will call initNaiveBayesModel() to set value for our inner attribute (pipelineModel) to get your text's Classification");
            initNaiveBayesModel();
        }
        try {
            // https://blog.csdn.net/Alink1024/article/details/107813310
            LocalPredictor localPredictor = pipelineModel.collectLocalPredictor("review string");
            // System.out.print(localPredictor.getOutputSchema());
            Row row = Row.of(text);
            // Field 3 is the "pred" column of the local predictor's output schema
            // (review, featureText, featureVector, pred).
            return String.valueOf(localPredictor.map(row).getField(3));
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    public static void main(String[] args) throws Exception {
        // The batch-transform version below is kept for reference; this demo uses LocalPredictor instead.
//        initNaiveBayesModel();
//        System.out.println("------------------------------");
//        TextSourceBatchOp textSourceBatchOp1 = new TextSourceBatchOp()
//                .setFilePath(System.getProperty("user.dir")+"/src/main/java/cn/edu/neu/zoom/data/neg.txt".replace("/", File.separator))
//                .setTextCol("review");
//        pipelineModel.transform(textSourceBatchOp1).select(new String[]{"label", "pred", "review"}).sampleWithSize(20).print();
//
//        initLogisticRegressionModel();
//        System.out.println("------------------------------");
//        TextSourceBatchOp textSourceBatchOp2 = new TextSourceBatchOp()
//                .setFilePath(System.getProperty("user.dir")+"/src/main/java/cn/edu/neu/zoom/data/pos.txt".replace("/", File.separator))
//                .setTextCol("review");
//        pipelineModel.transform(textSourceBatchOp2).select(new String[]{"label", "pred", "review"}).sampleWithSize(20).print();

        System.out.println(getClassification("你真好"));
        System.out.println(getClassification("哇哦今年的春夏季衣服不错诶"));

        TextSourceBatchOp textSourceBatchOp1 = new TextSourceBatchOp()
                .setFilePath(System.getProperty("user.dir")+"/src/main/java/cn/edu/neu/zoom/data/neg.txt".replace("/", File.separator))
                .setTextCol("review");
        TextSourceBatchOp textSourceBatchOp2 = new TextSourceBatchOp()
                .setFilePath(System.getProperty("user.dir")+"/src/main/java/cn/edu/neu/zoom/data/pos.txt".replace("/", File.separator))
                .setTextCol("review");
        List<Row> negRows = textSourceBatchOp1.getDataSet().collect();
        List<Row> posRows = textSourceBatchOp2.getDataSet().collect();

        int acc = 0;
        for (Row negRow : negRows) {
            // expected label: 0
            String text = getClassification((String) negRow.getField(0));
            System.out.println(text);
            if("0".equals(text)){
                acc+=1;
            }
        }
        for (Row posRow : posRows) {
            // expected label: 1
            String text = getClassification((String) posRow.getField(0));
            System.out.println(text);
            if("1".equals(text)){
                acc+=1;
            }
        }
        System.out.println("Acc: "+(double) acc/(negRows.size()+posRows.size()));
    }
}


(Run output screenshot omitted.)
  This classifier feels noticeably slow!
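
One likely cause of the slowness: getClassification() calls collectLocalPredictor() on every invocation, rebuilding the predictor each time. Caching the LocalPredictor after the first call should help; here is a minimal sketch (the cachedPredictor field and method name are mine, not from the repository):

    // Hypothetical addition to CommentClassifier: build the LocalPredictor once
    // and reuse it, instead of recreating it on every prediction.
    private static LocalPredictor cachedPredictor;

    public static String getClassificationCached(String text) {
        try {
            if (cachedPredictor == null) {
                if (pipelineModel == null) {
                    initNaiveBayesModel();
                }
                cachedPredictor = pipelineModel.collectLocalPredictor("review string");
            }
            // Field 3 is the "pred" column, as in getClassification().
            return String.valueOf(cachedPredictor.map(Row.of(text)).getField(3));
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }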

3. HanLP Chinese Sentiment Analysis

HanLP on GitHub: https://github.com/hankcs/HanLP/tree/doc-zh
HanLP sentiment analysis docs: https://github.com/hankcs/HanLP/wiki/%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E4%B8%8E%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90#%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90

  Weibo comment sentiment analysis, hotel review sentiment analysis, and general text classification, implemented with HanLP's NaiveBayesClassifier and HanLPTokenizer:

package cn.edu.neu.hanlp;

import cn.edu.neu.hanlp.cons.ClassifierConstant;
import com.hankcs.hanlp.classification.classifiers.AbstractClassifier;
import com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier;
import com.hankcs.hanlp.classification.corpus.FileDataSet;
import com.hankcs.hanlp.classification.corpus.IDataSet;
import com.hankcs.hanlp.classification.models.AbstractModel;
import com.hankcs.hanlp.classification.models.NaiveBayesModel;
import com.hankcs.hanlp.classification.tokenizers.HanLPTokenizer;

import java.io.*;
import java.util.Map;

/**
 * @author 32098
 */
public class HanLpClassifier {
    private static AbstractClassifier classifier = null;

    /**
     * @param dataPath path to the training data: a single CSV file, or a directory with one subdirectory per category
     * @param modelPath path where the trained model is stored or will be saved
     */
    public static void initClassifier(String dataPath, String modelPath){
        AbstractModel model = loadModel(modelPath);
        if(model==null){
            System.out.println("No model find, begin train model!");
            IDataSet dataSet = null;
            try {
                System.out.println(dataPath);

                File f = new File(dataPath);
                if(f.isFile()){
                    BufferedReader reader = new BufferedReader(new FileReader(dataPath));
                    String str;
                    dataSet = new FileDataSet().setTokenizer(new HanLPTokenizer());
                    System.out.println("Prepare dataset!");
                    // Skip the CSV header line
                    str = reader.readLine();
                    while ((str=reader.readLine())!=null){
                        // Each line looks like "1,comment text": the first char is the label
                        dataSet.add(str.substring(0,1), str.substring(2));
                    }
                }else{
                    // A directory: HanLP treats each subdirectory name as a category label
                    dataSet = new FileDataSet().setTokenizer(new HanLPTokenizer()).load(dataPath, "UTF-8");
                }
                System.out.println("Dataset prepared!");
            } catch (IOException e) {
                e.printStackTrace();
            }
            classifier = new NaiveBayesClassifier();
            classifier.train(dataSet);
            model = classifier.getModel();
            saveModel(modelPath, model);
        }else{
            System.out.println("NaiveBayesModel init succeeded!");
            classifier = new NaiveBayesClassifier((NaiveBayesModel) model);
        }
    }

    private static void saveModel(String modelPath, AbstractModel model){
        try (ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(modelPath))) {
            oos.writeObject(model);
            System.out.println("Save NaiveBayesModel Succeeded!");
        } catch (Exception e) {
            System.err.println("Save NaiveBayesModel Failed!");
            System.err.println(e.getMessage());
        }
    }

    private static AbstractModel loadModel(String modelPath){
        try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(modelPath))) {
            Object o = ois.readObject();
            return (AbstractModel) o;
        } catch (FileNotFoundException e) {
            System.err.println("Load NaiveBayesModel Failed(NaiveBayesModel file:" + modelPath+" not Found!)");
        } catch (Exception e) {
            System.err.println(e.getMessage());
        }
        return null;
    }

    public static Double getScoreOfWeiboComment(String sentence){
        if(classifier==null){
            System.err.println("Classifier is null, default using weibo comment data to init classifier");
            System.out.println("If you want to use different data to init classifier, call initClassifier first");
            initClassifier(ClassifierConstant.DATASET_WEIBO_PATH, ClassifierConstant.WEIBO_MODEL_PATH);
        }
        Map<String, Double> map = classifier.predict(sentence);
        return map.get("1") - map.get("0");
    }

    public static String getClassification(String sentence) {
        if(classifier==null){
            System.err.println("Classifier is null, default using weibo comment data to init classifier");
            System.out.println("If you want to use different data to init classifier, call initClassifier first");
            initClassifier(ClassifierConstant.DATASET_WEIBO_PATH, ClassifierConstant.WEIBO_MODEL_PATH);
        }
        Map<String, Double> map = classifier.predict(sentence);
        // System.out.println(map);
        return classifier.classify(sentence);
    }
}

package cn.edu.neu.hanlp;

import cn.edu.neu.hanlp.cons.ClassifierConstant;

/**
 * @author 32098
 *
 * Demo: sentiment classification and Chinese text classification
 */
public class Test {
    public static void main(String[] args) {
        HanLpClassifier.initClassifier(ClassifierConstant.DATASET_WEIBO_PATH, ClassifierConstant.WEIBO_MODEL_PATH);
        System.out.println(HanLpClassifier.getClassification("天安门"));
        System.out.println(HanLpClassifier.getClassification("哇哦今年的春夏季衣服不错诶"));
        System.out.println(HanLpClassifier.getClassification("去死吧"));
        System.out.println(HanLpClassifier.getClassification("加油"));
        System.out.println(HanLpClassifier.getClassification("你真好"));
        System.out.println(HanLpClassifier.getScoreOfWeiboComment("你真好"));

        HanLpClassifier.initClassifier(ClassifierConstant.DATASET_HOTEL_PATH, ClassifierConstant.HOTEL_MODEL_PATH);
        System.out.println(HanLpClassifier.getClassification("酒店太差了"));

        HanLpClassifier.initClassifier(ClassifierConstant.DATASET_SOUGOU_PATH, ClassifierConstant.SOUGOU_MODEL_PATH);
        System.out.println(HanLpClassifier.getClassification("篮球、羽毛球"));
    }
}


  Run output: (screenshot omitted)

4. SparkML Chinese Sentiment Classification (TBD)

Omitted for now; a rough sketch of a possible implementation follows.
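
For reference, here is a minimal sketch of what the SparkML variant could look like, mirroring the Alink pipeline above (segmentation, TF features, logistic regression). Everything below is an assumption rather than the repository's final code: the class name, the CSV path, and the use of HanLP for segmentation (Spark's Tokenizer only splits on whitespace, so Chinese text must be pre-segmented).

package cn.edu.neu.sparkml;

import com.hankcs.hanlp.HanLP;
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.ml.feature.HashingTF;
import org.apache.spark.ml.feature.Tokenizer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.DataTypes;

import java.util.stream.Collectors;

/**
 * @author 32098
 */
public class SparkCommentClassifier {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("weibo-sentiment").master("local[*]").getOrCreate();

        // Spark's Tokenizer only splits on whitespace, so segment the Chinese
        // text with HanLP first and join the tokens with spaces.
        spark.udf().register("segment", (UDF1<String, String>) text ->
                HanLP.segment(text == null ? "" : text).stream()
                        .map(term -> term.word)
                        .collect(Collectors.joining(" ")), DataTypes.StringType);

        Dataset<Row> data = spark.read()
                .option("header", "true")
                .option("inferSchema", "true")
                .csv("src/main/java/cn/edu/neu/zoom/data/weibo_senti_100k.csv")
                .withColumn("segmented", functions.callUDF("segment", functions.col("review")));

        // TF features + logistic regression, mirroring the Alink pipeline above.
        Pipeline pipeline = new Pipeline().setStages(new PipelineStage[]{
                new Tokenizer().setInputCol("segmented").setOutputCol("words"),
                new HashingTF().setInputCol("words").setOutputCol("features"),
                new LogisticRegression().setLabelCol("label").setFeaturesCol("features")
        });
        PipelineModel model = pipeline.fit(data);
        model.transform(data).select("label", "prediction", "review").show(20);

        spark.stop();
    }
}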
