Chinese Sentiment Classification (Text Classification) in Java: Alink, HanLP, and SparkML (TBD)

Text sentiment analysis experiments based on the Chinese sentiment corpus ChnSentiCorp (Tan Songbo)

1. Project Layout and Repository

GitHub: https://github.com/Zhifa-Liu/EmotionClassDemo

  • cn.edu.neu.alink: Alink Chinese sentiment analysis
  • cn.edu.neu.bayes: a naive Bayes sentiment classifier, lightly adapted from https://github.com/marwincn/pubsenti-finder; its results were unconvincing, so it is not covered here
  • cn.edu.neu.hanlp: HanLP Chinese sentiment analysis
  • cn.edu.neu.sparkml: SparkML Chinese sentiment analysis (TBD)
  • cn.edu.neu.zoom.data: datasets used for sentiment analysis (text classification)
    • ChnSentiCorp, the Chinese sentiment corpus by Tan Songbo
    • a mini version of the Sogou text classification corpus
    • the Weibo comment sentiment dataset: weibo_senti_100k.csv
  • cn.edu.neu.zoom.model: saved sentiment analysis models

Download links for ChnSentiCorp (Tan Songbo) and the mini Sogou corpus can be found at the address below; the Weibo dataset is easy to find with a quick web search:
https://github.com/hankcs/HanLP/wiki/%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E4%B8%8E%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90#%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90
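
The code in the next sections reads dataset and model paths from a small ClassifierConstant class in each package (cn.edu.neu.alink.cons and cn.edu.neu.hanlp.cons), which is not shown in this post. A minimal sketch of the Alink one follows; the path values are illustrative assumptions and may differ from the repository:

package cn.edu.neu.alink.cons;

import java.io.File;

/**
 * Path constants used by CommentClassifier. The values below are assumptions
 * for illustration; check the repository for the actual paths.
 */
public final class ClassifierConstant {
    private static final String BASE =
            System.getProperty("user.dir") + "/src/main/java/cn/edu/neu/zoom";

    public static final String DATASET_WEIBO_PATH =
            (BASE + "/data/weibo_senti_100k.csv").replace("/", File.separator);
    public static final String WEIBO_NB_MODEL_PATH =
            (BASE + "/model/weibo_nb_model.ak").replace("/", File.separator);
    public static final String WEIBO_LR_MODEL_PATH =
            (BASE + "/model/weibo_lr_model.ak").replace("/", File.separator);

    private ClassifierConstant() { }
}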

2. Alink Chinese Sentiment Analysis: Weibo Comment Sentiment

package cn.edu.neu.alink;

import cn.edu.neu.alink.cons.ClassifierConstant;
import com.alibaba.alink.operator.batch.BatchOperator;
import com.alibaba.alink.operator.batch.source.CsvSourceBatchOp;
import com.alibaba.alink.operator.batch.source.TextSourceBatchOp;
import com.alibaba.alink.pipeline.LocalPredictor;
import com.alibaba.alink.pipeline.Pipeline;
import com.alibaba.alink.pipeline.PipelineModel;
import com.alibaba.alink.pipeline.classification.LogisticRegression;
import com.alibaba.alink.pipeline.classification.NaiveBayesTextClassifier;
import com.alibaba.alink.pipeline.dataproc.Imputer;
import com.alibaba.alink.pipeline.nlp.DocCountVectorizer;
import com.alibaba.alink.pipeline.nlp.Segment;
import com.alibaba.alink.pipeline.nlp.StopWordsRemover;
import org.apache.flink.types.Row;

import java.io.File;
import java.util.List;

/**
 * @author 32098
 */
public class CommentClassifier {
    private static PipelineModel pipelineModel;

    public static void initNaiveBayesModel(){
        // Load a previously saved naive Bayes model, if any (note: the NB model path)
        pipelineModel = PipelineModel.load(ClassifierConstant.WEIBO_NB_MODEL_PATH);
        if(pipelineModel==null){
            System.err.println("Failed to load model...");
            System.out.println("Building model...");
            BatchOperator<?> sourceBatchOp = getCommentSourceOp();
            Pipeline pipeline = new Pipeline(
                    // Fill missing values in "review" with the literal string "null"
                    new Imputer().setSelectedCols("review").setOutputCols("featureText").setStrategy("value").setFillValue("null"),
                    // Chinese word segmentation
                    new Segment().setSelectedCol("featureText"),
                    // Remove stop words
                    new StopWordsRemover().setSelectedCol("featureText"),
                    /*
                     * TF (term frequency) is the feature-vector type used here, see:
                     * https://www.yuque.com/pinshu/alink_doc/7a529b8564228c01c31f2fa58c43f782
                     */
                    new DocCountVectorizer().setFeatureType("TF").setSelectedCol("featureText").setOutputCol("featureVector"),
                    new NaiveBayesTextClassifier().setVectorCol("featureVector").setLabelCol("label").setPredictionCol("pred")
            );
            pipelineModel = pipeline.fit(sourceBatchOp);
            pipelineModel.save(ClassifierConstant.WEIBO_NB_MODEL_PATH);
            try {
                // save() only attaches the model to a sink; the model is not actually
                // written out until BatchOperator.execute() runs.
                BatchOperator.execute();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        System.out.println("模型构建成功!");
    }

    public static void initLogisticRegressionModel(){
        pipelineModel = PipelineModel.load(ClassifierConstant.WEIBO_LR_MODEL_PATH);
        if(pipelineModel==null){
            System.err.println("载入模型失败...");
            System.out.println("开始构建模型...");
            BatchOperator<?> sourceBatchOp = getCommentSourceOp();
            Pipeline pipeline = new Pipeline(
                    // Fill missing values in "review" with the literal string "null"
                    new Imputer().setSelectedCols("review").setOutputCols("featureText").setStrategy("value").setFillValue("null"),
                    // Chinese word segmentation
                    new Segment().setSelectedCol("featureText"),
                    // Remove stop words
                    new StopWordsRemover().setSelectedCol("featureText"),
                    /*
                     * TF (term frequency) is the feature-vector type used here, see:
                     * https://www.yuque.com/pinshu/alink_doc/7a529b8564228c01c31f2fa58c43f782
                     */
                    new DocCountVectorizer().setFeatureType("TF").setSelectedCol("featureText").setOutputCol("featureVector"),
                    new LogisticRegression().setVectorCol("featureVector").setLabelCol("label").setPredictionCol("pred")
            );
            pipelineModel = pipeline.fit(sourceBatchOp);
            pipelineModel.save(ClassifierConstant.WEIBO_LR_MODEL_PATH);
            try {
                // save() only attaches the model to a sink; the model is not actually
                // written out until BatchOperator.execute() runs.
                BatchOperator.execute();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        System.out.println("模型构建成功!");
    }

    private static BatchOperator<?> getCommentSourceOp(){
        return new CsvSourceBatchOp()
                .setFilePath(ClassifierConstant.DATASET_WEIBO_PATH)
                .setSchemaStr("label int, review string")
                .setIgnoreFirstLine(true);
    }

    public static String getClassification(String text){
        if(pipelineModel==null){
            System.err.println("As you didn't call initNaiveBayesModel() or initLogisticRegressionModel() before using getClassification(String text),\n" +
                    "we will call initNaiveBayesModel() to set value for our inner attribute (pipelineModel) to get your text's Classification");
            initNaiveBayesModel();
        }
        try {
            // https://blog.csdn.net/Alink1024/article/details/107813310
            LocalPredictor localPredictor = pipelineModel.collectLocalPredictor("review string");
            // System.out.print(localPredictor.getOutputSchema());
            Row row = Row.of(text);
            // Field 3 is the "pred" column of the local predictor's output schema
            // (review, featureText, featureVector, pred).
            return String.valueOf(localPredictor.map(row).getField(3));
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    public static void main(String[] args) throws Exception {
        // The batch-transform version below is kept for reference; this demo uses LocalPredictor instead.
//        initNaiveBayesModel();
//        System.out.println("------------------------------");
//        TextSourceBatchOp textSourceBatchOp1 = new TextSourceBatchOp()
//                .setFilePath(System.getProperty("user.dir")+"/src/main/java/cn/edu/neu/zoom/data/neg.txt".replace("/", File.separator))
//                .setTextCol("review");
//        pipelineModel.transform(textSourceBatchOp1).select(new String[]{"label", "pred", "review"}).sampleWithSize(20).print();
//
//        initLogisticRegressionModel();
//        System.out.println("------------------------------");
//        TextSourceBatchOp textSourceBatchOp2 = new TextSourceBatchOp()
//                .setFilePath(System.getProperty("user.dir")+"/src/main/java/cn/edu/neu/zoom/data/pos.txt".replace("/", File.separator))
//                .setTextCol("review");
//        pipelineModel.transform(textSourceBatchOp2).select(new String[]{"label", "pred", "review"}).sampleWithSize(20).print();

        System.out.println(getClassification("你真好"));
        System.out.println(getClassification("哇哦今年的春夏季衣服不错诶"));

        TextSourceBatchOp textSourceBatchOp1 = new TextSourceBatchOp()
                .setFilePath(System.getProperty("user.dir")+"/src/main/java/cn/edu/neu/zoom/data/neg.txt".replace("/", File.separator))
                .setTextCol("review");
        TextSourceBatchOp textSourceBatchOp2 = new TextSourceBatchOp()
                .setFilePath(System.getProperty("user.dir")+"/src/main/java/cn/edu/neu/zoom/data/pos.txt".replace("/", File.separator))
                .setTextCol("review");
        List<Row> negRows = textSourceBatchOp1.getDataSet().collect();
        List<Row> posRows = textSourceBatchOp2.getDataSet().collect();

        int acc = 0;
        for (Row negRow : negRows) {
            // expected label: 0
            String text = getClassification((String) negRow.getField(0));
            System.out.println(text);
            if("0".equals(text)){
                acc+=1;
            }
        }
        for (Row posRow : posRows) {
            // expected label: 1
            String text = getClassification((String) posRow.getField(0));
            System.out.println(text);
            if("1".equals(text)){
                acc+=1;
            }
        }
        System.out.println("Acc: "+(double) acc/(negRows.size()+posRows.size()));
    }
}


(Run output screenshot omitted.)
  This classifier feels noticeably slow!
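
One likely cause of the slowness: getClassification() calls collectLocalPredictor() on every invocation, rebuilding the predictor each time. Caching the LocalPredictor after the first call should help; here is a minimal sketch (the cachedPredictor field and method name are mine, not from the repository):

    // Hypothetical addition to CommentClassifier: build the LocalPredictor once
    // and reuse it, instead of recreating it on every prediction.
    private static LocalPredictor cachedPredictor;

    public static String getClassificationCached(String text) {
        try {
            if (cachedPredictor == null) {
                if (pipelineModel == null) {
                    initNaiveBayesModel();
                }
                cachedPredictor = pipelineModel.collectLocalPredictor("review string");
            }
            // Field 3 is the "pred" column, as in getClassification().
            return String.valueOf(cachedPredictor.map(Row.of(text)).getField(3));
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }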

3. HanLP Chinese Sentiment Analysis

HanLP on GitHub: https://github.com/hankcs/HanLP/tree/doc-zh
HanLP sentiment analysis docs: https://github.com/hankcs/HanLP/wiki/%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E4%B8%8E%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90#%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90

  Weibo comment sentiment analysis, hotel review sentiment analysis, and general text classification, implemented with HanLP's NaiveBayesClassifier and HanLPTokenizer:

package cn.edu.neu.hanlp;

import cn.edu.neu.hanlp.cons.ClassifierConstant;
import com.hankcs.hanlp.classification.classifiers.AbstractClassifier;
import com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier;
import com.hankcs.hanlp.classification.corpus.FileDataSet;
import com.hankcs.hanlp.classification.corpus.IDataSet;
import com.hankcs.hanlp.classification.models.AbstractModel;
import com.hankcs.hanlp.classification.models.NaiveBayesModel;
import com.hankcs.hanlp.classification.tokenizers.HanLPTokenizer;

import java.io.*;
import java.util.Map;

/**
 * @author 32098
 */
public class HanLpClassifier {
    private static AbstractClassifier classifier = null;

    /**
     * @param dataPath path to the training data: a single CSV file, or a directory with one subdirectory per category
     * @param modelPath path where the trained model is stored or will be saved
     */
    public static void initClassifier(String dataPath, String modelPath){
        AbstractModel model = loadModel(modelPath);
        if(model==null){
            System.out.println("No model find, begin train model!");
            IDataSet dataSet = null;
            try {
                System.out.println(dataPath);

                File f = new File(dataPath);
                if(f.isFile()){
                    BufferedReader reader = new BufferedReader(new FileReader(dataPath));
                    String str;
                    dataSet = new FileDataSet().setTokenizer(new HanLPTokenizer());
                    System.out.println("Prepare dataset!");
                    // Skip the CSV header line
                    str = reader.readLine();
                    while ((str=reader.readLine())!=null){
                        // Each line looks like "1,comment text": the first char is the label
                        dataSet.add(str.substring(0,1), str.substring(2));
                    }
                }else{
                    // A directory: HanLP treats each subdirectory name as a category label
                    dataSet = new FileDataSet().setTokenizer(new HanLPTokenizer()).load(dataPath, "UTF-8");
                }
                System.out.println("Dataset prepared!");
            } catch (IOException e) {
                e.printStackTrace();
            }
            classifier = new NaiveBayesClassifier();
            classifier.train(dataSet);
            model = classifier.getModel();
            saveModel(modelPath, model);
        }else{
            System.out.println("NaiveBayesModel init succeeded!");
            classifier = new NaiveBayesClassifier((NaiveBayesModel) model);
        }
    }

    private static void saveModel(String modelPath, AbstractModel model){
        try (ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(modelPath))) {
            oos.writeObject(model);
            System.out.println("Save NaiveBayesModel Succeeded!");
        } catch (Exception e) {
            System.err.println("Save NaiveBayesModel Failed!");
            System.err.println(e.getMessage());
        }
    }

    private static AbstractModel loadModel(String modelPath){
        try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(modelPath))) {
            Object o = ois.readObject();
            return (AbstractModel) o;
        } catch (FileNotFoundException e) {
            System.err.println("Load NaiveBayesModel Failed(NaiveBayesModel file:" + modelPath+" not Found!)");
        } catch (Exception e) {
            System.err.println(e.getMessage());
        }
        return null;
    }

    public static Double getScoreOfWeiboComment(String sentence){
        if(classifier==null){
            System.err.println("Classifier is null, default using weibo comment data to init classifier");
            System.out.println("If you want to use different data to init classifier, call initClassifier first");
            initClassifier(ClassifierConstant.DATASET_WEIBO_PATH, ClassifierConstant.WEIBO_MODEL_PATH);
        }
        Map<String, Double> map = classifier.predict(sentence);
        return map.get("1") - map.get("0");
    }

    public static String getClassification(String sentence) {
        if(classifier==null){
            System.err.println("Classifier is null, default using weibo comment data to init classifier");
            System.out.println("If you want to use different data to init classifier, call initClassifier first");
            initClassifier(ClassifierConstant.DATASET_WEIBO_PATH, ClassifierConstant.WEIBO_MODEL_PATH);
        }
        Map<String, Double> map = classifier.predict(sentence);
        // System.out.println(map);
        return classifier.classify(sentence);
    }
}

package cn.edu.neu.hanlp;

import cn.edu.neu.hanlp.cons.ClassifierConstant;

/**
 * @author 32098
 *
 * Demo: sentiment classification and Chinese text classification
 */
public class Test {
    public static void main(String[] args) {
        HanLpClassifier.initClassifier(ClassifierConstant.DATASET_WEIBO_PATH, ClassifierConstant.WEIBO_MODEL_PATH);
        System.out.println(HanLpClassifier.getClassification("天安门"));
        System.out.println(HanLpClassifier.getClassification("哇哦今年的春夏季衣服不错诶"));
        System.out.println(HanLpClassifier.getClassification("去死吧"));
        System.out.println(HanLpClassifier.getClassification("加油"));
        System.out.println(HanLpClassifier.getClassification("你真好"));
        System.out.println(HanLpClassifier.getScoreOfWeiboComment("你真好"));

        HanLpClassifier.initClassifier(ClassifierConstant.DATASET_HOTEL_PATH, ClassifierConstant.HOTEL_MODEL_PATH);
        System.out.println(HanLpClassifier.getClassification("酒店太差了"));

        HanLpClassifier.initClassifier(ClassifierConstant.DATASET_SOUGOU_PATH, ClassifierConstant.SOUGOU_MODEL_PATH);
        System.out.println(HanLpClassifier.getClassification("篮球、羽毛球"));
    }
}


  Run output: (screenshot omitted)

4. SparkML Chinese Sentiment Classification (TBD)

Omitted for now; a rough sketch of a possible implementation follows.
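
For reference, here is a minimal sketch of what the SparkML variant could look like, mirroring the Alink pipeline above (segmentation, TF features, logistic regression). Everything below is an assumption rather than the repository's final code: the class name, the CSV path, and the use of HanLP for segmentation (Spark's Tokenizer only splits on whitespace, so Chinese text must be pre-segmented).

package cn.edu.neu.sparkml;

import com.hankcs.hanlp.HanLP;
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.ml.feature.HashingTF;
import org.apache.spark.ml.feature.Tokenizer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.DataTypes;

import java.util.stream.Collectors;

/**
 * @author 32098
 */
public class SparkCommentClassifier {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("weibo-sentiment").master("local[*]").getOrCreate();

        // Spark's Tokenizer only splits on whitespace, so segment the Chinese
        // text with HanLP first and join the tokens with spaces.
        spark.udf().register("segment", (UDF1<String, String>) text ->
                HanLP.segment(text == null ? "" : text).stream()
                        .map(term -> term.word)
                        .collect(Collectors.joining(" ")), DataTypes.StringType);

        Dataset<Row> data = spark.read()
                .option("header", "true")
                .option("inferSchema", "true")
                .csv("src/main/java/cn/edu/neu/zoom/data/weibo_senti_100k.csv")
                .withColumn("segmented", functions.callUDF("segment", functions.col("review")));

        // TF features + logistic regression, mirroring the Alink pipeline above.
        Pipeline pipeline = new Pipeline().setStages(new PipelineStage[]{
                new Tokenizer().setInputCol("segmented").setOutputCol("words"),
                new HashingTF().setInputCol("words").setOutputCol("features"),
                new LogisticRegression().setLabelCol("label").setFeaturesCol("features")
        });
        PipelineModel model = pipeline.fit(data);
        model.transform(data).select("label", "prediction", "review").show(20);

        spark.stop();
    }
}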
