NER with Large Language Models: Generating Annotations

NER with Qwen


Preface

When ChatGPT first appeared I considered applying it to information extraction, and a number of papers on LLM-based information extraction have since been published. Some repositories I collected earlier:

https://github.com/cocacola-lab/GPT4IE
https://github.com/RidongHan/Evaluation-of-ChatGPT-on-Information-Extraction
https://github.com/cocacola-lab/ChatIE


Why do large language models still clearly underperform common baselines on NER?

Because of the gap between the two tasks:

NER is inherently a sequence-labeling task, while an LLM is a text-generation model.
		
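That gap shows up concretely when you map generated entity strings back onto the input: a sequence labeler emits a tag per token, while a generator emits free text that must be re-aligned to character offsets, and may paraphrase or invent strings that never occur in the source. A minimal sketch of that alignment step (the helper name is mine, not from the code below):

```python
def align_entities(text: str, generated: list) -> list:
    """Map entity strings produced by an LLM back to (start, end, entity) spans."""
    spans = []
    cursor = 0
    for ent in generated:
        start = text.find(ent, cursor)
        if start == -1:
            # The model paraphrased or hallucinated: no exact span exists
            continue
        spans.append((start, start + len(ent), ent))
        cursor = start + len(ent)
    return spans

# Spans are recovered only for strings that literally occur in the text;
# the hallucinated '清华' is silently dropped
print(align_entities('小明在北京大学读书', ['小明', '北京大学', '清华']))
```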

I. QWEN_NER

1. Importing the libraries

The code is as follows (example):

from dashscope import Generation
from http import HTTPStatus
import difflib
import jieba
import synonyms
import numpy as np
import yaml
from typing import List, Tuple

# Suppress divide-by-zero / invalid-value warnings from numpy
np.seterr(divide='ignore', invalid='ignore')



2. Wrapping the main similarity-matching logic
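The class below reads its API key and label lists from `config_emtion.yaml`. The original post does not show that file; a hypothetical minimal version matching the keys the code accesses might look like:

```yaml
SK_TONGYI:                      # DashScope API key (placeholder value)
  - sk-xxxxxxxxxxxxxxxx
Attribute_words_makeup_label:   # "attribute-polarity" labels matched against model output
  - 包装-好评
  - 包装-差评
  - 物流-差评
Attribute_makeup_words:         # attribute vocabulary used for synonym matching
  - 包装
  - 物流
  - 赠品
```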


class EmotionAnalyzer:
    def __init__(self):
        # Load the API key and the label/attribute word lists from the YAML config
        with open('./config_emtion.yaml', 'r', encoding='utf-8') as f:
            self.data_file = yaml.load(f, Loader=yaml.FullLoader)
        self.SK_TONGYI = self.data_file['SK_TONGYI'][0]
        self.dict_labelmakeup_list = self.data_file['Attribute_words_makeup_label']
        self.data_makeup_list = self.data_file['Attribute_makeup_words']

    def call_with_messages(self, text: str, label_list: List[str]) -> str:
        messages = [
            {
                'role': 'system',
                'content': '''# Goals
                    对用户输入的内容进行分析,打标处理,从客服的角度分析用户的意图,就事论事,不要发散输出和思考,判断与以下哪些合适的标签匹配。  
                    只输出标签,自行检查匹配标签和内容是否合理,不要输出任何解释,若匹配多个标签则使用半角逗号分割,若一个标签也没有或者不合理匹配输出null,
                    # 要求
                    - 标签列表:{a}'''.format(a=label_list)},
            {
                'role': 'user',
                'content': text
            }
        ]
        gen = Generation()
        response = gen.call(
            Generation.Models.qwen_plus_v1,
            messages=messages,
            api_key=self.SK_TONGYI,
            temperature=0.6,
            top_p=0.6,
            result_format='message'  # return the result in message format
        )
        if response.status_code == HTTPStatus.OK:
            return response
        else:
            # On failure return a diagnostic string; callers detect the error
            # path via the TypeError raised when subscripting this string
            return (
                f'Request id: {response.request_id}, Status code: {response.status_code}, error code: {response.code}, error message: {response.message}'
            )


    # Return the candidate from data_list most similar to the target
    def find_most_similar(self, target: str, data_list: List[str]) -> Tuple[str, float]:
        most_similar = None
        highest_similarity = 0
        for data in data_list:
            similarity = difflib.SequenceMatcher(None, target, data).ratio()
            if similarity > highest_similarity:
                highest_similarity = similarity
                most_similar = data
        return most_similar, highest_similarity

    def cut_jieba(self, word: str, dict_path_jieba: str) -> List[str]:
        if isinstance(word, str):
            data = [word.replace('/', '')]
            jieba.load_userdict(dict_path_jieba)
            words = [w for i in data for w in jieba.cut(i) if len(w) > 1]
            return words
        else:
            print('non-string input, skipped')  # log and return an empty list
            return []

    def label_jieba(self) -> Tuple[List[str], List[str]]:
        data_list_label = self.dict_labelmakeup_list
        data_list = self.data_makeup_list
        return data_list_label, data_list

    def get_target_set(self, word: str, dict_path_jieba: str = './jieba_dict.txt') -> set:
        data_list_label, data_list = self.label_jieba()
        target = self.cut_jieba(word, dict_path_jieba)
        for i in target:
            highest_similarity = 0
            most_similar = None
            for data in data_list:
                # synonyms.compare returns a 0-1 similarity score
                synlst = synonyms.compare(i, data)
                if synlst > highest_similarity:
                    highest_similarity = synlst
                    most_similar = data
            if float(highest_similarity) > 0.5:
                yield most_similar

    def score_similar(self, word: str) -> List[str]:
        target = word.replace('\'', '').split(',')
        data_list_label, data_list = self.label_jieba()
        for i in target:
            item = i.strip('\'')
            word_score, score = self.find_most_similar(item, data_list_label)
            if float(score) > float(0.9):
                yield word_score

    def node_class_sec(self, node: str) -> List[str]:
        attributes = {}
        attrlist = []
        score_list = []
        if isinstance(node, str):
            key, value = node.split('-', 1)
            if value not in ['', '无法确定', '无法确定 ', '未提及', '无 ', '无', '无评价', '无评价 ', '未提及 ']:
                attributes[key] = value
            for key, values in attributes.items():
                attrlist.append(key + '-' + values)
            data_list_label, data_list = self.label_jieba()
            for i in attrlist:
                for j in data_list_label:
                    word_score, score = self.find_most_similar(i, j)
                    if float(score) > float(0.9):
                        score_list.append(word_score)
            return score_list
        else:
            return [None]

    def node_class_first(self, node: str) -> List[str]:
        attributes = {}
        attrlist = []
        score_list = []
        if isinstance(node, str):
            key, value = node.split(':', 1)
            
            if value not in ['', '无法确定', '无法确定 ', '未提及', '无 ', '无', '无评价', '无评价 ', '未提及 ']:
                attributes[key] = value
            for key, values in attributes.items():
                attrlist.append(key + '-' + values)
            data_list_label, data_list = self.label_jieba()
            for i in attrlist:
                for j in data_list_label:
                    word_score, score = self.find_most_similar(i, j)
                    if float(score) > float(0.9):
                        score_list.append(word_score)
            return score_list
        else:
            return [None]

    def class_type_first(self, data: str) -> List[str]:
        score_list_f = []
        pairs = data.replace('\'', '').replace('"', '').replace(" ", ',').replace('\\n', ' ,').split(',')
        pairs = list(filter(None, pairs))

        for pair in pairs:
            if ':' in pair:
                score_list = self.node_class_first(pair)
                if len(score_list) == 0:
                    continue
                score_list_f.append(score_list[0])

            elif '-' in pair:
                score_list = self.node_class_sec(pair)
                if len(score_list) == 0:
                    continue
                score_list_f.append(score_list[0])
            else:
                continue

        if len(list(set(score_list_f))) >= 10:
            return [None]
        else:
            return list(set(score_list_f))

    def main_run(self, text: str) -> Tuple[List[str], str]:
        dict_text = list(self.get_target_set(text))  # materialize the generator into a list
        prompt_text = text + '\n参考内容:\n' + str(dict_text).replace('\\', '')
        try:
            data = self.call_with_messages(prompt_text, label_list=self.dict_labelmakeup_list)['output']['choices'][0][
                'message']['content']
        except TypeError:
            data = text

        # Iterate the matcher generator over the model output
        result = self.score_similar(data)
        result_list = list(result).copy()

        if len(list(set(result_list))) >= 9:
            return self.class_type_first(data), data
        else:
            return list(set(result_list)), data

    def main_dict_run(self, text: str) -> dict:
        # Use self rather than the module-level `analyzer` instance
        attrlist, data = self.main_run(text)
        result = {'好评': [], '差评': []}
        for item in attrlist:
            if '-好评' in item:
                result['好评'].append(item.split('-')[0])
            elif '-差评' in item:
                result['差评'].append(item.split('-')[0])
        return result


if __name__ == '__main__':
    analyzer = EmotionAnalyzer()
    text = '最差的一次,明明买30ml享50ml收到货少了一个小样,然后还有入会员的福利,虽说是一分钱不多,但是要讲究一个信誉啊,福利也没发过来,然后就是各种借口理由搪塞过去无语'
    data = analyzer.main_dict_run(text)
    print(data)
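The whole post-processing pipeline hinges on `find_most_similar`: `difflib.SequenceMatcher.ratio` scores each candidate label against a fragment of the model's output, and only matches above 0.9 survive. A standalone sketch of that gate (the labels here are illustrative):

```python
import difflib

def best_label(fragment: str, labels: list, threshold: float = 0.9):
    """Return the label most similar to `fragment`, or None if below threshold."""
    best, best_score = None, 0.0
    for label in labels:
        score = difflib.SequenceMatcher(None, fragment, label).ratio()
        if score > best_score:
            best, best_score = label, score
    return best if best_score > threshold else None

labels = ['包装-好评', '包装-差评', '物流-差评']
print(best_label('包装-差评', labels))   # exact match, ratio 1.0
print(best_label('发货速度', labels))    # nothing close enough, returns None
```

Note how strict the 0.9 gate is: '包装-差评' vs '包装-好评' shares four of five characters but only scores 0.8, so near-miss polarity flips are rejected rather than guessed.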


Actual matched labels (example)

 Exact_Match_Tag = ['质量', '卖家服务',
              '性价比', '物流', '正品', '包装', '整体', '赠品', '美白效果', '使用效果', '颜色', '抗衰效果',
              '成分', '去黑头效果', '异味气味', '妆感', '控油效果', '洁净效果', '保质期', '做工', '外观设计',
              '色泽', '描述相符', '便捷性', '手感', '份量', ...
              ]

Retrieving the result

     result_dict = {'好评': [], '差评': ['便捷性', '包装', '保质期', '正品']}
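This dictionary is produced by `main_dict_run`, which splits each matched "attribute-polarity" tag on the hyphen and buckets the attribute under 好评 (positive) or 差评 (negative). The same grouping in isolation:

```python
attrlist = ['便捷性-差评', '包装-差评', '保质期-差评', '正品-差评']
result = {'好评': [], '差评': []}
for item in attrlist:
    # partition splits only on the first hyphen: attribute on the left, polarity on the right
    attr, _, polarity = item.partition('-')
    if polarity in result:
        result[polarity].append(attr)
print(result)  # {'好评': [], '差评': ['便捷性', '包装', '保质期', '正品']}
```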

Summary

On the LLM-based approach

Shortcomings: as noted at the start, an LLM is at heart a generative model, so its recall and generalization on this task are hard to control. Trying GPT-3.5 or GPT-4 should give noticeably better results; what I tested here is just a small Qwen demo. If you have a better prompt, corrections are welcome.

I did not use LLM embeddings to compute similarity because they are too slow, so I chose coarse- and fine-grained matching with synonyms for the NER matching instead.

The best optimization is a better prompt. Thanks for reading.
