
Named Entity Recognition NER Exploration (2)

Series Articles

Named Entity Recognition NER Exploration (1): https://duanzhihua.blog.csdn.net/article/details/108338970


Preface

In the first article of this NER series, Named Entity Recognition NER Exploration (1), we covered data collection and cleaning, and automatic labeling to convert the text into a format suitable for deep learning.


This article converts the text into integer encodings and concatenates the data in various ways for data augmentation.
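Concretely, besides each original sentence, every pair and every triple of adjacent sentences is also added as a training sample. A minimal sketch of the idea (toy token sequences, not the real multi-feature format used below):

sents = [['今', '天'], ['天', '气'], ['很', '好']]   # toy token sequences
pairs = [sents[i] + sents[i + 1] for i in range(len(sents) - 1)]
triples = [sents[i] + sents[i + 1] + sents[i + 2] for i in range(len(sents) - 2)]
augmented = sents + pairs + triples   # originals + adjacent pairs + adjacent triples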

Writing the encoding-conversion and feature-concatenation functions

The code of data_utils.py is as follows (example):

# encoding=utf8
import re
import math
import codecs
import random
import os
import numpy as np
import pandas as pd
import jieba
import pickle
from tqdm import tqdm

jieba.initialize()

def get_data(name='train'):
    '''
    Gather all the data files in one place, and concatenate the data in
    different forms for data augmentation.
    :param name: which split to load ('train', 'test' or 'task')
    :return: None; the result is pickled to data/prepare/<name>.pkl
    '''
    with open('data/Prepare/dict.pkl', 'rb') as f:
        map_dict = pickle.load(f)


    def item2id(data, w2i):
        '''
        Convert items to ids.
        :param data: the sequence to convert
        :param w2i: the item-to-id mapping
        :return: the id of each known item, or the id of 'UNK' otherwise
        '''
        return [w2i[x] if x in w2i else w2i['UNK'] for x in data]

    results = []
    root = os.path.join('data/prepare/',name)
    files = list(os.listdir(root))
    fileindex=-1
    file_index = []


    for file in tqdm(files):
        result = []
        path = os.path.join(root, file)

        try:
            samples = pd.read_csv(path, sep=',')
        except UnicodeDecodeError:
            # fall back to GBK for files that are not valid UTF-8
            samples = pd.read_csv(path, sep=',', encoding='gbk')
        except Exception as e:
            # skip unreadable files instead of crashing on an undefined `samples`
            print(e)
            continue

        num_samples = len(samples)
        fileindex += num_samples
        file_index.append(fileindex)
        # record the index where each sentence starts
        sep_index = [-1]+samples[samples['word']=='sep'].index.tolist()+[num_samples]  # e.g. -1, 20, 40, 50

        # ----------------- fetch each sentence and convert it all to ids -----------------
        for i in range(len(sep_index)-1):
            start = sep_index[i]+1
            end = sep_index[i+1]
            data = []
            for feature in samples.columns:
                try:
                    data.append(item2id(list(samples[feature])[start:end], map_dict[feature][1]))
                except Exception as e:
                    # an unmapped feature column or missing 'UNK' entry; report it and keep going
                    print(feature, e)
            result.append(data)
        # Concatenate the data in different ways (no concatenation, adjacent pairs,
        # adjacent triples, ...) to give the model more contexts to learn from.

        # ---------------------------------- data augmentation ----------------------------------
        if name == 'task':
            results.extend(result)
        else:
            two=[]
            for i in range(len(result)-1):
                first = result[i]
                second = result[i+1]
                two.append([first[k]+second[k] for k in range(len(first))])

            three = []
            for i in range(len(result) - 2):
                first = result[i]
                second = result[i + 1]
                third = result[i + 2]
                three.append([first[k] + second[k]+third[k] for k in range(len(first))])
            # use extend rather than append so each augmented sentence is added individually
            results.extend(result+two+three)

    with open(f'data/prepare/{name}.pkl', 'wb') as f:
        pickle.dump(results,f)

def create_dico(item_list):
    """
    Create a dictionary of items from a list of list of items.
    """
    assert type(item_list) is list
    dico = {}
    for items in item_list:
        for item in items:
            if item not in dico:
                dico[item] = 1
            else:
                dico[item] += 1
    return dico
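# Example (a quick sanity check of the function above):
#   create_dico([['a', 'b'], ['a']]) -> {'a': 2, 'b': 1}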


def create_mapping(dico):
    """
    Create a mapping (item to ID / ID to item) from a dictionary.
    Items are ordered by decreasing frequency.
    """
    sorted_items = sorted(dico.items(), key=lambda x: (-x[1], x[0]))
    id_to_item = {i: v[0] for i, v in enumerate(sorted_items)}
    item_to_id = {v: k for k, v in id_to_item.items()}
    return item_to_id, id_to_item
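# Example: create_mapping({'a': 3, 'b': 5}) -> ({'b': 0, 'a': 1}, {0: 'b', 1: 'a'})
#   (items are sorted by decreasing frequency, ties broken alphabetically)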


def zero_digits(s):
    """
    Replace every digit in a string by a zero.
    """
    return re.sub(r'\d', '0', s)


def iob2(tags):
    """
    Check that tags have a valid IOB format.
    Tags in IOB1 format are converted to IOB2.
    """
    for i, tag in enumerate(tags):
        if tag == 'O':
            continue
        split = tag.split('-')
        if len(split) != 2 or split[0] not in ['I', 'B']:
            return False
        if split[0] == 'B':
            continue
        elif i == 0 or tags[i - 1] == 'O':  # conversion IOB1 to IOB2
            tags[i] = 'B' + tag[1:]
        elif tags[i - 1][1:] == tag[1:]:
            continue
        else:  # conversion IOB1 to IOB2
            tags[i] = 'B' + tag[1:]
    return True


def iob_iobes(tags):
    """
    IOB -> IOBES
    """
    new_tags = []
    for i, tag in enumerate(tags):
        if tag == 'O':
            new_tags.append(tag)
        elif tag.split('-')[0] == 'B':
            if i + 1 != len(tags) and \
               tags[i + 1].split('-')[0] == 'I':
                new_tags.append(tag)
            else:
                new_tags.append(tag.replace('B-', 'S-'))
        elif tag.split('-')[0] == 'I':
            if i + 1 < len(tags) and \
                    tags[i + 1].split('-')[0] == 'I':
                new_tags.append(tag)
            else:
                new_tags.append(tag.replace('I-', 'E-'))
        else:
            raise Exception('Invalid IOB format!')
    return new_tags
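# Example: iob_iobes(['B-ORG', 'I-ORG', 'O', 'B-PER']) -> ['B-ORG', 'E-ORG', 'O', 'S-PER']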


def iobes_iob(tags):
    """
    IOBES -> IOB
    """
    new_tags = []
    for i, tag in enumerate(tags):
        if tag.split('-')[0] == 'B':
            new_tags.append(tag)
        elif tag.split('-')[0] == 'I':
            new_tags.append(tag)
        elif tag.split('-')[0] == 'S':
            new_tags.append(tag.replace('S-', 'B-'))
        elif tag.split('-')[0] == 'E':
            new_tags.append(tag.replace('E-', 'I-'))
        elif tag.split('-')[0] == 'O':
            new_tags.append(tag)
        else:
            raise Exception('Invalid format!')
    return new_tags


def insert_singletons(words, singletons, p=0.5):
    """
    Replace singletons by the unknown word with a probability p.
    """
    new_words = []
    for word in words:
        if word in singletons and np.random.uniform() < p:
            new_words.append(0)
        else:
            new_words.append(word)
    return new_words
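# Example: with p=1.0 every singleton is replaced by id 0 (the UNK id is assumed to be 0 here):
#   insert_singletons([3, 7, 9], {7}, p=1.0) -> [3, 0, 9]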


def get_seg_features(string):
    """
    Segment text with jieba.
    Features are represented in BIES format:
    0 denotes a single-character word; 1, 2 and 3 denote the
    begin, inside and end positions of a multi-character word.
    """
    
    seg_feature = []

    for word in jieba.cut(string):
        if len(word) == 1:
            seg_feature.append(0)
        else:
            tmp = [2] * len(word)
            tmp[0] = 1
            tmp[-1] = 3
            seg_feature.extend(tmp)
    
    return seg_feature
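# Example (jieba may segment differently depending on its dictionary):
#   if jieba.cut("南京市长江大桥") yields ['南京市', '长江大桥'],
#   then get_seg_features("南京市长江大桥") -> [1, 2, 3, 1, 2, 2, 3]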

def create_input(data):
    """
    Take sentence data and return an input for
    the training or the evaluation function.
    """
    inputs = list()
    inputs.append(data['chars'])
    inputs.append(data["segs"])
    inputs.append(data['tags'])
    return inputs


def load_word2vec(emb_path, id_to_word, word_dim, old_weights):
    """
    Load word embedding from pre-trained file
    embedding size must match
    """
    new_weights = old_weights
    print('Loading pretrained embeddings from {}...'.format(emb_path))
    pre_trained = {}
    emb_invalid = 0
    for i, line in enumerate(codecs.open(emb_path, 'r', 'utf-8')):
        line = line.rstrip().split()
        if len(line) == word_dim + 1:
            pre_trained[line[0]] = np.array(
                [float(x) for x in line[1:]]
            ).astype(np.float32)
        else:
            emb_invalid += 1
    if emb_invalid > 0:
        print('WARNING: %i invalid lines' % emb_invalid)
    c_found = 0
    c_lower = 0
    c_zeros = 0
    n_words = len(id_to_word)
    # Lookup table initialization
    for i in range(n_words):
        word = id_to_word[i]
        if word in pre_trained:
            new_weights[i] = pre_trained[word]
            c_found += 1
        elif word.lower() in pre_trained:
            new_weights[i] = pre_trained[word.lower()]
            c_lower += 1
        elif re.sub(r'\d', '0', word.lower()) in pre_trained:
            new_weights[i] = pre_trained[
                re.sub(r'\d', '0', word.lower())
            ]
            c_zeros += 1
    print('Loaded %i pretrained embeddings.' % len(pre_trained))
    print('%i / %i (%.4f%%) words have been initialized with '
          'pretrained embeddings.' % (
        c_found + c_lower + c_zeros, n_words,
        100. * (c_found + c_lower + c_zeros) / n_words)
    )
    print('%i found directly, %i after lowercasing, '
          '%i after lowercasing + zero.' % (
        c_found, c_lower, c_zeros
    ))
    return new_weights


def full_to_half(s):
    """
    Convert full-width character to half-width one 
    """
    n = []
    for char in s:
        num = ord(char)
        if num == 0x3000:
            num = 32
        elif 0xFF01 <= num <= 0xFF5E:
            num -= 0xfee0
        char = chr(num)
        n.append(char)
    return ''.join(n)
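# Example: full_to_half('Hello!') -> 'Hello!' (full-width letters and
#   punctuation are mapped to their half-width ASCII equivalents)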


def cut_to_sentence(text):
    """
    Cut text to sentences 
    """
    sentence = []
    sentences = []
    len_p = len(text)
    pre_cut = False
    for idx, word in enumerate(text):
        sentence.append(word)
        cut = False
        if pre_cut:
            cut=True
            pre_cut=False
        if word in u"!?\n":
            cut = True
            if len_p > idx+1:
                if text[idx+1] in ".\"\'?!":
                    cut = False
                    pre_cut=True

        if cut:
            sentences.append(''.join(sentence))
            sentence = []
    if sentence:
        sentences.append(''.join(sentence))
    return sentences


def replace_html(s):
    s = s.replace('&quot;','"')
    s = s.replace('&amp;','&')
    s = s.replace('&lt;','<')
    s = s.replace('&gt;','>')
    s = s.replace('&nbsp;',' ')
    s = s.replace("&ldquo;", "")
    s = s.replace("&rdquo;", "")
    s = s.replace("&mdash;","")
    s = s.replace("\xa0", " ")
    return(s)

def get_dict(path):
    with open(path, 'rb') as f:
        # avoid shadowing the built-in name `dict`
        d = pickle.load(f)
    return d

def input_from_line(line, char_to_id):
    """
    Take sentence data and return an input for
    the training or the evaluation function.
    """
    line = full_to_half(line)
    line = replace_html(line)
    inputs = list()
    inputs.append([line])
    line = line.replace(" ", "$")
    inputs.append([[char_to_id[char] if char in char_to_id else char_to_id["<UNK>"]
                   for char in line]])
    inputs.append([get_seg_features(line)])
    inputs.append([[]])
    return inputs


class BatchManager(object):

    def __init__(self, batch_size, name='train'):
        with open(f'data/prepare/{name}.pkl', 'rb') as f:
            data = pickle.load(f)
        self.batch_data = self.sort_and_pad(data, batch_size, name)
        self.len_data = len(self.batch_data)

    def sort_and_pad(self, data, batch_size, name):
        # total number of batches
        num_batch = int(math.ceil(len(data) / batch_size))
        # sort sentences by length so each batch needs minimal padding
        sorted_data = sorted(data, key=lambda x: len(x[0]))
        batch_data = list()
        for i in range(num_batch):
            batch_data.append(self.pad_data(sorted_data[i * int(batch_size):(i + 1) * int(batch_size)], name))
        return batch_data

    @staticmethod
    def pad_data(data, name):
        if name != 'task':
            chars = []
            targets = []
            bounds = []
            flags = []
            radicals = []
            pinyins = []

            max_length = max([len(sentence[0]) for sentence in data])  # len(data[-1][0])
            for line in data:
                char, target, bound, flag, radical, pinyin = line
                padding = [0] * (max_length - len(char))
                chars.append(char + padding)
                targets.append(target + padding)
                bounds.append(bound + padding)
                flags.append(flag + padding)
                radicals.append(radical + padding)
                pinyins.append(pinyin + padding)
            return [chars, targets, bounds, flags, radicals, pinyins]
        else:
            chars = []
            bounds = []
            flags = []
            radicals = []
            pinyins = []

            max_length = max([len(sentence[0]) for sentence in data])  # len(data[-1][0])
            for line in data:
                char, bound, flag, radical, pinyin = line
                padding = [0] * (max_length - len(char))
                chars.append(char + padding)
                bounds.append(bound + padding)
                flags.append(flag + padding)
                radicals.append(radical + padding)
                pinyins.append(pinyin + padding)
            return [chars, bounds, flags, radicals, pinyins]

    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for idx in range(self.len_data):
            yield self.batch_data[idx]

if __name__ == '__main__':
    get_data('train')
    get_data('test')
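Once the pickle files exist, they can be batched for training. A minimal sketch of consuming them with the BatchManager class defined above (the batch size is an arbitrary example value):

from data_utils import BatchManager

train_manager = BatchManager(batch_size=32, name='train')
for batch in train_manager.iter_batch(shuffle=True):
    # each batch is the padded output of pad_data
    chars, targets, bounds, flags, radicals, pinyins = batch
    # feed the feature matrices into the model here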

Program output

The run generates train.pkl and test.pkl; the console output is as follows:

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\lenovo\AppData\Local\Temp\jieba.cache
Loading model cost 1.349 seconds.
Prefix dict has been built successfully.
100%|██████████| 290/290 [11:42<00:00,  2.42s/it]
100%|██████████| 73/73 [02:37<00:00,  2.16s/it]

Process finished with exit code 0


A simplified version of automatic labeling

Build a named-entity-recognition dictionary:

.......,AT
无,AT
端口,REG
UP,SYM
.....

Original dataset

1.txtoriginal.txt

......
描述:两端已恢复,请回单。
......

Automatic labeling

# encoding=utf8
import os
import csv
import jieba
import jieba.posseg as pseg

c_root = os.path.split(os.getcwd())[0] + os.sep + "ner_input_source_data" + os.sep
dev = open("../ner_output_result_data/noc_ner.val", 'w', encoding='utf8')
train = open("../ner_output_result_data/noc_ner.train", 'w', encoding='utf8')
test = open("../ner_output_result_data/noc_ner.test", 'w', encoding='utf8')
biaoji = set(['ORG', 'AM', 'FR', 'TE', 'AT', 'DE', 'SYM', 'CH', 'REG', 'Noun','name' ])
fuhao = set(['。', '?', '?', '!', '!'])
dics = csv.reader(open("../dictionary/NOC_CSA_ORDER_DICT.csv", 'r', encoding='utf8'))
for row in dics:
    if len(row) == 2:
        jieba.add_word(row[0].strip(), tag=row[1].strip())
        jieba.suggest_freq(row[0].strip())
split_num = 0
for file in os.listdir(c_root):
    if "txtoriginal.txt" in file:
        fp = open(c_root + file, 'r', encoding='utf8')
        for line in fp:
            split_num += 1
            words = pseg.cut(line)
            for key, value in words:
                if value.strip() and key.strip():
                    # split by line number: split_num % 15 in {0, 1} -> dev,
                    # {2, 3} -> test, anything else -> train (roughly 2 : 2 : 11)
                    index = str(1) if split_num % 15 < 2 else str(
                        2) if split_num % 15 > 1 and split_num % 15 < 4 else str(3)
                    if value not in biaoji:
                        value = 'O'
                        for achar in key.strip():
                            if achar and achar.strip() in fuhao:
                                # sentence-ending punctuation: add a blank line to separate sentences
                                string = achar + " " + value.strip() + "\n" + "\n"
                            elif achar.strip() and achar.strip() not in fuhao:
                                string = achar + " " + value.strip() + "\n"
                            else:
                                continue
                            if index == '1':
                                dev.write(string)
                            elif index == '2':
                                test.write(string)
                            else:
                                train.write(string)

                    elif value.strip() in biaoji:
                        begin = 0
                        for char in key.strip():
                            if begin == 0:
                                begin += 1
                                string1 = char + ' ' + 'B-' + value.strip() + '\n'
                                if index == '1':
                                    dev.write(string1)
                                elif index == '2':
                                    test.write(string1)
                                elif index == '3':
                                    train.write(string1)
                                else:
                                    pass
                            else:
                                string1 = char + ' ' + 'I-' + value.strip() + '\n'
                                if index == '1':
                                    dev.write(string1)
                                elif index == '2':
                                    test.write(string1)
                                elif index == '3':
                                    train.write(string1)
                                else:
                                    pass
                    else:
                        continue
dev.close()
train.close()
test.close()
print("字符级别数据自动打标签执行OK! 结果保存在ner_output_result_data目录!")

The resulting output looks like this:

......
: O
N B-ORG
........
中 I-ORG
心 I-ORG
- O
融 O 
.......

Summary

That concludes today's content. This article gave a brief introduction to encoding conversion and feature concatenation for NER input text, generating the train-set and test-set pkl files, and also walked through a simplified implementation of automatic labeling.