当前位置:   article > 正文

Python:利用已有 NER 模型进行数据清洗合并(NER 训练文本数据如何进行清洗)

ner训练文本数据如何进行清洗
# -*- coding: utf-8 -*-
from kashgari.corpus import DataReader
import re
from tqdm import tqdm


def cut_text(text, lenth):
    """Split ``text`` into consecutive pieces of ``lenth`` characters.

    The returned list contains every full-length piece followed by the
    remainder as its last element.  The remainder is the empty string when
    ``len(text)`` is an exact multiple of ``lenth`` (including when ``text``
    is empty), so ``''.join(cut_text(text, lenth)) == text`` always holds.

    Args:
        text: The string to split.
        lenth: Size of each full piece; must be a positive integer.

    Returns:
        A list of strings: the full pieces, then the (possibly empty)
        remainder.

    Raises:
        ValueError: If ``lenth`` is not positive.
    """
    if lenth <= 0:
        raise ValueError('lenth must be a positive integer')

    # Slice instead of matching '.{n}' with a regex: '.' does not match
    # newlines, so a regex-based split drops characters and misaligns the
    # pieces as soon as the text contains a line break.
    full_count = len(text) // lenth
    pieces = [text[i * lenth:(i + 1) * lenth] for i in range(full_count)]
    pieces.append(text[full_count * lenth:])
    return pieces


def _predict_in_chunks(ner_model, tokens, chunk_size):
    """Predict tags for ``tokens``, passing at most ``chunk_size`` tokens per model call."""
    predicted = []
    # Chunk the token list itself rather than joining it into a string and
    # re-splitting by character: that keeps predictions aligned with tokens
    # even when a token is longer than one character, and never sends an
    # empty chunk to the model.
    # NOTE(review): an entity that straddles a chunk boundary is seen by the
    # model in two halves; this limitation is inherited from the original.
    for start in range(0, len(tokens), chunk_size):
        chunk = list(tokens[start:start + chunk_size])
        predicted.extend(ner_model.predict([chunk])[0])
    return predicted


def clean_data(source_file, target_file, ner_model, chunk_size=100):
    """Merge an existing NER model's predictions into a CoNLL-format data set.

    Every sentence read from ``source_file`` is run through ``ner_model``.
    Wherever the existing label is ``'O'`` and the model predicts a tag
    starting with ``'B'`` or ``'I'``, the label is replaced by the prediction;
    labels that are already set are never overwritten.  The merged data is
    then appended to ``target_file`` as ``token tag`` lines, with a blank line
    between sentences and no trailing newline.

    Finally the target file is read back and a short status line is printed:
    whether its tokens equal the source tokens, whether the sentence counts
    match, and a completion message.

    Args:
        source_file: Path of the CoNLL-format file to clean.
        target_file: Path of the output file.  It is opened in append mode,
            so pre-existing content is kept (and will make the read-back
            check report ``False``).
        ner_model: Object exposing ``predict(list_of_token_lists)`` that
            returns one tag sequence per input sentence.
        chunk_size: Maximum number of tokens passed to the model in a single
            ``predict`` call; longer sentences are predicted piecewise.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError('chunk_size must be a positive integer')

    data_x, data_y = DataReader().read_conll_format_file(source_file)

    for tokens, tags in tqdm(zip(data_x, data_y), total=len(data_x)):
        predicted = _predict_in_chunks(ner_model, tokens, chunk_size)
        # Slice to len(tags) so a prediction of unexpected length cannot
        # index past the existing labels.
        for jdx, new_tag in enumerate(predicted[:len(tags)]):
            if tags[jdx] == 'O' and new_tag.startswith(('B', 'I')):
                tags[jdx] = new_tag

    # One "token tag" line per token; sentences separated by one blank line.
    sentences = (
        '\n'.join(token + ' ' + tag for token, tag in zip(tokens, tags))
        for tokens, tags in zip(data_x, data_y)
    )
    # The context manager closes the file even if writing fails.
    with open(target_file, 'a', encoding="utf-8") as f:
        f.write('\n\n'.join(sentences))

    # Verify what was actually written: re-reading source_file here would
    # compare the source with itself and always succeed.
    data_x2, data_y2 = DataReader().read_conll_format_file(target_file)
    print(data_x == data_x2, len(data_y) == len(data_y2), '数据清洗完成')
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
# -*- coding: utf-8 -*-
import kashgari
from data_tools import clean_data
# Paths used by this example run: the saved NER model, the CoNLL file to
# clean, and the file the merged result is appended to.
MODEL_PATH = 'time_ner.h5'
SOURCE_FILE = './data/example.dev'
TARGET_FILE = 'example.dev'

# Load the saved model, then merge its predictions into the data set.
time_ner = kashgari.utils.load_model(MODEL_PATH)
clean_data(SOURCE_FILE, TARGET_FILE, time_ner)
  • 1
  • 2
  • 3
  • 4
  • 5
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/AllinToyou/article/detail/350614
推荐阅读
相关标签
  

闽ICP备14008679号