""" __author__:shuangrui Guo __description__: """ import sys import nltk import json from tqdm import tqdm #多进程的包 import multiprocessing import argparse import os import re SUFFIX_NLTK = '__nltk.json' #清洗文本 def clean_text(text): text = re.sub(r'[^\x00-\x7F]+',' ',text) text = re.sub(r"([.,!:?()])",r" \1 ",text) text = re.sub(r"\s{2,}"," ",text) text = text.replace("-"," ") return text #获取文件行数的函数 def get_line_count(inFile): lines = 0 with open(inFile,'r') as f: while f.readline(): lines+=1 return lines #跳过所有的单个词,默认是True def get_nps_from_tree(tree, words_original, attachNP=False, skip_single_word=True): nps = [] st = 0 for subtree in tree: if isinstance(subtree, nltk.tree.Tree): if subtree.label() == 'NP': np = subtree.leaves() ed = st + len(np) if not skip_single_word or len(np) > 1: nps.append({'st': st, 'ed': ed, 'text': ' '.join(words_original[st:ed])}) if attachNP: nps[-1]['np'] = np st += len(subtree.leaves()) else: st += 1 return nps def validate_nps(nps, words_original): validated_nps = [] for np in sorted(nps, key=lambda x:x['st']): st = np['st'] ed = np['ed'] token_span = words_original[st:ed] # 'A polynomial time algorithm for the Lambek calculus with brackets of bounded order' if ' '.join(token_span).strip() != np['text'].strip(): print(' '.join(token_span)) print(np) return validated_nps validated_nps.append(np) return nps def get_nps_nltk_raw(doc): # 预先定义的分块语法,具体含义不清楚 GRAMMAR = r""" NBAR: {<NN.*|JJ>*<NN.*>} # Nouns and Adjectives, terminated with Nouns(名次和形容词,并且以名词结尾) NP: {<NBAR>} {<NBAR><IN><NBAR>} # Above, connected with in/of/etc... 
""" # 定义语法解析器 _PARSER = nltk.RegexpParser(GRAMMAR) doc = clean_text(doc) #对文档使用空格切分 words_original = nltk.word_tokenize(doc) #words_original = doc.split(' ') try: parse_tree = _PARSER.parse(nltk.pos_tag(words_original)) pass except Exception as e: import ipdb; ipdb.set_trace() pass nps = get_nps_from_tree(parse_tree, words_original) return nps #读入与写出文件 def writeToJson(inFile, outFile): #分别读入文件,与写出文件 with open(inFile, 'r') as fin, open(outFile, 'w') as fout: total = get_line_count(inFile) for line in tqdm(fin, total=total): doc = line.strip('\r\n') #对每一行进行处理 if doc: nps = get_nps_nltk_raw(doc) else: nps = [] fout.write(json.dumps(nps)) fout.write('\n') if __name__ == '__main__': inFile = "./patent_abstract.txt" outFile = inFile + SUFFIX_NLTK writeToJson(inFile, outFile)
# Original article: https://blog.csdn.net/qq_28790663/article/details/115956521
# Copyright © 2003-2013 www.wpsshop.cn. All rights reserved.