赞
踩
目录
词形还原(Lemmatization):把一个任何形式的语言词汇还原为基本形式(能表达完整语义);
词干提取(Stemming):抽取词的词干或词根形式(不一定能够表达完整语义);
二者都是词形规范化的方式,都能够达到有效归并词形的目的,二者既有联系也有区别。
# Lemmatize every whitespace-separated word of xxx.txt with NLTK's
# WordNetLemmatizer and append the per-line lists of lemmas back to the
# same file.
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

# Read everything first, then append.  The original opened an append handle
# on the same file before reading it, and never closed that handle, so the
# buffered output could be lost and the two handles could interleave.
lemmatized_lines = []
with open("xxx.txt", 'r', encoding='utf-8') as f:
    for line in f:
        # wnl.lemmatize() with no POS argument treats each word as a noun.
        lemmatized_lines.append([wnl.lemmatize(word) for word in line.split()])

# `with` guarantees the appended output is flushed and the file closed.
with open("xxx.txt", 'a', encoding='UTF-8') as ff:
    for lemmas in lemmatized_lines:
        print(lemmas, file=ff)
# Second variant of the NLTK lemmatization script.  jieba / sys / wordnet
# are imported but unused here; kept because the original snippet had them
# and later additions may rely on them.
from nltk.corpus import wordnet as wn
import jieba
import sys
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

# Collect all lemmatized lines before touching the file for writing: the
# original held an unclosed append handle on the same file while reading it.
lemmatized_lines = []
with open("xxx.txt", 'r', encoding='utf-8') as f:
    for line in f:
        # Default POS for lemmatize() is noun.
        lemmatized_lines.append([wnl.lemmatize(word) for word in line.split()])

# Bug fix: the original appended with the platform default encoding while
# reading utf-8; write utf-8 so round-tripping non-ASCII text is safe.
with open("xxx.txt", 'a', encoding='utf-8') as ff:
    for lemmas in lemmatized_lines:
        print(lemmas, file=ff)
# -*- coding: utf8 -*-
"""Lemmatize token lists with spaCy, keeping only content-word POS tags."""
import spacy

# Only the tagger is needed to supply POS tags for lemmatization, so the
# parser and NER components are disabled for speed.
nlp = spacy.load('en', disable=['parser', 'ner'])

texts = [["x"], ["xx"]]


def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Return, per input sentence, the lemmas of tokens with an allowed POS.

    texts: iterable of token lists; each list is joined into one sentence
        before being run through the spaCy pipeline.
    allowed_postags: POS tags to keep (see https://spacy.io/api/annotation).
        Default is now a tuple instead of a mutable list literal; membership
        testing is unchanged.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc
                          if token.pos_ in allowed_postags])
    return texts_out


# Do lemmatization keeping only noun, adj, verb, adverb.
data_lemmatized = lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Bug fix: writelines() on a str wrote the repr one character at a time;
# write() emits it in a single call, and `with` guarantees the file is
# closed even if writing raises.
with open('xxx.txt', 'w', encoding='utf-8') as f:
    f.write(str(data_lemmatized))
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。