分词实现 (word-segmentation implementation):
import pandas as pd
import jieba
import jieba.analyse
# Data source: tab-separated file with two columns (id, content).
df_news = pd.read_table('C:/Users/Shirley/Desktop/python/article.txt',names=['id','content'],encoding='utf-8')
# The file contains missing values; drop those rows before segmenting.
df_news = df_news.dropna()
content = df_news.content.values.tolist()

content_S = []
for line in content:
    # jieba.lcut returns a *list* of tokens for the line.
    current_segment = jieba.lcut(line)
    # BUG FIX: the original test was `current_segment != '\r\n'`, comparing a
    # list to a string — always True, so the newline-only filter never fired.
    # Compare against the single-token list instead.
    if len(current_segment) > 1 and current_segment != ['\r\n']:
        content_S.append(current_segment)

# Collect the segmented lines into a DataFrame for inspection.
df_content = pd.DataFrame({'content_S': content_S})
print(df_content.head())
加入停用词:
# Load the stop-word list: one word per line, tab-separated, no header row.
# quoting=3 (csv.QUOTE_NONE) keeps quote characters as literal text.
stopwords = pd.read_csv(
    "C:/Users/Shirley/Desktop/python/stopwords_3.txt",
    index_col=False,
    sep='\t',
    quoting=3,
    names=['stopwords'],
    encoding='utf-8',
)
# Preview the first few stop words.
stopwords.head()
def drop_stopwords(contents,stopwords):
contents_clean = []
all_words = []
for line in conten
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。