I. Experiment Objectives
1. Classify the comments in the file into two categories, positive and negative, where negative comments include answers that dodge the question, omit an answer, or refuse to answer.
(In this post I label positive comments as 0 and negative comments as 1.)
II. Approach
1. Preprocess the raw data with the jieba library (word segmentation, stop-word removal, etc.), and pick out the rows whose comment is empty (nan); those can be assigned the prediction 1 (negative comment) directly.
2. Split the 1,500 labelled rows into a training set and a test set, evaluate the candidate models on them, and keep the method with the highest accuracy.
(I ended up choosing the naive Bayes method.)
3. Run the chosen method on 300,000 rows to check that the approach is feasible.
4. Process the even larger 3,000,000-row dataset.
(The core of this pipeline is sketched right after this list.)
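A minimal sketch of that core, with made-up toy texts and labels just to show the shape of the pipeline (the real data, columns, and parameters appear in section IV):
- import jieba
- from sklearn.feature_extraction.text import CountVectorizer
- from sklearn.naive_bayes import MultinomialNB
-
- # toy data only; the real texts come from the "回答" column
- train_texts  = ["感谢您对公司的关注", "请参阅公司定期报告"]
- train_labels = [0, 1]
- new_texts    = ["谢谢您的关注"]
-
- seg = lambda s: " ".join(jieba.cut(s))                 # jieba word segmentation
- cv  = CountVectorizer()                                # bag-of-words counts
- x   = cv.fit_transform([seg(t) for t in train_texts])
- clf = MultinomialNB().fit(x, train_labels)             # naive Bayes classifier
- print(clf.predict(cv.transform([seg(t) for t in new_texts])))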
III. Prerequisites
1. Working with csv files
(1) Creating and opening a file
- import csv
-
- with open(filepath, mode, encoding="utf8", newline="") as f:
-     rows = csv.reader(f)   # rows is an iterator (a big list made of per-row lists)
-     for row in rows:
-         print(row)
A. mode: file open mode (a small demo follows this list)
r: read-only; raises an error if the file does not exist; this is the default mode (file pointer at the beginning).
r+: read and write; raises an error if the file does not exist (file pointer at the beginning).
w: write; creates the file if it does not exist; the existing content is cleared when the file is opened, and as long as the file stays open you can write to it repeatedly (the content is only cleared at open time).
w+: write and read; creates the file if it does not exist; like w, the content is cleared at open time, so the pointer starts at the beginning of the now-empty file.
a: append; creates the file if it does not exist; always writes at the end; not readable.
a+: append and read; right after opening, the file pointer is at the end of the file.
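A quick demo of the difference between w and a (demo.txt is just a throwaway file name):
- # "w" clears the file at open time, "a" always appends at the end
- with open("demo.txt", "w", encoding="utf8") as f:
-     f.write("first line\n")      # file now contains only this line
- with open("demo.txt", "a", encoding="utf8") as f:
-     f.write("second line\n")     # appended after the existing content
- with open("demo.txt", "w", encoding="utf8") as f:
-     f.write("third line\n")      # the two earlier lines were cleared on open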
B. utf-8 vs. utf-8-sig
"utf-8" encodes in byte units; the byte order is the same on every system, so there is no byte-order problem and no BOM is needed. As a result, when a file that does carry a BOM is read with plain "utf-8", the BOM is treated as ordinary file content, and you end up with a stray BOM character glued to the start of the first field.
In "utf-8-sig", sig is short for signature, i.e. "utf-8 with a signature". Reading a BOM-carrying utf-8 file with "utf-8-sig" handles the BOM separately and keeps it out of the text content, which is what we want.
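A minimal sketch of the difference (bom.csv is a made-up file name):
- # write a BOM-prefixed file, then read it back with both codecs
- with open("bom.csv", "w", encoding="utf-8-sig") as f:
-     f.write("code,回答\n")
- with open("bom.csv", "r", encoding="utf-8") as f:
-     print(repr(f.readline()))    # '\ufeffcode,回答\n' -- the BOM leaks into the data
- with open("bom.csv", "r", encoding="utf-8-sig") as f:
-     print(repr(f.readline()))    # 'code,回答\n' -- the BOM is stripped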
(2) Writing a csv file
- headers = []   # column names
- datas = []     # list of rows, each row itself a list
-
- with open(filepath, "w", encoding="utf8", newline="") as f:
-     writer = csv.writer(f)
-     writer.writerow(headers)    # write the header row
-     writer.writerows(datas)     # write all data rows at once
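For example, with made-up values (demo.csv is a throwaway file):
- headers = ["code", "回答"]
- datas = [["000001", "感谢您的关注"], ["000002", ""]]
-
- with open("demo.csv", "w", encoding="utf-8-sig", newline="") as f:
-     writer = csv.writer(f)
-     writer.writerow(headers)
-     writer.writerows(datas)
-
- with open("demo.csv", "r", encoding="utf-8-sig", newline="") as f:
-     for row in csv.reader(f):
-         print(row)    # ['code', '回答'] / ['000001', '感谢您的关注'] / ['000002', '']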
2. Viewing and editing large files
I recommend EmEditor. Excel can open at most 1,048,576 rows (about 1.04 million), and anything larger will simply crash it, so for viewing or editing bigger csv files EmEditor is the better choice.
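If you only need to scan or filter a large csv from Python rather than open it in an editor, pandas can also stream it in chunks instead of loading everything at once; a minimal sketch (the file name and chunk size are just examples):
- import pandas as pd
-
- total = 0
- # read 100,000 rows at a time instead of the whole file
- for chunk in pd.read_csv("data1.csv", chunksize=100_000):
-     total += len(chunk)
- print(f"rows: {total}")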
3. Using jieba, sklearn, and other libraries
These are explained in detail alongside the code.
IV. Experiment Code
- import jieba
- import pandas as pd
- import numpy as np
- from sklearn.model_selection import train_test_split
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.feature_extraction.text import CountVectorizer
- from sklearn.naive_bayes import MultinomialNB
- from sklearn.linear_model import LogisticRegression
- from pandas import DataFrame
- import openpyxl as op
-
- # training set
- new_reply = pd.DataFrame(None,columns=["reply","answer"])
- reply = pd.read_excel("sum.xlsx").astype(str)
-
- print("training set loaded!")
- # new data to be labelled
- reply1 = pd.read_csv("result297.csv").astype(str)
- new_reply1 = pd.DataFrame(None,columns=["reply","answer"])
- print("dataset loaded")
- # load the stop word list
-
- with open("stopwords.txt","r",encoding = "utf-8") as f:
- stops = f.readlines()
- stopwords = [x.strip() for x in stops]
-
-
- # segment the text with jieba
-
- new_reply["reply"] = reply.回答.apply(lambda s:" ".join(jieba.cut(s)))
- new_reply["answer"] = reply["answer(人工阅读)"]
- new_reply["reply"] = new_reply.reply.apply(lambda s:s if str(s)!="nan" else np.nan)
- df = new_reply["reply"]
- df=df.to_frame(name="reply")
- df_isnull_remark = df[df["reply"].isnull()]
- new_reply.dropna(subset=["reply"],inplace=True)
- cut_list = new_reply["reply"].apply(lambda s:s.split(" "))
- result = cut_list.apply(lambda s:[x for x in s if x not in stopwords]) # result is a Series
-
-
- new_reply1["reply"] = reply1.回答.apply(lambda s:" ".join(jieba.cut(s)))
-
- # new_reply1["answer"] = reply1["answer(人工)"]
- new_reply1["reply"] = new_reply1.reply.apply(lambda s:s if str(s)!="nan" else np.nan)
- cf = new_reply1["reply"]
- cf=cf.to_frame(name="reply")
- cf_isnull_remark = cf[cf["reply"].isnull()]
- new_reply1.dropna(subset=["reply"],inplace=True)
- cut_list1 = new_reply1["reply"].apply(lambda s:s.split(" "))
- result1 = cut_list1.apply(lambda s:[x for x in s if x not in stopwords]) # result1 is a Series
-
-
- # x_train,x_test,y_train,y_test = train_test_split(result,new_reply["answer"],test_size = 0.25,random_state = 3)
-
- x_train = result
- y_train = new_reply["answer"]
-
- # test_index = x_test.index
- test_index = result1.index
-
- x_train = x_train.reset_index(drop=True)
- # print(type(x_test)) # x_test is a Series
-
-
- words=[]
- for i in range(len(x_train)):
- words.append(' '.join(x_train[i]))
-
- result1 = result1.reset_index(drop=True)
-
- test_words=[]
-
- for i in range(len(result1)):
- test_words.append(' '.join(result1[i]))
-
-
- # print(test_words)
-
- # vec = CountVectorizer(analyzer='word',max_features=4000,lowercase=False)
- # vec.fit(words)
- # classifer = MultinomialNB()
- # classifer.fit(vec.transform(words),y_train)
- # score = classifer.score(vec.transform(test_words),y_test)
- # # a=classifer.predict_proba(vec.transform(test_words))
- # print(score)
-
-
- cv = CountVectorizer(ngram_range=(1,2))
- cv.fit(words)
- classifer = MultinomialNB()
- classifer.fit(cv.transform(words),y_train)
- # score = classifer.score(cv.transform(test_words),test)
- a=classifer.predict_proba(cv.transform(test_words))
- # print(score)
-
-
- # tv = TfidfVectorizer(analyzer='word',max_features = 4000,ngram_range=(1,2),lowercase=False)
- # tv.fit(words)
- # classifer = MultinomialNB()
- # classifer.fit(tv.transform(words),y_train)
- # score = classifer.score(tv.transform(test_words),y_test)
- # a=classifer.predict_proba(tv.transform(test_words))
- # print(score)
-
- # b=[]
- # c=[]
- # for nums in a:
-
- # for num in nums:
- # num = int(num +0.5)
- # b.append(num)
- # if(b[0] == 1):
- # c.append(1)
- # if(b[1] == 1):
- # c.append(0)
- # b=[]
-
-
- # print(cf_isnull_remark.index)
- # tableAll = op.load_workbook('201401.xlsx')
- # table1 = tableAll['Sheet1']
-
- # for i in range(len(c)):
- # table1.cell(test_index[i]+2, 12, c[i])
- # for i in range(len(df_isnull_remark.index)):
- # table1.cell(df_isnull_remark.index[i]+2, 12, 0)
-
- # tableAll.save('201401.xlsx')
-
- # judge=True
- # p=1
- # with open("result297.csv","r",encoding='utf-8',newline='') as f:
- # rows = csv.reader(f)
- # with open("help297.csv", 'w',encoding='utf-8',newline='') as file:
- # writer = csv.writer(file)
- # for row in rows:
- # if(p==1):
- # row.append("result")
- # p+=1
- # else:
- # for o in range(len(c)):
- # if(p==test_index[o]+2):
- # row.append(c[o])
- # break
- # for u in range(len(cf_isnull_remark.index)):
-
- # if(p==cf_isnull_remark.index[u]+2):
- # row.append(1)
- # break
- # p+=1
- # writer.writerow(row)
-
- print("ok")
V. Code Walkthrough (Selected Parts)
1. jieba word segmentation
- # segment the text with jieba
-
- new_reply["reply"] = reply.回答.apply(lambda s:" ".join(jieba.cut(s)))
- new_reply["answer"] = reply["answer(人工阅读)"]
- cut_list = new_reply["reply"].apply(lambda s:s.split(" "))
- # new_reply["reply"] is a Series of space-joined tokens; splitting on " " turns each entry of cut_list into a list of tokens
jieba's segmentation works as shown below: it automatically splits the input string into word tokens.
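For instance (the sentence is made up, and the exact cut can vary slightly with jieba's dictionary and version):
- import jieba
-
- print(" ".join(jieba.cut("您好,感谢您对公司的关注")))
- # possible output: 您好 , 感谢 您 对 公司 的 关注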
2. Removing stop words
Stop words are words in the comments that contribute little to the classification; standard stop word lists are easy to find online.
Removal works by walking through each comment's token list and keeping only the tokens that do not appear in the stop word list, until every stop word has been filtered out.
- with open("stopwords.txt","r",encoding = "utf-8") as f:
- stops = f.readlines()
- stopwords = [x.strip() for x in stops]
-
- # strip() removes the meaningless trailing newlines, spaces, and so on
-
- cut_list1 = new_reply1["reply"].apply(lambda s:s.split(" "))
- result1 = cut_list1.apply(lambda s:[x for x in s if x not in stopwords])
3. Splitting the training and test sets
- x_train,x_test,y_train,y_test = train_test_split(result,new_reply["answer"],test_size = 0.25,random_state = 3)
-
- # 75% of the data is used for training and 25% for testing
4. Training with machine-learning methods
- # vec = CountVectorizer(analyzer='word',max_features=4000,lowercase=False)
- # vec.fit(words)
- # classifer = MultinomialNB()
- # classifer.fit(vec.transform(words),y_train)
- # score = classifer.score(vec.transform(test_words),y_test)
- # # a=classifer.predict_proba(vec.transform(test_words))
- # print(score)
-
-
- cv = CountVectorizer(ngram_range=(1,2))
- cv.fit(words)
- classifer = MultinomialNB()
- classifer.fit(cv.transform(words),y_train)
- # score = classifer.score(cv.transform(test_words),test)
- a=classifer.predict_proba(cv.transform(test_words))
- # print(score)
-
-
- # tv = TfidfVectorizer(analyzer='word',max_features = 4000,ngram_range=(1,2),lowercase=False)
- # tv.fit(words)
- # classifer = MultinomialNB()
- # classifer.fit(tv.transform(words),y_train)
- # score = classifer.score(tv.transform(test_words),y_test)
- # a=classifer.predict_proba(tv.transform(test_words))
- # print(score)
cv.fit is fed the token lists from the training split and builds the vocabulary used to count term frequencies.
classifer.predict_proba returns the predicted probability of each class for every comment, which the code then rounds into 0/1 labels; measured on the held-out test split (via classifer.score in the commented-out experiments above), the naive Bayes classifier reaches an accuracy of about 91%, which is quite satisfactory.
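As a side note, the manual rounding loop over a can be replaced with argmax, or with classifer.predict directly; a minimal sketch, with the caveat that the column order of predict_proba follows classifer.classes_, so check it before mapping columns to 0/1 by hand:
- import numpy as np
-
- # one row of class probabilities per comment; argmax picks the likelier class
- pred_labels = classifer.classes_[np.argmax(a, axis=1)]
- # equivalent and simpler:
- pred_labels = classifer.predict(cv.transform(test_words))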
5. Handling the large csv file in practice
When processing the three million rows, reading everything into Python at once made my machine grind to a halt, so I split the data into small csv files of 10,000 rows each, processed the small files one by one, and finally merged all the result files back together.
A. Split into 300 small files =========>>
- import csv
- import os
-
- # split the source csv into 10,000-row pieces
- example_path = 'data1.csv' # path of the file to split
- example_result_dir = 'chaifen' # directory for the split files
-
- with open(example_path, 'r', newline='',encoding = 'utf-8') as example_file:
- example = csv.reader(example_file)
- i = j = 1
- for row in example:
- # print(row)
- # print(f'i = {i}, j = {j}')
- # every 10,000 rows, bump j so that a new file name is used
- if i % 10000 == 0:
-
- print(f'file {j} finished')
- j += 1
-
- example_result_path = example_result_dir + '\\result' + str(j) + '.csv'
- # print(example_result_path)
-
- # if the chunk file does not exist yet, create it and write the header row first
- if not os.path.exists(example_result_path):
- with open(example_result_path, 'w', newline='',encoding = 'utf-8-sig') as file:
- csvwriter = csv.writer(file)
- csvwriter.writerow(['code', '提问', '回答', '回答时间', 'Question', 'NegAnswer_Reg', '查找方式', 'year','DueOccup', 'Gender', 'SOEearly', 'Exchange', 'NegAnswer_Lasso','Length', 'Salary', 'TotleQuestion', 'Analyst', 'NegTone', 'AccTerm','Readability', 'ShareHold', 'InstOwn', 'SOE', '行业名称', '行业代码','industry', 'quarter'])
- csvwriter.writerow(row)
- i += 1
-
-
- # if it already exists, just append to it
- else:
- with open(example_result_path, 'a', newline='',encoding = 'utf-8') as file:
- csvwriter = csv.writer(file)
- csvwriter.writerow(row)
- i += 1
-
-
B. Process each piece =========>>
- import jieba
- import pandas as pd
- import numpy as np
- from sklearn.model_selection import train_test_split
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.feature_extraction.text import CountVectorizer
- from sklearn.naive_bayes import MultinomialNB
- from sklearn.linear_model import LogisticRegression
- from pandas import DataFrame
- import openpyxl as op
- import threading
- from concurrent.futures import ThreadPoolExecutor
- import time
- import csv
-
- # training set
- new_reply = pd.DataFrame(None,columns=["reply","answer"])
- reply = pd.read_excel("sum.xlsx").astype(str)
-
- print("training set loaded!")
- # print(reply.head())
-
-
- with open("stopwords.txt","r",encoding = "utf-8") as f:
- stops = f.readlines()
- stopwords = [x.strip() for x in stops]
-
- # segment the text with jieba
-
- new_reply["reply"] = reply.回答.apply(lambda s:" ".join(jieba.cut(s)))
- new_reply["answer"] = reply["answer(人工阅读)"]
- new_reply["reply"] = new_reply.reply.apply(lambda s:s if str(s)!="nan" else np.nan)
- df = new_reply["reply"]
- df=df.to_frame(name="reply")
- df_isnull_remark = df[df["reply"].isnull()]
- new_reply.dropna(subset=["reply"],inplace=True)
- cut_list = new_reply["reply"].apply(lambda s:s.split(" "))
- result = cut_list.apply(lambda s:[x for x in s if x not in stopwords]) # result is a Series
- x_train = result
- y_train = new_reply["answer"]
-
- # print(new_reply.head())
-
- time1 = time.time()
-
-
- x_train = x_train.reset_index(drop=True)
- words=[]
- for i in range(len(x_train)):
- words.append(' '.join(x_train[i]))
-
-
- cv = CountVectorizer(ngram_range=(1,2))
- cv.fit(words)
- classifer = MultinomialNB()
- classifer.fit(cv.transform(words),y_train)
- i=0
- # print(words)
-
- # new data, processed piece by piece
- for i in range(296):
- t = time.time()
- path = './chaifen/result' + str(i+1) +'.csv'
- reply1 = pd.read_csv(path).astype(str)
- new_reply1 = pd.DataFrame(None,columns=["reply","answer"])
- print(f"第{i+1}个数据集读取成功")
- #stopwords的引用
-
- new_reply1["reply"] = reply1.回答.apply(lambda s:" ".join(jieba.cut(s)))
- new_reply1["reply"] = new_reply1.reply.apply(lambda s:s if str(s)!="nan" else np.nan)
- cf = new_reply1["reply"]
- cf=cf.to_frame(name="reply")
- cf_isnull_remark = cf[cf["reply"].isnull()]
- new_reply1.dropna(subset=["reply"],inplace=True)
- cut_list1 = new_reply1["reply"].apply(lambda s:s.split(" "))
- result1 = cut_list1.apply(lambda s:[x for x in s if x not in stopwords]) # result1 is a Series
- test_index = result1.index
-
- # print(result1)
-
-
- result1 = result1.reset_index(drop=True)
-
- test_words=[]
- for j in range(len(result1)):
- test_words.append(' '.join(result1[j]))
- # print(test_words[0])
-
-
-
- a=classifer.predict_proba(cv.transform(test_words))
- # print(ad)
- b=[]
- c=[]
- for nums in a:
- for num in nums:
- num = int(num +0.5)
- b.append(num)
- if(b[0] == 1):
- c.append(1)
- if(b[1] == 1):
- c.append(0)
- b=[]
- print(len(c))
- p=1
- with open(path,"r",encoding='utf-8',newline='') as f:
- rows = csv.reader(f)
- with open(f"./chaifen/help{i+1}.csv", 'w',encoding='utf-8',newline='') as file:
- writer = csv.writer(file)
- for row in rows:
- if(p==1):
- row.append("result")
-
- else:
- for o in range(len(c)):
- if(p==test_index[o]+2):
- row.append(c[o])
- break
-
- for u in range(len(cf_isnull_remark.index)):
- if(p==cf_isnull_remark.index[u]+2):
- row.append(1)
- break
-
- writer.writerow(row)
- p+=1
-
- print(f"第{i+1}个数据集处理成功,用时{time.time() - t}")
-
- print(f"总计用时{time.time() - time1}")
- print("all is over!")
It took me a whole night, but the final results finally came out.
(Multithreading might speed this up; something to learn next — a rough sketch follows.)
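A rough, untested sketch of what that could look like. Since jieba segmentation and the classifier are CPU-bound, a process pool is likely to help more than threads; process_chunk is a hypothetical wrapper around the body of the per-chunk loop above, and the classifier would have to be built inside each worker (or be picklable) for this to work:
- from concurrent.futures import ProcessPoolExecutor
-
- def process_chunk(i):
-     # hypothetical: read ./chaifen/result{i+1}.csv, segment, predict,
-     # and write ./chaifen/help{i+1}.csv, exactly as in the loop above
-     ...
-
- if __name__ == "__main__":
-     with ProcessPoolExecutor(max_workers=4) as pool:
-         list(pool.map(process_chunk, range(296)))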
C. Merge =========>>
- import csv
- import time
-
- total_start = time.time()
- for i in range(297):
-     time1 = time.time()
-
-     with open(f"./chaifen/help{i+1}.csv","r",newline='',encoding = "utf8") as af:
-         rows = csv.reader(af)
-         with open("./chaifen/a.csv","a+",encoding = "utf8",newline='') as bf:
-             writer = csv.writer(bf)
-             for p, row in enumerate(rows):
-                 # keep the header row only from the first file
-                 if i > 0 and p == 0:
-                     continue
-                 writer.writerow(row)
-
-     print(f"file {i+1} merged, took {time.time()-time1}")
- print(f"all is over, total time {time.time()-total_start}")
All done — time to celebrate!