I only took one thousand rumor texts, segmented them, computed TF-IDF features, attached labels, and used them as a training set, but the results are not very good; there is a lot here that still needs optimizing. I am a beginner and still learning.
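As a rough illustration of that pipeline (separate from the real script further down), here is a minimal sketch of turning segmented Chinese texts into TF-IDF vectors with jieba and scikit-learn; the sample sentences and variable names are made up for the example and are not part of my code:

# minimal sketch: jieba segmentation + TF-IDF on a tiny made-up corpus
import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

texts = ["这是一条谣言文本", "这是一条普通新闻文本"]        # hypothetical samples
corpus = [",".join(jieba.cut(t)) for t in texts]            # comma-joined tokens, same convention as the script below
counts = CountVectorizer(token_pattern='\\b\\w+\\b').fit_transform(corpus)
tfidf = TfidfTransformer().fit_transform(counts)            # one TF-IDF row per text
labels = [[0., 1.], [1., 0.]]                               # [0,1] = rumor, [1,0] = non-rumor, matching the training script
print(tfidf.toarray().shape)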
- This is the data-processing program. The code is messy: it went through constant modification and debugging, so it ends up with many leftover comments and parts I am not satisfied with.
# -*- coding: utf-8 -*-
import jieba
import numpy
import json
import copy
import os
import re
import time
import logging
import csv
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import PCA, KernelPCA

######### configure logging for easier debugging #############
LOG_FORMAT = "%(asctime)s -%(filename)s[line:%(lineno)d]- %(name)s - %(levelname)s - %(message)s"
DATE_FORMAT = "%m-%d-%Y %H:%M:%S"
logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT, datefmt=DATE_FORMAT)
logger = logging.getLogger(__name__)

# ------------------ data processing ----------------------- #
# stopword_list = []
# rumor_corpus = []
# unrumor_corpus = []
# training_data = []
# validation_data = []
# test_data = []
# bag_of_word_count = {}

def readrumorfile(filename, bag_of_word_count, stopword_list, rumor_corpus):
    """Read rumor texts (one JSON object per line), segment them and update the word counts."""
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            text_json = json.loads(line)
            # strip digits, Latin letters and some punctuation before segmentation
            s = jieba.cut(re.sub(r"[A-Za-z0-9!%\[\],。]", "", text_json["rumorText"]))
            line_list = list(s)
            cp_line = copy.deepcopy(line_list)
            for word in line_list:
                if word in stopword_list:
                    cp_line.remove(word)  # remove stopwords
            for word in cp_line:
                if word not in bag_of_word_count:
                    bag_of_word_count[word] = 1
                else:
                    bag_of_word_count[word] += 1
            rumor_corpus.append(",".join(cp_line))
            if len(rumor_corpus) >= 1000:
                break


def readnewsfile(filename, bag_of_word_count, stopword_list, unrumor_corpus):
    """Read normal news texts (one per line), segment them and update the word counts."""
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            # skip texts that are too short to be meaningful,
            # e.g. len("①柔道冠军称父亲深夜被带走真相是这样http://t.cn/EJuoyyO") == 38
            if len(line) <= 43:
                continue
            s = jieba.cut(re.sub(r"[A-Za-z0-9!%\[\],。]", "", line.strip('\n')))
            line_list = list(s)
            cp_line = copy.deepcopy(line_list)
            for word in line_list:
                if word in stopword_list:
                    cp_line.remove(word)  # remove stopwords
            for word in cp_line:
                if word not in bag_of_word_count:
                    bag_of_word_count[word] = 1
                else:
                    bag_of_word_count[word] += 1
            unrumor_corpus.append(",".join(cp_line))
            if len(unrumor_corpus) >= 1000:
                break


# drop every word whose total count in the bag of words is below frequ
def removeWord(rumor_corpus, unrumor_corpus, bag_of_word_count, frequ):
    rumor_cor = []
    unrumor_cor = []
    for s_r, s_u in zip(rumor_corpus, unrumor_corpus):
        list_s_r = s_r.split(",")
        list_s_u = s_u.split(",")
        list_r = copy.deepcopy(list_s_r)
        list_u = copy.deepcopy(list_s_u)
        for w in list_s_r:
            if w not in bag_of_word_count:
                logger.info(w)
                continue
            if bag_of_word_count[w] < frequ:
                list_r.remove(w)
        for w in list_s_u:
            if w not in bag_of_word_count:
                logger.info(w)
                continue
            if bag_of_word_count[w] < frequ:
                list_u.remove(w)
        # append even if the filtered list became empty, to keep rumor/non-rumor rows aligned
        if list_s_r:
            rumor_cor.append(",".join(list_r))
        if list_s_u:
            unrumor_cor.append(",".join(list_u))
    return rumor_cor, unrumor_cor


def getdata(stopword_list, bag_of_word_count, rumor_corpus, unrumor_corpus):
    # remove stopwords from list_corpus
    with open("../data/stopword.txt", "r", encoding="utf-8") as fp:
        for line in fp:
            stopword_list.append(line[:-1])
    logger.info("Read stopwords and built stopword_list")

    # rumors; data layout: list_corpus = [rumorText, rumorText, rumorText, ...]
    readrumorfile("../data/rumors_v170613.json", bag_of_word_count, stopword_list, rumor_corpus)
    logger.info("Got %g rumor texts from rumors_v170613.json" % (len(rumor_corpus)))

    # non-rumors
    readnewsfile("../data/news20190407-214236.txt", bag_of_word_count, stopword_list, unrumor_corpus)
    if len(unrumor_corpus) <= 1000:
        readnewsfile("../data/news20190407-214412.txt", bag_of_word_count, stopword_list, unrumor_corpus)

    # release memory
    stopword_list.clear()
    logger.info("Got %g texts from news20190407-214236.txt | news20190407-214412.txt" % (len(unrumor_corpus)))
    logger.info("Bag-of-words size: %s" % (len(bag_of_word_count)))
    corpus = rumor_corpus + unrumor_corpus
    return corpus, bag_of_word_count, rumor_corpus, unrumor_corpus


def Sklearn_getfeature(corpus):
    # vectorize all the short texts in list_corpus and build the vocabulary
    vectoerizer = CountVectorizer(min_df=1, max_df=1.0, token_pattern='\\b\\w+\\b')
    X = vectoerizer.fit_transform(corpus)
    # compute TF-IDF
    tfidf_transformer = TfidfTransformer()
    tfidf = tfidf_transformer.fit_transform(X)
    logger.info("Built the bag of words with sklearn, TF-IDF computed")
    # logger.info(tfidf[0][0])
    # logger.info(type(tfidf.toarray()))
    # build labels for testing:
    # label = numpy.zeros((1000, 2))
    # for i in range(0, 500):
    #     label[i][0] = 1
    # for i in range(500, 1000):
    #     label[i][1] = 1
    # label = numpy.asarray(label)
    data_tfidf = tfidf.toarray()
    with open('../data/roumordataset.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(data_tfidf)

    # dimensionality reduction with PCA (841 = 29*29, so each sample can later be reshaped to 29x29)
    pca = PCA(n_components=841)
    data_pca = pca.fit_transform(data_tfidf)
    with open('../data/roumordatasetPCA.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(data_pca)

    # dimensionality reduction with kernel PCA
    kpca = KernelPCA(kernel="rbf")
    data_kpca = kpca.fit_transform(data_tfidf)
    with open('../data/roumordatasetKPCA.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(data_kpca)
    return tfidf


def gensim_getfeature(corpus):
    return


# helper used only while testing -- basically useless
def WriteFile(data, target):
    if os.path.exists(target):
        path, suffix = os.path.splitext(target)
        s = time.strftime("%Y%m%d-%H%M%S", time.localtime())
        target = path + s + suffix
    with open(target, 'w', encoding="utf-8") as fp:
        for str in data:
            fp.write(str)
            fp.write("\n")


# build the dataset, training set : validation set = 4 : 1
if __name__ == '__main__':
    stopword_list = []
    rumor_corpus = []
    unrumor_corpus = []
    training_data = []
    validation_data = []
    test_data = []
    bag_of_word_count = {}
    frequ = 2
    corpus, bag_of_word_count, rumor_corpus, unrumor_corpus = getdata(stopword_list, bag_of_word_count, rumor_corpus, unrumor_corpus)
    logger.info(len(rumor_corpus))
    logger.info(len(unrumor_corpus))
    rumor_cor, unrumor_cor = removeWord(rumor_corpus, unrumor_corpus, bag_of_word_count, frequ)
    logger.info(len(rumor_cor))
    logger.info(len(unrumor_cor))
    with open("../data/bag_word.json", "w", encoding='utf-8') as f:
        json.dump(bag_of_word_count, f, ensure_ascii=False)
    Sklearn_getfeature(rumor_cor + unrumor_cor)
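The main block above sets up training_data and validation_data lists and mentions a 4:1 train/validation split, but never actually performs it. A minimal sketch of how that split could be done on the saved PCA features, assuming the row layout produced by Sklearn_getfeature (first 1000 rows rumors, last 1000 non-rumors) and using scikit-learn's train_test_split; the variable names are illustrative only:

import numpy as np
from sklearn.model_selection import train_test_split

# load the 841-dimensional PCA features written by Sklearn_getfeature (assumed path)
data = np.loadtxt('../data/roumordatasetPCA.csv', delimiter=',')
labels = np.vstack([np.tile([0., 1.], (1000, 1)),   # [0,1] = rumor (first 1000 rows)
                    np.tile([1., 0.], (1000, 1))])  # [1,0] = non-rumor (last 1000 rows)

# 4:1 split, shuffled so both classes appear in both sets
x_train, x_val, y_train, y_val = train_test_split(
    data, labels, test_size=0.2, shuffle=True, random_state=42)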
- This is the training program; the results are poor.
""" X 841=29*29维特征的文本 Y label[1.,0.] [0.,1.] 第一层:卷积层 输入的是29*29*1的文本特征 过滤器尺寸 3*3 深度为5 不使用全零填充 步长为1 输出为29-3+1=27*27 深度为5 参数w = 3*3*1*5 b = 5 第二层:池化层 输入27*27*5的矩阵 过滤器大小 3*3 步长为 3 输出9*9*5 第三层:卷积层 输入9*9*5的矩阵 过滤器尺寸 2*2 深度为12 不使用全零填充 步长为1 参数w = 2*2*5*12 b = 12 输出9-2+1=8*8*12 第四层:池化层 输入8*8*12 过滤器大小 2*2 步长 2 输出4*4*12 第五层:全连接层 输入4*4*12 过滤器尺寸 4*4*80 不使用全零填充 步长为1 参数w = 4*4*12*80 b = 80 输出1*1*80 第六层:全连接层 输入80 w = 80*56 b = 56 输出56 输出层: 输入56 w = 56*2 b=2 输出2 """ import tensorflow as tf import numpy as np import csv import logging #########配置log日志方便打印############# LOG_FORMAT = "%(asctime)s -%(filename)s[line:%(lineno)d]- %(name)s - %(levelname)s - %(message)s" DATE_FORMAT = "%m-%d-%Y %H:%M:%S" logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT, datefmt=DATE_FORMAT) logger = logging.getLogger(__name__) num_input = 841 num_classes = 2 dropout = 0.5 learning_rate = 0.001 batch_size = 100 num_steps = 10000 display_step = 10 X = tf.placeholder(tf.float32, [None, num_input]) Y = tf.placeholder(tf.float32, [None, num_classes]) X_batch = tf.Variable(tf.constant(0.0), dtype=tf.float32) Y_batch = tf.Variable(tf.constant(0.0), dtype=tf.float32) #权重和偏向 weigths={ "w1":tf.Variable(tf.random_normal([3, 3, 1, 5])), "w2":tf.Variable(tf.random_normal([2, 2, 5, 12])), "w3":tf.Variable(tf.random_normal([4*4*12,80])), "w4":tf.Variable(tf.random_normal([80,56])), "w5":tf.Variable(tf.random_normal([56,2])) } bias = { "b1":tf.Variable(tf.random_normal([5])), "b2":tf.Variable(tf.random_normal([12])), "b3":tf.Variable(tf.random_normal([80])), "b4":tf.Variable(tf.random_normal([56])), "b5":tf.Variable(tf.random_normal([2])) } def conv2d(x, W, b, strides=1): # Conv2D wrapper, with bias and relu activation x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='VALID') x = tf.nn.bias_add(x, b) return tf.nn.relu(x) def maxpool2d(x, k=2): # MaxPool2D wrapper return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],padding='VALID') #定义操作 def conv_net(x, weights, biases, dropout): x = tf.reshape(x, shape=[-1, 29, 29, 1]) conv1 = conv2d(x,weights['w1'],biases['b1']) conv1 = maxpool2d(conv1,k=3) conv2 = conv2d(conv1, weights['w2'], biases['b2']) conv2 = maxpool2d(conv2, k=2) fc3 = tf.reshape(conv2,[-1,weights['w3'].get_shape().as_list()[0]]) fc3 = tf.add(tf.matmul(fc3, weights['w3']), biases['b3']) fc3 = tf.nn.relu(fc3) fc3 = tf.nn.dropout(fc3, dropout) fc4 = tf.add(tf.matmul(fc3, weights['w4']), biases['b4']) fc4 = tf.nn.relu(fc4) fc4 = tf.nn.dropout(fc4, dropout) fc5 = tf.add(tf.matmul(fc4, weights['w5']), biases['b5']) # fc5 = tf.nn.relu(fc5) return fc5 # Construct model logits = conv_net(X, weigths, bias, dropout) prediction = tf.nn.softmax(logits) loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=Y)) optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) train_op = optimizer.minimize(loss_op) correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1)) accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32)) init = tf.global_variables_initializer() #初始化label [0,1] 代表谣言 [1,0]非谣言 r = np.zeros((1000, 1),dtype=float) c = np.ones((1000, 1),dtype=float) a = np.hstack((r, c)) b = np.hstack((c, r)) lable = np.vstack((a, b)) count =0 with tf.Session() as sess: logger.info("-----------") s = [] sess.run(init) with open("D:/WorkSpace/pyWorkspace/deepLearning/GradientDescent/data/roumordatasetPCA.csv", "r") as f: csv_reader = csv.reader(f) for line in csv_reader: s.append(line) f.close() line = np.array([ list(map(float, x))for x in s]) data_x = 
np.reshape(line, (-1, 841)) for step in range(1,num_steps + 1): data_y = np.reshape(lable,(-1,2)) sess.run(train_op, feed_dict={X: data_x, Y: data_y}) if step % 100 == 0 or step == 1: loss, acc = sess.run([loss_op, accuracy], feed_dict={X: data_x, Y: data_y}) print("Step " + str(step) + ", Minibatch Loss= " + \ "{:.4f}".format(loss) + ", Training Accuracy= " + \ "{:.3f}".format(acc))
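Note that the script defines batch_size = 100 but then feeds the whole 2000-sample matrix at every step, so the reported numbers are full-training-set accuracy rather than held-out performance. A minimal sketch of how mini-batch feeding could replace the loop above, inside the same session (illustrative only, reusing data_x and data_y from the script):

    num_samples = data_x.shape[0]
    for step in range(1, num_steps + 1):
        # pick a random mini-batch of batch_size rows instead of the full matrix
        idx = np.random.choice(num_samples, batch_size, replace=False)
        sess.run(train_op, feed_dict={X: data_x[idx], Y: data_y[idx]})
        if step % 100 == 0 or step == 1:
            loss, acc = sess.run([loss_op, accuracy],
                                 feed_dict={X: data_x[idx], Y: data_y[idx]})
            print("Step %d, Minibatch Loss= %.4f, Training Accuracy= %.3f" % (step, loss, acc))

With shuffled mini-batches (and ideally a separate validation split, as sketched earlier), the loss and accuracy readings would be less tied to memorizing the one fixed training matrix.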
- Finally, here are the results. I actually wanted to build three datasets (training, validation, and test), but running on all the data already gave poor results from the start, so I lost the motivation to continue. I will improve it later and figure out where the problem is; the dataset may simply be too small, which is probably the root cause.
Step 1, Minibatch Loss= 1520.1038, Training Accuracy= 0.491
Step 100, Minibatch Loss= 262.6792, Training Accuracy= 0.549
Step 200, Minibatch Loss= 23.2395, Training Accuracy= 0.686
Step 300, Minibatch Loss= 2.4426, Training Accuracy= 0.615
Step 400, Minibatch Loss= 2.0057, Training Accuracy= 0.592
Step 500, Minibatch Loss= 1.4990, Training Accuracy= 0.586
Step 600, Minibatch Loss= 1.2961, Training Accuracy= 0.589
Step 700, Minibatch Loss= 1.1176, Training Accuracy= 0.576
Step 800, Minibatch Loss= 1.0612, Training Accuracy= 0.584
Step 900, Minibatch Loss= 0.8949, Training Accuracy= 0.595
Step 1000, Minibatch Loss= 0.8426, Training Accuracy= 0.595
Step 1100, Minibatch Loss= 0.8437, Training Accuracy= 0.592
Step 1200, Minibatch Loss= 0.7606, Training Accuracy= 0.576
Step 1300, Minibatch Loss= 0.7009, Training Accuracy= 0.609
Step 1400, Minibatch Loss= 0.6559, Training Accuracy= 0.609
Step 1500, Minibatch Loss= 0.6449, Training Accuracy= 0.598
Step 1600, Minibatch Loss= 0.6219, Training Accuracy= 0.603
Step 1700, Minibatch Loss= 0.5932, Training Accuracy= 0.608
Step 1800, Minibatch Loss= 0.5810, Training Accuracy= 0.623
Step 1900, Minibatch Loss= 0.5983, Training Accuracy= 0.608
Step 2000, Minibatch Loss= 0.5709, Training Accuracy= 0.607
Step 2100, Minibatch Loss= 0.5430, Training Accuracy= 0.626
Step 2200, Minibatch Loss= 0.5401, Training Accuracy= 0.642
Step 2300, Minibatch Loss= 0.5308, Training Accuracy= 0.630
Step 2400, Minibatch Loss= 0.5409, Training Accuracy= 0.627
Step 2500, Minibatch Loss= 0.5284, Training Accuracy= 0.638
Step 2600, Minibatch Loss= 0.5743, Training Accuracy= 0.627
Step 2700, Minibatch Loss= 0.5084, Training Accuracy= 0.649
Step 2800, Minibatch Loss= 0.5221, Training Accuracy= 0.643
Step 2900, Minibatch Loss= 0.5110, Training Accuracy= 0.651
Step 3000, Minibatch Loss= 0.5214, Training Accuracy= 0.663
Step 3100, Minibatch Loss= 0.4978, Training Accuracy= 0.663
Step 3200, Minibatch Loss= 0.5084, Training Accuracy= 0.647
Step 3300, Minibatch Loss= 0.4945, Training Accuracy= 0.677
Step 3400, Minibatch Loss= 0.4991, Training Accuracy= 0.660
Step 3500, Minibatch Loss= 0.4948, Training Accuracy= 0.667
Step 3600, Minibatch Loss= 0.4915, Training Accuracy= 0.660
Step 3700, Minibatch Loss= 0.4986, Training Accuracy= 0.670
Step 3800, Minibatch Loss= 0.4813, Training Accuracy= 0.674
Step 3900, Minibatch Loss= 0.5162, Training Accuracy= 0.682
Step 4000, Minibatch Loss= 0.5086, Training Accuracy= 0.680
Step 4100, Minibatch Loss= 0.4827, Training Accuracy= 0.677
Step 4200, Minibatch Loss= 0.4798, Training Accuracy= 0.686
Step 4300, Minibatch Loss= 0.4738, Training Accuracy= 0.682
Step 4400, Minibatch Loss= 0.4889, Training Accuracy= 0.679
Step 4500, Minibatch Loss= 0.4631, Training Accuracy= 0.690
Step 4600, Minibatch Loss= 0.4766, Training Accuracy= 0.681
Step 4700, Minibatch Loss= 0.4778, Training Accuracy= 0.686
Step 4800, Minibatch Loss= 0.4525, Training Accuracy= 0.704
Step 4900, Minibatch Loss= 0.4552, Training Accuracy= 0.692
Step 5000, Minibatch Loss= 0.4411, Training Accuracy= 0.701
Step 5100, Minibatch Loss= 0.4653, Training Accuracy= 0.694
Step 5200, Minibatch Loss= 0.4400, Training Accuracy= 0.709
Step 5300, Minibatch Loss= 0.4426, Training Accuracy= 0.698
Step 5400, Minibatch Loss= 0.4385, Training Accuracy= 0.705
Step 5500, Minibatch Loss= 0.4365, Training Accuracy= 0.705
Step 5600, Minibatch Loss= 0.4332, Training Accuracy= 0.711
Step 5700, Minibatch Loss= 0.4404, Training Accuracy= 0.708
Step 5800, Minibatch Loss= 0.4188, Training Accuracy= 0.715
Step 5900, Minibatch Loss= 0.4118, Training Accuracy= 0.722
Step 6000, Minibatch Loss= 0.4032, Training Accuracy= 0.713
Step 6100, Minibatch Loss= 0.4179, Training Accuracy= 0.711
Step 6200, Minibatch Loss= 0.4081, Training Accuracy= 0.714
Step 6300, Minibatch Loss= 0.4038, Training Accuracy= 0.713
Step 6400, Minibatch Loss= 0.4081, Training Accuracy= 0.719
Step 6500, Minibatch Loss= 0.3908, Training Accuracy= 0.742
Step 6600, Minibatch Loss= 0.3901, Training Accuracy= 0.735
Step 6700, Minibatch Loss= 0.3915, Training Accuracy= 0.729
Step 6800, Minibatch Loss= 0.3782, Training Accuracy= 0.721
Step 6900, Minibatch Loss= 0.3917, Training Accuracy= 0.712
Step 7000, Minibatch Loss= 0.3819, Training Accuracy= 0.734
Step 7100, Minibatch Loss= 0.3765, Training Accuracy= 0.738
Step 7200, Minibatch Loss= 0.3544, Training Accuracy= 0.749
Step 7300, Minibatch Loss= 0.3634, Training Accuracy= 0.748
Step 7400, Minibatch Loss= 0.3551, Training Accuracy= 0.758
Step 7500, Minibatch Loss= 0.3613, Training Accuracy= 0.746
Step 7600, Minibatch Loss= 0.3574, Training Accuracy= 0.753
Step 7700, Minibatch Loss= 0.3532, Training Accuracy= 0.758
Step 7800, Minibatch Loss= 0.3456, Training Accuracy= 0.762
Step 7900, Minibatch Loss= 0.3695, Training Accuracy= 0.747
Step 8000, Minibatch Loss= 0.3646, Training Accuracy= 0.768
Step 8100, Minibatch Loss= 0.3573, Training Accuracy= 0.756
Step 8200, Minibatch Loss= 0.3461, Training Accuracy= 0.760
Step 8300, Minibatch Loss= 0.3557, Training Accuracy= 0.759
Step 8400, Minibatch Loss= 0.3514, Training Accuracy= 0.756
Step 8500, Minibatch Loss= 0.3472, Training Accuracy= 0.768
Step 8600, Minibatch Loss= 0.3538, Training Accuracy= 0.757
Step 8700, Minibatch Loss= 0.3424, Training Accuracy= 0.763
Step 8800, Minibatch Loss= 0.3516, Training Accuracy= 0.754
Step 8900, Minibatch Loss= 0.3555, Training Accuracy= 0.762
Step 9000, Minibatch Loss= 0.3448, Training Accuracy= 0.767
Step 9100, Minibatch Loss= 0.3467, Training Accuracy= 0.761
Step 9200, Minibatch Loss= 0.3319, Training Accuracy= 0.777
Step 9300, Minibatch Loss= 0.3444, Training Accuracy= 0.765
Step 9400, Minibatch Loss= 0.3430, Training Accuracy= 0.762
Step 9500, Minibatch Loss= 0.3375, Training Accuracy= 0.766
Step 9600, Minibatch Loss= 0.3355, Training Accuracy= 0.768
Step 9700, Minibatch Loss= 0.3285, Training Accuracy= 0.780
Step 9800, Minibatch Loss= 0.3374, Training Accuracy= 0.772
Step 9900, Minibatch Loss= 0.3304, Training Accuracy= 0.781
Step 10000, Minibatch Loss= 0.3401, Training Accuracy= 0.768