This article builds on Word2vec, combining it with TF-IDF to classify Chinese text. See this link for the data format.
The theory behind TF-IDF can be looked up on your own.
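In short, a term's weight is its in-document frequency scaled by how rare it is across the corpus, roughly tf(t, d) × log(N / df(t)). As a minimal sketch, the scikit-learn classes imported in the code below can compute these weights directly; the two-document toy corpus here is made up for illustration, and note that CountVectorizer's default tokenizer keeps only space-separated tokens of two or more characters.

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Toy corpus of space-separated tokens (illustrative only).
corpus = ["机器 学习 文本 分类", "深度 学习 文本 表示"]

counts = CountVectorizer().fit_transform(corpus)   # raw term counts per document
tfidf = TfidfTransformer().fit_transform(counts)   # smoothed tf-idf, L2-normalized rows
print(tfidf.toarray())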
The concrete implementation is as follows:
from gensim.models import Word2Vec
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import jieba
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.models import Model, load_model, Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding, Bidirectional, SpatialDropout1D
import numpy as np
from sklearn.model_selection import train_test_split
from Attention_LSTM.att import Attention
import matplotlib.pyplot as plt
import jieba.analyse
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


def clear_character(sentence):
    """Strip Latin letters, digits, punctuation, and whitespace, keeping Chinese characters."""
    pattern1 = '[a-zA-Z0-9]'
    # Raw strings so \s and \u4e00 reach the regex engine intact.
    pattern2 = re.compile(r'[^\s0-9\u4e00-\u9fa5]+')
    # 【】 moved inside the character class; in the original they sat outside it,
    # so the pattern almost never matched.
    pattern3 = r'[’!"#$%&\'()*+\-,./:;<=>?@\[\\\]^_`{|}~【】]+'
    line1 = re.sub(pattern1, '', sentence)
    line2 = re.sub(pattern2, '', line1)
    line3 = re.sub(pattern3, '', line2)
    new_sentence = ''.join(line3.split())  # drop any remaining whitespace
    return new_sentence


def content_split(segment):
    """Tokenize with jieba and join the tokens with single spaces."""
    seg = " ".join(jieba.cut(segment))
    return seg


def rm_stop_word(wordlist, stop):
    """Remove stop words from a token list."""
    filtered_words = [word for word in wordlist if word not in stop]
    return filtered_words


def train_word2vec(x_data):
    """Train a skip-gram Word2Vec model on the tokenized corpus."""
    # The source snippet is cut off at `workers=`; the worker count and the
    # return statement below are assumptions. Note: gensim >= 4.0 renamed
    # the `size` parameter to `vector_size`.
    model = Word2Vec(x_data, sg=1, size=200, window=5, min_count=0,
                     negative=1, sample=0.01, workers=4)
    return model
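The extract is truncated, so the step that actually combines TF-IDF with Word2vec is missing. Below is a minimal sketch of the usual approach: each document vector is the TF-IDF-weighted average of its word vectors. The helper name build_doc_vector and the use of TfidfVectorizer to obtain per-word IDF weights are assumptions, not the author's original code; the vector dimension 200 matches size=200 in train_word2vec above.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def build_doc_vector(tokens, w2v_model, idf_weight, dim=200):
    """TF-IDF-weighted average of word vectors (hypothetical helper)."""
    vec = np.zeros(dim)
    total = 0.0
    for word in tokens:
        if word in w2v_model.wv and word in idf_weight:
            w = idf_weight[word]
            vec += w * w2v_model.wv[word]
            total += w
    return vec / total if total > 0 else vec

# Usage sketch: docs is a list of space-joined token strings, as produced
# by content_split above. (On scikit-learn < 1.0, use get_feature_names().)
# tfidf = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b").fit(docs)
# idf_weight = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))
# X = np.array([build_doc_vector(d.split(), w2v_model, idf_weight) for d in docs])

The resulting matrix X can then be fed to the classifier built from the Keras layers imported above, e.g. a Dense network, or the word vectors can instead initialize an Embedding layer for the Bidirectional LSTM.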