import pandas as pd
import numpy as np
import re
import nltk  # pip install nltk
# for Chinese text, the jieba tokenizer would be used instead
corpus = ['The sky is blue and beautiful.',
'Love this blue and beautiful sky!',
'The quick brown fox jumps over the lazy dog.',
'The brown fox is quick and the blue dog is lazy!',
'The sky is very blue and the sky is very beautiful today',
'The dog is lazy but the brown fox is quick!'
]
labels = ['weather', 'weather', 'animals', 'animals', 'weather', 'animals']
corpus = np.array(corpus)
corpus_df = pd.DataFrame({'Document': corpus,
'Category': labels})
corpus_df = corpus_df[['Document', 'Category']]
corpus_df
|   | Document | Category |
|---|----------|----------|
| 0 | The sky is blue and beautiful. | weather |
| 1 | Love this blue and beautiful sky! | weather |
| 2 | The quick brown fox jumps over the lazy dog. | animals |
| 3 | The brown fox is quick and the blue dog is lazy! | animals |
| 4 | The sky is very blue and the sky is very beaut... | weather |
| 5 | The dog is lazy but the brown fox is quick! | animals |
nltk.download()  # opens the NLTK downloader; the 'stopwords' corpus used below must be installed
showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
True
# term frequencies and stopword removal
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')
print(stop_words)

def normalize_document(doc):
    # lower-case and remove special characters / extra whitespace
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
norm_corpus = normalize_corpus(corpus)
norm_corpus
# e.g. 'The sky is blue and beautiful.' -> 'sky blue beautiful'
array(['sky blue beautiful', 'love blue beautiful sky',
'quick brown fox jumps lazy dog', 'brown fox quick blue dog lazy',
'sky blue sky beautiful today', 'dog lazy brown fox quick'],
dtype='<U30')
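A quick sanity check of the normalizer on a single document; the expected string comes straight from the array above:

normalize_document('The sky is blue and beautiful.')  # -> 'sky blue beautiful'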
from sklearn.feature_extraction.text import CountVectorizer
print(norm_corpus)
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(norm_corpus)  # fits the vocabulary and transforms in one step
print(cv.get_feature_names())
cv_matrix = cv_matrix.toarray()
cv_matrix
['sky blue beautiful' 'love blue beautiful sky'
'quick brown fox jumps lazy dog' 'brown fox quick blue dog lazy'
'sky blue sky beautiful today' 'dog lazy brown fox quick']
['beautiful', 'blue', 'brown', 'dog', 'fox', 'jumps', 'lazy', 'love', 'quick', 'sky', 'today']
array([[1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0],
[1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0],
[0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0],
[0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0],
[1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1],
[0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0]], dtype=int64)
vocab = cv.get_feature_names()
pd.DataFrame(cv_matrix, columns=vocab)
|   | beautiful | blue | brown | dog | fox | jumps | lazy | love | quick | sky | today |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
| 2 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 |
| 3 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| 4 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
| 5 | 0 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
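On newer scikit-learn releases (1.2+), `get_feature_names()` has been removed; a minimal sketch of the same table using its replacement, `get_feature_names_out()`:

# same bag-of-words table on scikit-learn >= 1.2, where
# get_feature_names() was replaced by get_feature_names_out()
vocab = cv.get_feature_names_out()
pd.DataFrame(cv_matrix, columns=vocab)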
bv = CountVectorizer(ngram_range=(2,2))
bv_matrix = bv.fit_transform(norm_corpus)
bv_matrix = bv_matrix.toarray()
vocab = bv.get_feature_names()
pd.DataFrame(bv_matrix, columns=vocab)
|   | beautiful sky | beautiful today | blue beautiful | blue dog | blue sky | brown fox | dog lazy | fox jumps | fox quick | jumps lazy | lazy brown | lazy dog | love blue | quick blue | quick brown | sky beautiful | sky blue |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 3 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
| 5 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
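Unigrams and bigrams can also be mixed in a single vocabulary; a minimal sketch with `ngram_range=(1, 2)`:

# sketch: ngram_range=(1, 2) keeps single words and two-word phrases together
uv = CountVectorizer(ngram_range=(1, 2))
uv_matrix = uv.fit_transform(norm_corpus).toarray()
pd.DataFrame(uv_matrix, columns=uv.get_feature_names())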
from sklearn.feature_extraction.text import TfidfVectorizer
# classic motivating example: if '中国' (China), '蜜蜂' (bee) and '养殖' (farming)
# each have a term frequency of 20, it is the IDF that tells them apart
tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tv_matrix = tv.fit_transform(norm_corpus)
tv_matrix = tv_matrix.toarray()
vocab = tv.get_feature_names()
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)
|   | beautiful | blue | brown | dog | fox | jumps | lazy | love | quick | sky | today |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.60 | 0.52 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.60 | 0.00 |
| 1 | 0.46 | 0.39 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.66 | 0.00 | 0.46 | 0.00 |
| 2 | 0.00 | 0.00 | 0.38 | 0.38 | 0.38 | 0.54 | 0.38 | 0.00 | 0.38 | 0.00 | 0.00 |
| 3 | 0.00 | 0.36 | 0.42 | 0.42 | 0.42 | 0.00 | 0.42 | 0.00 | 0.42 | 0.00 | 0.00 |
| 4 | 0.36 | 0.31 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.72 | 0.52 |
| 5 | 0.00 | 0.00 | 0.45 | 0.45 | 0.45 | 0.00 | 0.45 | 0.00 | 0.45 | 0.00 | 0.00 |
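The values above can be reproduced by hand from the count matrix; a minimal sketch of TfidfVectorizer's defaults (smoothed IDF, then L2 row normalization):

# idf = ln((1 + n_docs) / (1 + df)) + 1 (smooth_idf=True), then L2-normalise rows
tf = cv_matrix.astype(float)               # term counts computed earlier
df = np.count_nonzero(tf, axis=0)          # document frequency of each term
idf = np.log((1 + len(tf)) / (1 + df)) + 1
raw = tf * idf
np.round(raw / np.linalg.norm(raw, axis=1, keepdims=True), 2)  # matches tv_matrix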
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(tv_matrix)
similarity_df = pd.DataFrame(similarity_matrix)
similarity_df
|   | 0 | 1 | 2 | 3 | 4 | 5 |
|---|---|---|---|---|---|---|
| 0 | 1.000000 | 0.753128 | 0.000000 | 0.185447 | 0.807539 | 0.000000 |
| 1 | 0.753128 | 1.000000 | 0.000000 | 0.139665 | 0.608181 | 0.000000 |
| 2 | 0.000000 | 0.000000 | 1.000000 | 0.784362 | 0.000000 | 0.839987 |
| 3 | 0.185447 | 0.139665 | 0.784362 | 1.000000 | 0.109653 | 0.933779 |
| 4 | 0.807539 | 0.608181 | 0.000000 | 0.109653 | 1.000000 | 0.000000 |
| 5 | 0.000000 | 0.000000 | 0.839987 | 0.933779 | 0.000000 | 1.000000 |
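Each entry is just the dot product of two L2-normalized tf-idf rows; a minimal check for documents 0 and 1:

a, b = tv_matrix[0], tv_matrix[1]
a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))  # ~0.753128, as in the table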
from sklearn.cluster import KMeans
km = KMeans(n_clusters=2)
km.fit_transform(similarity_df)
cluster_labels = km.labels_
cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])
pd.concat([corpus_df, cluster_labels], axis=1)
|   | Document | Category | ClusterLabel |
|---|----------|----------|--------------|
| 0 | The sky is blue and beautiful. | weather | 0 |
| 1 | Love this blue and beautiful sky! | weather | 0 |
| 2 | The quick brown fox jumps over the lazy dog. | animals | 1 |
| 3 | The brown fox is quick and the blue dog is lazy! | animals | 1 |
| 4 | The sky is very blue and the sky is very beaut... | weather | 0 |
| 5 | The dog is lazy but the brown fox is quick! | animals | 1 |
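The clustering above ran on the document-to-document similarity matrix; a minimal alternative sketch clusters the tf-idf features directly (random_state is an added assumption for reproducibility):

# sketch: KMeans on the tf-idf vectors themselves rather than on similarities
km2 = KMeans(n_clusters=2, random_state=42)
km2.fit_predict(tv_matrix)  # on this toy corpus it should again split weather vs. animals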
from sklearn.decomposition import LatentDirichletAllocation
# n_topics was renamed n_components in sklearn 0.19; 'online' was the old default learning method
lda = LatentDirichletAllocation(n_components=2, max_iter=100,
                                learning_method='online', random_state=42)
dt_matrix = lda.fit_transform(tv_matrix)
features = pd.DataFrame(dt_matrix, columns=['T1', 'T2'])
features
|   | T1 | T2 |
|---|----|----|
| 0 | 0.190615 | 0.809385 |
| 1 | 0.176860 | 0.823140 |
| 2 | 0.846148 | 0.153852 |
| 3 | 0.815229 | 0.184771 |
| 4 | 0.180563 | 0.819437 |
| 5 | 0.839140 | 0.160860 |
tt_matrix = lda.components_
for topic_weights in tt_matrix:
topic = [(token, weight) for token, weight in zip(vocab, topic_weights)]
topic = sorted(topic, key=lambda x: -x[1])
topic = [item for item in topic if item[1] > 0.6]
print(topic)
print()
[('fox', 1.7265536238698524), ('quick', 1.7264910761871224), ('dog', 1.7264019823624879), ('brown', 1.7263774760262807), ('lazy', 1.7263567668213813), ('jumps', 1.0326450363521607), ('blue', 0.7770158513472083)]
[('sky', 2.263185143458752), ('beautiful', 1.9057084998062579), ('blue', 1.7954559705805624), ('love', 1.1476805311187976), ('today', 1.0064979209198706)]
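The same topic-term matrix can be summarized more compactly with argsort; a small sketch printing the top three terms per topic:

# sketch: top-3 weighted terms for each LDA topic
for idx, topic_weights in enumerate(lda.components_):
    top = np.argsort(-topic_weights)[:3]
    print('Topic', idx + 1, [(vocab[i], round(topic_weights[i], 3)) for i in top])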
from gensim.models import word2vec
wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in norm_corpus]
# Set values for various parameters
feature_size = 10 # Word vector dimensionality
window_context = 10 # Context window size
min_word_count = 1 # Minimum word count
sample = 1e-3 # Downsample setting for frequent words
w2v_model = word2vec.Word2Vec(tokenized_corpus, size=feature_size,
                              window=window_context, min_count=min_word_count,
                              sample=sample)  # note: 'size' was renamed 'vector_size' in gensim 4.0
w2v_model.wv['sky']
array([-0.01302878, -0.0081328 , 0.02701689, 0.03391293, 0.01191998,
-0.00258705, 0.02996921, 0.01644186, -0.03398509, -0.00690445], dtype=float32)
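gensim can also query the embedding space directly; on a six-sentence corpus the neighbours are essentially noise, but the call illustrates the API:

w2v_model.wv.most_similar('sky', topn=3)  # nearest neighbours by cosine similarity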
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0.
    for word in words:
        if word in vocabulary:
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model.wv[word])
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    return feature_vector

def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                for tokenized_sentence in corpus]
    return np.array(features)
w2v_feature_array = averaged_word_vectorizer(corpus=tokenized_corpus, model=w2v_model,
num_features=feature_size)
pd.DataFrame(w2v_feature_array)  # fixed-length document vectors, usable as input to downstream models such as an LSTM
|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.000403 | -0.020117 | 0.012235 | 0.006122 | 0.019660 | 0.003892 | 0.016131 | 0.018105 | -0.015472 | 0.002605 |
| 1 | -0.005323 | -0.021742 | 0.019230 | 0.010198 | 0.004770 | -0.003240 | 0.019134 | 0.003671 | -0.017126 | 0.013931 |
| 2 | -0.004894 | -0.008632 | 0.027378 | -0.008518 | -0.014232 | -0.015589 | -0.025559 | -0.009956 | 0.000932 | 0.008659 |
| 3 | -0.003555 | -0.008987 | 0.014459 | 0.000522 | -0.009074 | -0.003259 | -0.014132 | -0.010526 | 0.000448 | 0.007822 |
| 4 | 0.003716 | -0.010779 | 0.014460 | 0.018747 | 0.023466 | -0.003407 | 0.007588 | 0.013458 | -0.012609 | -0.007683 |
| 5 | -0.007912 | -0.005816 | 0.025197 | -0.002158 | -0.020683 | -0.011196 | -0.024942 | -0.020571 | 0.008020 | 0.003389 |
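Since each document is now a fixed-length dense vector, any standard estimator accepts it; a minimal sketch reusing the 2-cluster KMeans from above:

# sketch: cluster the averaged word2vec document vectors
KMeans(n_clusters=2, random_state=42).fit_predict(w2v_feature_array)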