赞
踩
数据:使用丘吉尔的人物传记作为我的学习语料
框架:Keras
import os
import numpy as np
import nltk
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from gensim.models.word2vec import Word2Vec

# Read the text: concatenate every .txt file found in ../input/ into one
# string, putting a blank line ("\n\n") after each file's contents.
# To use a single book instead, read only that file,
# e.g. "../input/Winston_Churchil.txt".
texts = []
for file_name in os.listdir("../input/"):
    if file_name.endswith(".txt"):
        # errors="ignore" drops bytes that cannot be decoded instead of raising.
        # NOTE(review): no explicit encoding is given, so the platform default
        # is used and a leading BOM is kept (it shows up as a '\ufeff' prefix
        # on the first token); consider encoding="utf-8-sig" — confirm the
        # input files are UTF-8 first.
        with open(os.path.join("../input/", file_name), errors="ignore") as f:
            texts.append(f.read() + "\n\n")
raw_text = "".join(texts)

# Lower-case everything so that differently-cased spellings share one token.
raw_text = raw_text.lower()

# Split the text into sentences with NLTK's pre-trained English Punkt model,
# then split each sentence into word tokens.
sentensor = nltk.data.load("tokenizers/punkt/english.pickle")
sents = sentensor.tokenize(raw_text)
corpus = [nltk.word_tokenize(sen) for sen in sents]

# Number of sentences, followed by the first three tokenized sentences.
print(len(corpus))
print(corpus[:3])
-
-
- #结果
- 91007
- [['\ufeffthe', 'project', 'gutenberg', 'ebook', 'of', 'great', 'expectations', ',', 'by', 'charles', 'dickens', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', '.'], ['you', 'may', 'copy', 'it', ',', 'give', 'it', 'away', 'or', 're-use', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at', 'www.gutenberg.org', 'title', ':', 'great', 'expectations', 'author', ':', 'charles', 'dickens', 'posting', 'date', ':', 'august', '20', ',', '2008', '[', 'ebook', '#'

Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。