Many introductions to NLP mention the NLTK library, so I had assumed it was an indispensable powerhouse. After working through it, my impression is that NLTK is of limited use in real projects: much of it approaches NLP through syntax and semantics, which does not feel very reliable in practice, and it ships with few Chinese corpora. Most books and blog posts about NLTK are also fairly dated.
《NLTK基础教程--用NLTK和Python库构建机器学习应用》 was first published in June 2017, yet most of its content is already dated and nearly all of the material is English-only. The book also contains many typesetting and textual errors.
《Python自然语言处理》 by Steven Bird, Ewan Klein & Edward Loper (translated by 陈涛, 张旭, 催杨, 刘海平) gives a more comprehensive introduction: its code is extremely dated, but its coverage of the concepts is thorough.
Below is my cleaned-up code for chapters 1, 2, 3, 4, 6 and 8. It runs on Windows 10 with nltk 3.2.4 and Python 3.5.3/3.6.1. Be sure to download the nltk_data packages, and install any missing libraries as needed. pywin32-221.win-amd64-py3.6.exe / pywin32-221.win-amd64-py3.5.exe must be downloaded manually [https://sourceforge.net/projects/pywin32/files/pywin32/Build%20221/].
Links or notes for any data that needs to be downloaded are given in the code.
# 《NLTK基础教程--用NLTK和Python库构建机器学习应用》 Chapter 01: Introduction to natural language processing
# win10 nltk3.2.4 python3.5.3/python3.6.1
# filename: NLTKEssentials01.py # Introduction to natural language processing
import nltk
#nltk.download() # a full download takes a long time and may need several attempts before it succeeds
print("Python and NLTK installed successfully")
'''Python and NLTK installed successfully'''
# 1.2 Starting with Python
# 1.2.1 Lists
lst = [1, 2, 3, 4]
print(lst)
'''[1, 2, 3, 4]'''
# print('First element: ' + lst[0])
# '''TypeError: must be str, not int'''
print('First element: ' + str(lst[0]))
'''First element: 1'''
print('First element: ' + str(lst[0]))
print('last element: ' + str(lst[-1]))
print('first three elements: ' + str(lst[0:3]))
print('last three elements: ' + str(lst[-3:]))
'''
First element: 1
last element: 4
first three elements: [1, 2, 3]
last three elements: [2, 3, 4]
'''
# 1.2.2 Self-help (dir and help)
print(dir(lst))
'''
['__add__', '__class__', '__contains__', '__delattr__', '__delitem__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__iadd__', '__imul__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__rmul__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'append', 'clear', 'copy', 'count', 'extend', 'index', 'insert', 'pop', 'remove', 'reverse', 'sort']
'''
print(' , '.join(dir(lst)))
'''
__add__ , __class__ , __contains__ , __delattr__ , __delitem__ , __dir__ , __doc__ , __eq__ , __format__ , __ge__ , __getattribute__ , __getitem__ , __gt__ , __hash__ , __iadd__ , __imul__ , __init__ , __init_subclass__ , __iter__ , __le__ , __len__ , __lt__ , __mul__ , __ne__ , __new__ , __reduce__ , __reduce_ex__ , __repr__ , __reversed__ , __rmul__ , __setattr__ , __setitem__ , __sizeof__ , __str__ , __subclasshook__ , append , clear , copy , count , extend , index , insert , pop , remove , reverse , sort
'''
help(lst.index)
'''
Help on built-in function index:
index(...) method of builtins.list instance
L.index(value, [start, [stop]]) -> integer -- return first index of value.
Raises ValueError if the value is not present.
'''
mystring = "Monty Python ! And the holy Grail ! \n"
print(mystring.split())
'''
['Monty', 'Python', '!', 'And', 'the', 'holy', 'Grail', '!']
'''
print(mystring.strip())
'''Monty Python ! And the holy Grail !'''
print(mystring.lstrip())
'''
Monty Python ! And the holy Grail !
'''
print(mystring.rstrip())
'''Monty Python ! And the holy Grail !'''
print(mystring.upper())
'''
MONTY PYTHON ! AND THE HOLY GRAIL !
'''
print(mystring.replace('!', ''))
'''
Monty Python And the holy Grail
'''
# 1.2.3 Regular expressions
import re
if re.search('Python', mystring):
print("We found python ")
else:
print("No ")
'''We found python '''
import re
print(re.findall('!', mystring))
'''['!', '!']'''
# 1.2.4 Dictionaries
word_freq = {}
for tok in mystring.split():
if tok in word_freq:
word_freq[tok] += 1
else:
word_freq[tok] = 1
print(word_freq)
'''{'Monty': 1, 'Python': 1, '!': 2, 'And': 1, 'the': 1, 'holy': 1, 'Grail': 1}'''
# 1.2.5 Writing functions
import sys
def wordfreq(mystring):
'''
    Function to generate the frequency distribution of the given text
'''
print(mystring)
word_freq = {}
for tok in mystring.split():
if tok in word_freq:
word_freq[tok] += 1
else:
word_freq[tok] = 1
print(word_freq)
def main():
str = "This is my fist python program"
wordfreq(str)
if __name__ == '__main__':
main()
'''
This is my first python program
{'This': 1, 'is': 1, 'my': 1, 'first': 1, 'python': 1, 'program': 1}
'''
# 1.3 Diving into NLTK
from urllib import request
response = request.urlopen('http://python.org/')
html = response.read()
html = html.decode('utf-8')
print(len(html))
'''48141'''
#print(html)
tokens = [tok for tok in html.split()]
print("Total no of tokens :" + str(len(tokens)))
'''Total no of tokens :2901'''
print(tokens[0: 100])
'''
['<!doctype', 'html>', '<!--[if', 'lt', 'IE', '7]>', '<html', 'class="no-js', 'ie6', 'lt-ie7', 'lt-ie8', 'lt-ie9">', '<![endif]-->', '<!--[if', 'IE', '7]>', '<html', 'class="no-js', 'ie7', 'lt-ie8', 'lt-ie9">', '<![endif]-->', '<!--[if', 'IE', '8]>', '<html', 'class="no-js', 'ie8', 'lt-ie9">', '<![endif]-->', '<!--[if', 'gt', 'IE', '8]><!--><html', 'class="no-js"', 'lang="en"', 'dir="ltr">', '<!--<![endif]-->', '<head>', '<meta', 'charset="utf-8">', '<meta', 'http-equiv="X-UA-Compatible"', 'content="IE=edge">', '<link', 'rel="prefetch"', 'href="//ajax.googleapis.com/ajax/libs/jquery/1.8.2/jquery.min.js">', '<meta', 'name="application-name"', 'content="Python.org">', '<meta', 'name="msapplication-tooltip"', 'content="The', 'official', 'home', 'of', 'the', 'Python', 'Programming', 'Language">', '<meta', 'name="apple-mobile-web-app-title"', 'content="Python.org">', '<meta', 'name="apple-mobile-web-app-capable"', 'content="yes">', '<meta', 'name="apple-mobile-web-app-status-bar-style"', 'content="black">', '<meta', 'name="viewport"', 'content="width=device-width,', 'initial-scale=1.0">', '<meta', 'name="HandheldFriendly"', 'content="True">', '<meta', 'name="format-detection"', 'content="telephone=no">', '<meta', 'http-equiv="cleartype"', 'content="on">', '<meta', 'http-equiv="imagetoolbar"', 'content="false">', '<script', 'src="/static/js/libs/modernizr.js"></script>', '<link', 'href="/static/stylesheets/style.css"', 'rel="stylesheet"', 'type="text/css"', 'title="default"', '/>', '<link', 'href="/static/stylesheets/mq.css"', 'rel="stylesheet"', 'type="text/css"', 'media="not', 'print,', 'braille,']
'''
import re
tokens = re.split(r'\W+', html)
print(len(tokens))
'''6131'''
print(tokens[0: 100])
'''
['', 'doctype', 'html', 'if', 'lt', 'IE', '7', 'html', 'class', 'no', 'js', 'ie6', 'lt', 'ie7', 'lt', 'ie8', 'lt', 'ie9', 'endif', 'if', 'IE', '7', 'html', 'class', 'no', 'js', 'ie7', 'lt', 'ie8', 'lt', 'ie9', 'endif', 'if', 'IE', '8', 'html', 'class', 'no', 'js', 'ie8', 'lt', 'ie9', 'endif', 'if', 'gt', 'IE', '8', 'html', 'class', 'no', 'js', 'lang', 'en', 'dir', 'ltr', 'endif', 'head', 'meta', 'charset', 'utf', '8', 'meta', 'http', 'equiv', 'X', 'UA', 'Compatible', 'content', 'IE', 'edge', 'link', 'rel', 'prefetch', 'href', 'ajax', 'googleapis', 'com', 'ajax', 'libs', 'jquery', '1', '8', '2', 'jquery', 'min', 'js', 'meta', 'name', 'application', 'name', 'content', 'Python', 'org', 'meta', 'name', 'msapplication', 'tooltip', 'content', 'The', 'official']
'''
'''pip3 install bs4 lxml'''
import nltk
from bs4 import BeautifulSoup
#clean = nltk.clean_html(html)
#tokens = [tok for tok in clean.split()]
soup = BeautifulSoup(html, "lxml")
clean = soup.get_text()
tokens = [tok for tok in clean.split()]
print(tokens[:100])
'''
['Welcome', 'to', 'Python.org', '{', '"@context":', '"http://schema.org",', '"@type":', '"WebSite",', '"url":', '"https://www.python.org/",', '"potentialAction":', '{', '"@type":', '"SearchAction",', '"target":', '"https://www.python.org/search/?q={search_term_string}",', '"query-input":', '"required', 'name=search_term_string"', '}', '}', 'var', '_gaq', '=', '_gaq', '||', '[];', "_gaq.push(['_setAccount',", "'UA-39055973-1']);", "_gaq.push(['_trackPageview']);", '(function()', '{', 'var', 'ga', '=', "document.createElement('script');", 'ga.type', '=', "'text/javascript';", 'ga.async', '=', 'true;', 'ga.src', '=', "('https:'", '==', 'document.location.protocol', '?', "'https://ssl'", ':', "'http://www')", '+', "'.google-analytics.com/ga.js';", 'var', 's', '=', "document.getElementsByTagName('script')[0];", 's.parentNode.insertBefore(ga,', 's);', '})();', 'Notice:', 'While', 'Javascript', 'is', 'not', 'essential', 'for', 'this', 'website,', 'your', 'interaction', 'with', 'the', 'content', 'will', 'be', 'limited.', 'Please', 'turn', 'Javascript', 'on', 'for', 'the', 'full', 'experience.', 'Skip', 'to', 'content', '▼', 'Close', 'Python', 'PSF', 'Docs', 'PyPI', 'Jobs', 'Community', '▲', 'The', 'Python', 'Network']
'''
import operator
freq_dis = {}
for tok in tokens:
if tok in freq_dis:
freq_dis[tok] += 1
else:
freq_dis[tok] = 1
sorted_freq_dist = sorted(freq_dis.items(), key = operator.itemgetter(1), reverse = True)
print(sorted_freq_dist[:25])
'''
[('Python', 60), ('>>>', 24), ('and', 22), ('is', 18), ('the', 18), ('to', 17), ('of', 15), ('=', 14), ('Events', 11), ('News', 11), ('a', 10), ('for', 10), ('More', 9), ('#', 9), ('3', 8), ('in', 8), ('Community', 7), ('with', 7), ('...', 7), ('Docs', 6), ('Guide', 6), ('Software', 6), ('now', 5), ('that', 5), ('The', 5)]
'''
import nltk
Freq_dist_nltk = nltk.FreqDist(tokens)
print(Freq_dist_nltk)
'''<FreqDist with 600 samples and 1105 outcomes>'''
for k, v in Freq_dist_nltk.items():
print(str(k) + ':' + str(v))
'''
This:1
[fruit.upper():1
Forums:2
Check:1
...
GUI:1
Intuitive:1
X:2
growth:1
advance:1
'''
# below is the plot for the frequency distribution
# plot the 50 most frequent tokens (pass cumulative=True for a cumulative plot)
Freq_dist_nltk.plot(50, cumulative=False)
## stop word removal (optional)
#stopwords = [word.strip().lower() for word in open("PATH/english.stop.txt")]
#clean_tokens=[tok for tok in tokens if len(tok.lower()) > 1 and (tok.lower() not in stopwords)]
#Freq_dist_nltk = nltk.FreqDist(clean_tokens)
#Freq_dist_nltk.plot(50, cumulative = False)
# 《NLTK基础教程--用NLTK和Python库构建机器学习应用》 Chapter 02: Text wrangling and cleansing
# win10 nltk3.2.4 python3.5.3/python3.6.1
# filename: NLTKEssentials02.py # Text wrangling and cleansing
# tokenization, stemming, lemmatization, stop word removal
# 2.1 Text wrangling
'''
# examples.csv
"test01",99
"test02",999
"test03",998
"test04",997
"test05",996
'''
import csv
with open('examples.csv', 'r', encoding='utf-8') as f:
reader = csv.reader(f, delimiter = ',', quotechar = '"')
for line in reader:
print(line[1])
'''
99
999
998
997
996
'''
'''
# examples.json
{
"array": [1, 2, 3, 4],
"boolean": true,
"object": {"a": "b"},
"string": "Hello, World"
}
'''
import json
jsonfile = open('examples.json')
data = json.load(jsonfile)
print(data['string'])
'''Hello, World'''
with open('examples.json', 'r', encoding='utf-8') as f:
data = json.load(f)
print(data['string'])
'''Hello, World'''
# 2.2 Text cleansing
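# The book's 2.2 is mostly narrative; below is a minimal cleansing sketch of my own (not the
# book's code): strip punctuation with str.translate and lowercase the result.
import string
raw = "Hello, World! This -- is a short, messy sample..."
cleaned = raw.translate(str.maketrans('', '', string.punctuation)).lower()
print(cleaned)
'''hello world this  is a short messy sample'''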
# 2.3 Sentence splitting
import nltk
inputstring = 'This is an examples sent. The sentence splitter will split on sent markers. Ohh really !!'
from nltk.tokenize import sent_tokenize
#all_sent = sent_tokenize(inputstring, language="english")
all_sent = sent_tokenize(inputstring)
print(all_sent)
import nltk.tokenize.punkt
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
# 2.4 Tokenization  http://text-processing.com/demo
s = "Hi Everyone ! hola gr8"
print(s.split())
from nltk.tokenize import word_tokenize
word_tokenize(s)
from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize
regexp_tokenize(s, pattern = r'\w+')
regexp_tokenize(s, pattern = r'\d+')
wordpunct_tokenize(s)
blankline_tokenize(s)
# 2.5 Stemming
# eat eating eaten eats ==> eat
# stemming is hard to apply to Chinese or Japanese text
import nltk
from nltk.stem import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
pst = PorterStemmer()
lst = LancasterStemmer()
print(lst.stem("eating"))
'''eat'''
print(pst.stem("shopping"))
'''shop'''
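# SnowballStemmer is imported above but never exercised in this listing; a one-line sketch:
sst = SnowballStemmer("english")
print(sst.stem("running"))
'''run'''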
# 2.6 Lemmatization (reducing a word to its lemma, i.e. its root form)
from nltk.stem import WordNetLemmatizer
wlem = WordNetLemmatizer()
wlem.lemmatize("ate")
# Resource 'corpora/wordnet.zip/wordnet/' not found. Please use the NLTK Downloader to obtain the resource: >>> nltk.download()
# 2.7 Stop word removal
import nltk
from nltk.corpus import stopwords
stoplist = stopwords.words('english')
text = "This is just a test"
cleanwordlist = [word for word in text.split() if word not in stoplist]
print(cleanwordlist)
'''['This', 'test']'''
# 2.8 Rare word removal
'''
import nltk
token = text.split()
freq_dist = nltk.FreqDist(token)
rarewords = [word for word, _ in freq_dist.most_common()[-50:]]
after_rare_words = [word for word in token if word not in rarewords]
print(after_rare_words)
'''
# 2.9 Spelling correction (spellchecker)
from nltk.metrics import edit_distance
print(edit_distance("rain", "shine")) # 3
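# The book stops at edit_distance; a naive spelling-correction sketch of my own (not the book's):
# pick the vocabulary word with the smallest edit distance. Needs the 'words' corpus
# (nltk.download('words')) and is slow, since it brute-forces the whole word list.
from nltk.corpus import words
def correct(word, vocab=None):
    vocab = vocab if vocab is not None else words.words()
    return min(vocab, key=lambda w: edit_distance(word, w))
#print(correct("rainn")) # expect something close to 'rain'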
# 《NLTK基础教程--用NLTK和Python库构建机器学习应用》 Chapter 03: Part-of-speech tagging
# win10 nltk3.2.4 python3.5.3/python3.6.1
# filename: NLTKEssentials03.py # Part-of-speech tagging
# 3.1 Part-of-speech tagging
# part of speech (POS)
# Penn Treebank
import nltk
from nltk import word_tokenize
s = "I was watching TV"
print(nltk.pos_tag(word_tokenize(s)))
tagged = nltk.pos_tag(word_tokenize(s))
allnoun = [word for word, pos in tagged if pos in ['NN', 'NNP']]
print(allnoun)
# 3.1.1 The Stanford tagger
# https://nlp.stanford.edu/software/stanford-postagger-full-2017-06-09.zip
from nltk.tag.stanford import StanfordPOSTagger
import nltk
stan_tagger = StanfordPOSTagger('D:/nltk_data/stanford-postagger-full-2017-06-09/models/english-bidirectional-distsim.tagger',
'D:/nltk_data/stanford-postagger-full-2017-06-09/stanford-postagger.jar')
s = "I was watching TV"
tokens = nltk.word_tokenize(s)
stan_tagger.tag(tokens)
# 3.1.2 A closer look at taggers
from nltk.corpus import brown
import nltk
tags = [tag for (word, tag) in brown.tagged_words(categories = 'news')]
print(nltk.FreqDist(tags))
brown_tagged_sents = brown.tagged_sents(categories = 'news')
default_tagger = nltk.DefaultTagger('NN')
print(default_tagger.evaluate(brown_tagged_sents))
# 3.1.3 Sequential taggers
# 1. N-gram taggers
from nltk.tag import UnigramTagger
from nltk.tag import DefaultTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger
train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)]
test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]
unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
print(unigram_tagger.evaluate(test_data))
bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
print(bigram_tagger.evaluate(test_data))
trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
print(trigram_tagger.evaluate(test_data))
# 2. Regular expression tagger
from nltk.tag.sequential import RegexpTagger
regexp_tagger = RegexpTagger(
[(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
(r'(The|the|A|a|An|an)$', 'AT'), # articles
(r'.*able$', 'JJ'), # adjectives
(r'.*ness$', 'NN'), # nouns formed from adj
(r'.*ly$', 'RB'), # adverbs
(r'.*s$', 'NNS'), # plural nouns
(r'.*ing$', 'VBG'), # gerunds
(r'.*ed$', 'VBD'), # past tense verbs
(r'.*', 'NN') # nouns (default)
])
print(regexp_tagger.evaluate(test_data))
# 3.1.4 The Brill tagger
# 3.1.5 Machine learning based taggers (a minimal HMM sketch follows this list)
# maximum entropy classifier (MEC)
# hidden Markov model (HMM)
# conditional random field (CRF)
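# A minimal supervised HMM tagger sketch, reusing the Brown train_data split from 3.1.3 above.
# This is my own example, not the book's code; MEC and CRF taggers need extra libraries and are
# only named here. Lidstone smoothing is used so unseen words do not get zero probability.
from nltk.tag.hmm import HiddenMarkovModelTrainer
from nltk.probability import LidstoneProbDist
hmm_tagger = HiddenMarkovModelTrainer().train_supervised(
    train_data, estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
print(hmm_tagger.tag("I was watching TV".split()))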
# 3.2 Named entity recognition (NER)
# NER tagger
import nltk
from nltk import ne_chunk
sent = "Mark is studying at Stanford University in California"
print(ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)), binary = False))
from nltk.tag.stanford import StanfordNERTagger
# https://nlp.stanford.edu/software/stanford-ner-2017-06-09.zip
st = StanfordNERTagger('D:/nltk_data/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz',
'D:/nltk_data/stanford-ner-2017-06-09/stanford-ner.jar')
st.tag('Rami Eid is studying at Stony Brook University in NY'.split())
# 《NLTK基础教程--用NLTK和Python库构建机器学习应用》 Chapter 04: Parsing structure in text
# win10 nltk3.2.4 python3.5.3/python3.6.1
# filename: NLTKEssentials04.py # Parsing structure in text
# 4.1 Shallow versus deep parsing
# CFG (context-free grammar)
# PCFG (probabilistic context-free grammar)
# shallow parsing
# deep parsing
# 4.2 The two parsing approaches
# rule-based
# probabilistic
# 4.3 Why do we need parsing?
# syntactic parser
'''
import nltk
from nltk import CFG
toy_grammar = nltk.CFG.fromstring(
"""
S -> NP VP # S indicate the entire sentence
VP -> V NP # VP is verb phrase the
V -> "eats" | "drinks" # V is verb
NP -> Det N # NP is noun phrase (chunk that has noun in it)
Det -> "a" | "an" | "the" # Det is determiner used in the sentences
N -> "president" | "Obama" | "apple" | "coke" # N some example nouns
""")
toy_grammar.productions()
'''
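# A runnable sketch of the toy grammar above (my own addition, not the book's code). The inline
# "#" comments in the commented-out block may trip up CFG.fromstring, so the productions are
# repeated here without them; the recursive descent parser previews section 4.4.1.
import nltk
from nltk.parse import RecursiveDescentParser
toy_grammar = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP
V -> "eats" | "drinks"
NP -> Det N
Det -> "a" | "an" | "the"
N -> "president" | "Obama" | "apple" | "coke"
""")
rd_parser = RecursiveDescentParser(toy_grammar)
for tree in rd_parser.parse("the president eats an apple".split()):
    print(tree) # expect a single S tree spanning the whole sentence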
# 4.4 Different types of parsers
# 4.4.1 Recursive descent parser
# 4.4.2 Shift-reduce parser
# 4.4.3 Chart parser
# 4.4.4 Regex parser
import nltk
from nltk.chunk.regexp import *
chunk_rules = ChunkRule("<.*>+", "chunk everything")
reg_parser = RegexpParser('''
NP: {<DT>? <JJ>* <NN>*} # NP
P: {<IN>} # Preposition
V: {<V.*>} # Verb
PP: {<P> <NP>} # PP -> P NP
VP: {<V> <NP|PP>*} # VP -> V (NP|PP)*
''')
test_sent = "Mr. Obama played a big role in the Health insurance bill"
test_sent_pos = nltk.pos_tag(nltk.word_tokenize(test_sent))
parsed_out = reg_parser.parse(test_sent_pos)
print(parsed_out)
# 4.5 Dependency parsing (DP)
# probabilistic projective dependency parser (a toy NLTK dependency-grammar sketch follows)
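# NLTK also ships a small rule-based projective dependency parser; a minimal sketch with a toy
# dependency grammar of my own (not the book's code), before the Stanford parser below:
from nltk.grammar import DependencyGrammar
from nltk.parse import ProjectiveDependencyParser
dep_grammar = DependencyGrammar.fromstring("""
'watching' -> 'I' | 'was' | 'TV'
""")
dep_parser = ProjectiveDependencyParser(dep_grammar)
for tree in dep_parser.parse(['I', 'was', 'watching', 'TV']):
    print(tree) # expect a flat tree with 'watching' as the head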
from nltk.parse.stanford import StanfordParser
# https://nlp.stanford.edu/software/stanford-parser-full-2017-06-09.zip
english_parser = StanfordParser('D:/nltk_data/stanford-parser-full-2017-06-09/stanford-parser.jar',
'D:/nltk_data/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar')
english_parser.raw_parse_sents(["this is the english parser test"]) # expects a list of sentences
# 4.6 Chunking
'''
from nltk.chunk.regexp import *
test_sent = "The prime minister announced he had asked the chief government whip, \
Philip Ruddock, to call a special party room meeting for 9am on Monday to consider the spill motion."
test_sent_pos = nltk.pos_tag(nltk.word_tokenize(test_sent))
rule_vp = ChunkRule(r'(<VB.*>)?(<VB.*>)+(<PRP>)?', 'Chunk VPs')
parser_vp = RegexpChunkParser([rule_vp], chunk_label = 'VP')
print(parser_vp.parse(test_sent_pos))
rule_np = ChunkRule(r'(<DT>?<RB>?)?<JJ|CD>*(<JJ|CD><,>)*(<NN.*>)+', 'Chunk NPs')
parser_np = RegexpChunkParser([rule_np], chunk_label="NP")
print(parser_np.parse(test_sent_pos))
'''
# 4.7 Information extraction
# 4.7.1 Named entity recognition (NER)
f = open("D:/nltk_data/ner_sample.txt")# absolute path for the file of text for which we want NER
text = f.read()
sentences = nltk.sent_tokenize(text)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
for sent in tagged_sentences:
print(nltk.ne_chunk(sent))
# 4.7.2 Relation extraction
import re
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus = 'ieer', pattern = IN):
print(nltk.sem.rtuple(rel))
# 《NLTK基础教程--用NLTK和Python库构建机器学习应用》 Chapter 06: Text classification
# win10 nltk3.2.4 python3.5.3/python3.6.1
# filename: NLTKEssentials06.py # Text classification
# 6.2 Text classification
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import csv
def preprocessing(text):
#text = text.decode("utf8")
# tokenize into words
tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
# remove stopwords
stop = stopwords.words('english')
tokens = [token for token in tokens if token not in stop]
# remove words less than three letters
tokens = [word for word in tokens if len(word) >= 3]
# lower capitalization
tokens = [word.lower() for word in tokens]
# lemmatize
lmtzr = WordNetLemmatizer()
tokens = [lmtzr.lemmatize(word) for word in tokens]
preprocessed_text = ' '.join(tokens)
return preprocessed_text
# https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
sms = open('D:/nltk_data/SMSSpamCollection', encoding='utf8') # check the structure of this file!
sms_data = []
sms_labels = []
csv_reader = csv.reader(sms, delimiter = '\t')
for line in csv_reader:
# adding the sms_id
sms_labels.append(line[0])
# adding the cleaned text We are calling preprocessing method
sms_data.append(preprocessing(line[1]))
sms.close()
# 6.3 Sampling
import sklearn
import numpy as np
trainset_size = int(round(len(sms_data)*0.70))
# this threshold gives a 70:30 train/test split
print('The training set size for this classifier is ' + str(trainset_size) + '\n')
x_train = np.array([''.join(el) for el in sms_data[0: trainset_size]])
y_train = np.array([el for el in sms_labels[0: trainset_size]])
x_test = np.array([''.join(el) for el in sms_data[trainset_size+1:len(sms_data)]])
y_test = np.array([el for el in sms_labels[trainset_size+1:len(sms_labels)]])
print(x_train)
print(y_train)
from sklearn.feature_extraction.text import CountVectorizer
sms_exp = []
for line in sms_data:
sms_exp.append(preprocessing(line))
vectorizer = CountVectorizer(min_df = 1, encoding='utf-8')
X_exp = vectorizer.fit_transform(sms_exp)
print("||".join(vectorizer.get_feature_names()))
print(X_exp.toarray())
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df = 2, ngram_range=(1, 2),
stop_words = 'english', strip_accents = 'unicode', norm = 'l2')
X_train = vectorizer.fit_transform(x_train)
X_test = vectorizer.transform(x_test)
# 6.3.1 Naive Bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
clf = MultinomialNB().fit(X_train, y_train)
y_nb_predicted = clf.predict(X_test)
print(y_nb_predicted)
print('\n confusion_matrix \n')
#cm = confusion_matrix(y_test, y_pred)
cm = confusion_matrix(y_test, y_nb_predicted)
print(cm)
print('\n Here is the classification report:')
print(classification_report(y_test, y_nb_predicted))
feature_names = vectorizer.get_feature_names()
coefs = clf.coef_
intercept = clf.intercept_
coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
n = 10
top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
for (coef_1, fn_1), (coef_2, fn_2) in top:
print('\t%.4f\t%-15s\t\t%.4f\t%-15s' %(coef_1, fn_1, coef_2, fn_2))
# 6.3.2 Decision trees
from sklearn import tree
clf = tree.DecisionTreeClassifier().fit(X_train.toarray(), y_train)
y_tree_predicted = clf.predict(X_test.toarray())
print(y_tree_predicted)
print('\n Here is the classification report:')
print(classification_report(y_test, y_tree_predicted))
# 6.3.3 Stochastic gradient descent
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
clf = SGDClassifier(alpha = 0.0001, n_iter=50).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('\n Here is the classification report:')
print(classification_report(y_test, y_pred))
print(' \n confusion_matrix \n')
cm = confusion_matrix(y_test, y_pred)
print(cm)
# 6.3.4 Logistic regression
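# The book only names logistic regression here; a minimal sketch on the same TF-IDF features,
# using sklearn's default hyperparameters (my own addition, not the book's code):
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression().fit(X_train, y_train)
y_lr_predicted = lr_clf.predict(X_test)
print('\n Here is the classification report:')
print(classification_report(y_test, y_lr_predicted))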
# 6.3.5 Support vector machines
from sklearn.svm import LinearSVC
svm_classifier = LinearSVC().fit(X_train, y_train)
y_svm_predicted = svm_classifier.predict(X_test)
print('\n Here is the classification report:')
print(classification_report(y_test, y_svm_predicted))
cm = confusion_matrix(y_test, y_svm_predicted)
print(cm)
# 6.4 Random forests
from sklearn.ensemble import RandomForestClassifier
RF_clf = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)
predicted = RF_clf.predict(X_test)
print('\n Here is the classification report:')
print(classification_report(y_test, predicted))
cm = confusion_matrix(y_test, predicted)
print(cm)
# 6.5 Text clustering
# K-means
from sklearn.cluster import KMeans, MiniBatchKMeans
from collections import defaultdict
true_k = 5
km = KMeans(n_clusters = true_k, init='k-means++', max_iter=100, n_init= 1)
kmini = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1, init_size=1000, batch_size=1000, verbose=2)
km_model = km.fit(X_train)
kmini_model = kmini.fit(X_train)
print("For K-mean clustering ")
clustering = defaultdict(list)
for idx, label in enumerate(km_model.labels_):
clustering[label].append(idx)
print("For K-mean Mini batch clustering ")
clustering = defaultdict(list)
for idx, label in enumerate(kmini_model.labels_):
clustering[label].append(idx)
# 6.6 Topic modeling in text
# https://pypi.python.org/pypi/gensim#downloads
import gensim
from gensim import corpora, models, similarities
from itertools import chain
import nltk
from nltk.corpus import stopwords
from operator import itemgetter
import re
documents = [document for document in sms_data]
stoplist = stopwords.words('english')
texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents]
print(texts)
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lsi = models.LsiModel(corpus_tfidf, id2word = dictionary, num_topics = 100)
# print(lsi.print_topics(20))
n_topics = 5
lda = models.LdaModel(corpus_tfidf, id2word = dictionary, num_topics = n_topics)
for i in range(0, n_topics):
temp = lda.show_topic(i, 10)
terms = []
for term in temp:
terms.append(str(term[0]))
print("Top 10 terms for topic #" + str(i) + ": " + ",".join(terms))
# 《NLTK基础教程--用NLTK和Python库构建机器学习应用》 Chapter 08: Using NLTK with other Python libraries
# win10 nltk3.2.4 python3.5.3/python3.6.1
# filename: NLTKEssentials08.py # Using NLTK with other Python libraries
# 8.1 NumPy
# 8.1.1 Multidimensional arrays
import numpy as np
x = [1, 2, 5, 7, 3, 11, 14, 25]
np_arr = np.array(x)
print(np_arr)
'''[ 1 2 5 7 3 11 14 25]'''
arr = [[1, 2], [13, 4], [33, 78]]
np_2darr = np.array(arr)
print(type(np_2darr))
'''<class 'numpy.ndarray'>'''
# indexing
print(np_2darr.tolist())
print(np_2darr[:])
'''
[[ 1 2]
[13 4]
[33 78]]
'''
print(np_2darr[:2])
'''
[[ 1 2]
[13 4]]
'''
print(np_2darr[:1])
'''[[1 2]]'''
print(np_2darr[2])
'''[33 78]'''
print(np_2darr[2][0])
'''33'''
print(np_2darr[:-1])
'''
[[ 1 2]
[13 4]]
'''
# 8.1.2 Basic operations
# import numpy as np
print(np.arange(0.0, 1.0, 0.1))
'''[ 0. 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9]'''
print(np.ones([2, 4]))
'''
[[ 1. 1. 1. 1.]
[ 1. 1. 1. 1.]]
'''
print(np.zeros([3, 4]))
'''
[[ 0. 0. 0. 0.]
[ 0. 0. 0. 0.]
[ 0. 0. 0. 0.]]
'''
print(np.linspace(0, 2, 10))
'''
[ 0. 0.22222222 0.44444444 0.66666667 0.88888889 1.11111111
1.33333333 1.55555556 1.77777778 2. ]
'''
print(np.logspace(0, 1))
'''
[ 1. 1.04811313 1.09854114 1.1513954 1.20679264
1.26485522 1.32571137 1.38949549 1.45634848 1.52641797
1.59985872 1.67683294 1.75751062 1.84206997 1.93069773
2.02358965 2.12095089 2.22299648 2.32995181 2.44205309
2.55954792 2.6826958 2.8117687 2.9470517 3.0888436
3.23745754 3.39322177 3.55648031 3.72759372 3.90693994
4.09491506 4.29193426 4.49843267 4.71486636 4.94171336
5.17947468 5.42867544 5.68986603 5.96362332 6.25055193
6.55128557 6.86648845 7.19685673 7.54312006 7.90604321
8.28642773 8.68511374 9.10298178 9.54095476 10. ]
'''
help(np.logspace)
'''
Help on function logspace in module numpy.core.function_base:
logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None)
Return numbers spaced evenly on a log scale.
In linear space, the sequence starts at ``base ** start``
(`base` to the power of `start`) and ends with ``base ** stop``
(see `endpoint` below).
Parameters
----------
start : float
``base ** start`` is the starting value of the sequence.
stop : float
``base ** stop`` is the final value of the sequence, unless `endpoint`
is False. In that case, ``num + 1`` values are spaced over the
interval in log-space, of which all but the last (a sequence of
length `num`) are returned.
num : integer, optional
Number of samples to generate. Default is 50.
endpoint : boolean, optional
If true, `stop` is the last sample. Otherwise, it is not included.
Default is True.
base : float, optional
The base of the log space. The step size between the elements in
``ln(samples) / ln(base)`` (or ``log_base(samples)``) is uniform.
Default is 10.0.
dtype : dtype
The type of the output array. If `dtype` is not given, infer the data
type from the other input arguments.
Returns
-------
samples : ndarray
`num` samples, equally spaced on a log scale.
See Also
--------
arange : Similar to linspace, with the step size specified instead of the
number of samples. Note that, when used with a float endpoint, the
endpoint may or may not be included.
linspace : Similar to logspace, but with the samples uniformly distributed
in linear space, instead of log space.
geomspace : Similar to logspace, but with endpoints specified directly.
Notes
-----
Logspace is equivalent to the code
>>> y = np.linspace(start, stop, num=num, endpoint=endpoint)
... # doctest: +SKIP
>>> power(base, y).astype(dtype)
... # doctest: +SKIP
Examples
--------
>>> np.logspace(2.0, 3.0, num=4)
array([ 100. , 215.443469 , 464.15888336, 1000. ])
>>> np.logspace(2.0, 3.0, num=4, endpoint=False)
array([ 100. , 177.827941 , 316.22776602, 562.34132519])
>>> np.logspace(2.0, 3.0, num=4, base=2.0)
array([ 4. , 5.0396842 , 6.34960421, 8. ])
Graphical illustration:
>>> import matplotlib.pyplot as plt
>>> N = 10
>>> x1 = np.logspace(0.1, 1, N, endpoint=True)
>>> x2 = np.logspace(0.1, 1, N, endpoint=False)
>>> y = np.zeros(N)
>>> plt.plot(x1, y, 'o')
[<matplotlib.lines.Line2D object at 0x...>]
>>> plt.plot(x2, y + 0.5, 'o')
[<matplotlib.lines.Line2D object at 0x...>]
>>> plt.ylim([-0.5, 1])
(-0.5, 1)
>>> plt.show()
'''
# 8.1.3 Extracting data from arrays
A = np.array([[0, 0, 0], [0, 1, 2], [0, 2, 4], [0, 3, 6]])
B = np.array([n for n in range(4)])
print(B)
'''[0 1 2 3]'''
less_than_3 = B < 3
print(less_than_3)
'''[ True True True False]'''
print(B[less_than_3])
'''[0 1 2]'''
B[less_than_3] = 0
print(B)
'''[0 0 0 3]'''
print(np.diag(A))
'''[0 1 4]'''
# 8.1.4 Complex matrix operations
A = np.array([[1, 2], [3, 4]])
print(A * A) # element-wise product
'''
[[ 1 4]
[ 9 16]]
'''
print(np.dot(A, A)) # matrix (dot) product
'''
[[ 7 10]
[15 22]]
'''
print(A - A)
'''
[[0 0]
[0 0]]
'''
print(A + A)
'''
[[2 4]
[6 8]]
'''
print(np.transpose(A))
'''
[[1 3]
[2 4]]
'''
print(np.transpose(A, axes = [0, 1])) # axes=[0, 1] keeps the original axis order, so A is unchanged
'''
[[1 2]
[3 4]]
'''
print(A.T)
'''
[[1 3]
[2 4]]
'''
M = np.matrix(A)
print(M)
'''
[[1 2]
[3 4]]
'''
print(np.invert(M)) # bitwise NOT, not the matrix inverse; use np.linalg.inv(M) for the inverse
'''
[[-2 -3]
[-4 -5]]
'''
N = np.random.randn(1, 10)
print(N)
'''
[[-0.08839128 1.25979204 -0.46311213 -0.27113081 0.85757258 -1.28109429
-1.00875299 0.10666042 -0.49751293 0.81362605]]
'''
# 1. Reshaping and stacking
print(A)
'''
[[1 2]
[3 4]]
'''
(r, c) = A.shape
print(r, c)
'''2 2'''
print(A.reshape((1, r * c)))
'''[[1 2 3 4]]'''
print(np.repeat(A, 2))
'''[1 1 2 2 3 3 4 4]'''
print(A)
'''
[[1 2]
[3 4]]
'''
print(np.tile(A, 4))
'''
[[1 2 1 2 1 2 1 2]
[3 4 3 4 3 4 3 4]]
'''
B = np.array([[5, 6]])
print(np.concatenate((A, B), axis = 0))
'''
[[1 2]
[3 4]
[5 6]]
'''
print(np.vstack((A, B)))
'''
[[1 2]
[3 4]
[5 6]]
'''
print(np.concatenate((A, B.T), axis = 1))
'''
[[1 2 5]
[3 4 6]]
'''
# 2. Random numbers
from numpy import random
# uniform random numbers from [0, 1)
print(random.rand(2, 5))
'''
[[ 0.15398327 0.88990373 0.99180579 0.89229317 0.40380238]
[ 0.10244161 0.16451004 0.45110841 0.3621777 0.4680435 ]]
'''
print(random.randn(2, 5))
'''
[[ 0.87847643 -0.87712286 0.75692718 1.43164752 0.26695439]
[ 1.45320364 -0.7812028 -0.17562589 1.72513472 1.35974398]]
'''
# 8.2 SciPy
import scipy as sp
from scipy.integrate import quad, dblquad, tplquad
def f(x):
return x
x_lower = 0 # the lower limit of x
x_upper = 1 # the upper limit of x
val, abserr = quad(f, x_lower, x_upper)
print(val, abserr)
'''0.5 5.551115123125783e-15'''
# interpolation: scipy.interpolate (a short interp1d sketch follows)
# Fourier transforms: scipy.fftpack
# signal processing: scipy.signal
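# A minimal interpolation sketch of my own (the book only names these submodules): interpolate
# sin(x) sampled at 11 points, linearly and with a cubic spline.
from scipy.interpolate import interp1d
xs = np.linspace(0, 10, 11)
ys = np.sin(xs)
f_linear = interp1d(xs, ys)
f_cubic = interp1d(xs, ys, kind='cubic')
print(f_linear(2.5), f_cubic(2.5))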
# 8.2.1 Linear algebra
A = sp.rand(2, 2)
B = sp.rand(2, 2)
from scipy import linalg as LA
X = LA.solve(A, B)
print(X)
'''
[[ 0.21226312 1.92812885]
[ 0.54343623 -0.08202333]]
'''
print(A.dot(B))
'''
[[ 0.41041687 0.6001985 ]
[ 0.46383677 0.79950073]]
'''
# 8.2.2 Eigenvalues and eigenvectors (A and B are random, so the printed values change per run)
evals = LA.eigvals(A)
print(evals)
'''[-0.00542105+0.j 0.45753295+0.j]'''
evals, evect = LA.eig(A)
print((evals, evect))
'''
(array([ 1.09776801+0.j, 0.19939128+0.j]), array([[ 0.56486092, -0.35585864],
[ 0.82518613, 0.93453979]]))
'''
print(LA.eig(A))
'''
(array([ 1.52391308+0.j, 0.29130459+0.j]), array([[ 0.62099076, -0.54159873],
[ 0.78381789, 0.84063715]]))
'''
print(LA.inv(A))
'''[[-0.28075038 3.9631977 ]
[ 1.58581322 -2.69374912]]
'''
# 8.2.3 Sparse matrices
from scipy import sparse as s
A = np.array([[1, 0, 0], [0, 2, 0], [0, 0, 3]])
print(A)
'''
[[1 0 0]
[0 2 0]
[0 0 3]]
'''
C = s.csr_matrix(A)
print(C)
'''
(0, 0) 1
(1, 1) 2
(2, 2) 3
'''
print(C.toarray())
'''
[[1 0 0]
[0 2 0]
[0 0 3]]
'''
print(C * C.todense())
'''
[[1 0 0]
[0 4 0]
[0 0 9]]
'''
print(sp.dot(C, C).todense())
'''
[[1 0 0]
[0 4 0]
[0 0 9]]
'''
# 8.2.4 Optimization
from scipy import optimize # make sure scipy.optimize is loaded before using it via sp.optimize
def f(x):
    return x**2 - 4
sp.optimize.fmin_bfgs(f, 0)
'''
Optimization terminated successfully.
Current function value: -4.000000
Iterations: 0
Function evaluations: 3
Gradient evaluations: 1
'''
help(sp.optimize.fmin_bfgs)
'''
Help on function fmin_bfgs in module scipy.optimize.optimize:
fmin_bfgs(f, x0, fprime=None, args=(), gtol=1e-05, norm=inf, epsilon=1.4901161193847656e-08, maxiter=None, full_output=0, disp=1, retall=0, callback=None)
Minimize a function using the BFGS algorithm.
Parameters
----------
f : callable f(x,*args)
Objective function to be minimized.
x0 : ndarray
Initial guess.
fprime : callable f'(x,*args), optional
Gradient of f.
args : tuple, optional
Extra arguments passed to f and fprime.
gtol : float, optional
Gradient norm must be less than gtol before successful termination.
norm : float, optional
Order of norm (Inf is max, -Inf is min)
epsilon : int or ndarray, optional
If fprime is approximated, use this value for the step size.
callback : callable, optional
An optional user-supplied function to call after each
iteration. Called as callback(xk), where xk is the
current parameter vector.
maxiter : int, optional
Maximum number of iterations to perform.
full_output : bool, optional
If True,return fopt, func_calls, grad_calls, and warnflag
in addition to xopt.
disp : bool, optional
Print convergence message if True.
retall : bool, optional
Return a list of results at each iteration if True.
Returns
-------
xopt : ndarray
Parameters which minimize f, i.e. f(xopt) == fopt.
fopt : float
Minimum value.
gopt : ndarray
Value of gradient at minimum, f'(xopt), which should be near 0.
Bopt : ndarray
Value of 1/f''(xopt), i.e. the inverse hessian matrix.
func_calls : int
Number of function_calls made.
grad_calls : int
Number of gradient calls made.
warnflag : integer
1 : Maximum number of iterations exceeded.
2 : Gradient and/or function calls not changing.
allvecs : list
`OptimizeResult` at each iteration. Only returned if retall is True.
See also
--------
minimize: Interface to minimization algorithms for multivariate
functions. See the 'BFGS' `method` in particular.
Notes
-----
Optimize the function, f, whose gradient is given by fprime
using the quasi-Newton method of Broyden, Fletcher, Goldfarb,
and Shanno (BFGS)
References
----------
Wright, and Nocedal 'Numerical Optimization', 1999, pg. 198.
'''
print(sp.optimize.fsolve(f, 0.2))
'''[ 2.]'''
# a deliberately bad test: this system has no real solution, so fsolve warns and does not converge
def f1(x):
return [x[0] ** 2 + x[1]**2 + 4, x[0] ** 2 + x[1]**2 - 4]
print(sp.optimize.fsolve(f1, [1, 1]))
'''
[ 0.02449328 -0.00592522]
C:\Python36\lib\site-packages\scipy\optimize\minpack.py:161: RuntimeWarning: The iteration is not making good progress, as measured by the
improvement from the last ten iterations.
warnings.warn(msg, RuntimeWarning)
'''
# 8.3 pandas
import pandas as pd
# https://archive.ics.uci.edu/ml/machine-learning-databases/iris/
data = pd.read_csv("./iris/iris.data", header = 0)
print(data.head())
'''
5.1 3.5 1.4 0.2 Iris-setosa
0 4.9 3.0 1.4 0.2 Iris-setosa
1 4.7 3.2 1.3 0.2 Iris-setosa
2 4.6 3.1 1.5 0.2 Iris-setosa
3 5.0 3.6 1.4 0.2 Iris-setosa
4 5.4 3.9 1.7 0.4 Iris-setosa
'''
data = pd.read_csv("./iris/iris.data", names = ["sepal length", "sepal width", "petal length", "petal width", "cat"], header = None)
print(data.head())
'''
sepal length sepal width petal length petal width cat
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
'''
sepal_len_cnt = data['sepal length'].value_counts()
print(sepal_len_cnt)
'''
5.0 10
6.3 9
5.1 9
6.7 8
5.7 8
5.5 7
5.8 7
6.4 7
6.0 6
4.9 6
6.1 6
5.4 6
5.6 6
6.5 5
4.8 5
7.7 4
6.9 4
5.2 4
6.2 4
4.6 4
7.2 3
6.8 3
4.4 3
5.9 3
6.6 2
4.7 2
7.6 1
7.4 1
4.3 1
7.9 1
7.3 1
7.0 1
4.5 1
5.3 1
7.1 1
Name: sepal length, dtype: int64
'''
print(data['cat'].value_counts())
'''
Name: sepal length, dtype: int64
Iris-versicolor 50
Iris-setosa 50
Iris-virginica 50
Name: cat, dtype: int64
'''
# 8.3.2 Series
# http://archive.ics.uci.edu/ml/machine-learning-databases/00312/
stockdata = pd.read_csv("./data/dow_jones_index.data", parse_dates = ['date'], index_col = ['date'], nrows = 100)
print(stockdata.head())
'''
quarter stock open high low close volume \
date
2011-01-07 1 AA $15.82 $16.72 $15.78 $16.42 239655616
2011-01-14 1 AA $16.71 $16.71 $15.64 $15.97 242963398
2011-01-21 1 AA $16.19 $16.38 $15.60 $15.79 138428495
2011-01-28 1 AA $15.87 $16.63 $15.82 $16.13 151379173
2011-02-04 1 AA $16.18 $17.39 $16.18 $17.14 154387761
percent_change_price percent_change_volume_over_last_wk \
date
2011-01-07 3.79267 NaN
2011-01-14 -4.42849 1.380223
2011-01-21 -2.47066 -43.024959
2011-01-28 1.63831 9.355500
2011-02-04 5.93325 1.987452
previous_weeks_volume next_weeks_open next_weeks_close \
date
2011-01-07 NaN $16.71 $15.97
2011-01-14 239655616.0 $16.19 $15.79
2011-01-21 242963398.0 $15.87 $16.13
2011-01-28 138428495.0 $16.18 $17.14
2011-02-04 151379173.0 $17.33 $17.37
percent_change_next_weeks_price days_to_next_dividend \
date
2011-01-07 -4.428490 26
2011-01-14 -2.470660 19
2011-01-21 1.638310 12
2011-01-28 5.933250 5
2011-02-04 0.230814 97
percent_return_next_dividend
date
2011-01-07 0.182704
2011-01-14 0.187852
2011-01-21 0.189994
2011-01-28 0.185989
2011-02-04 0.175029
'''
print(max(stockdata['volume']))
'''1453438639'''
print(max(stockdata['percent_change_price']))
'''7.62174'''
print(stockdata.index)
'''
DatetimeIndex(['2011-01-07', '2011-01-14', '2011-01-21', '2011-01-28',
'2011-02-04', '2011-02-11', '2011-02-18', '2011-02-25',
'2011-03-04', '2011-03-11', '2011-03-18', '2011-03-25',
'2011-01-07', '2011-01-14', '2011-01-21', '2011-01-28',
'2011-02-04', '2011-02-11', '2011-02-18', '2011-02-25',
'2011-03-04', '2011-03-11', '2011-03-18', '2011-03-25',
'2011-01-07', '2011-01-14', '2011-01-21', '2011-01-28',
'2011-02-04', '2011-02-11', '2011-02-18', '2011-02-25',
'2011-03-04', '2011-03-11', '2011-03-18', '2011-03-25',
'2011-01-07', '2011-01-14', '2011-01-21', '2011-01-28',
'2011-02-04', '2011-02-11', '2011-02-18', '2011-02-25',
'2011-03-04', '2011-03-11', '2011-03-18', '2011-03-25',
'2011-01-07', '2011-01-14', '2011-01-21', '2011-01-28',
'2011-02-04', '2011-02-11', '2011-02-18', '2011-02-25',
'2011-03-04', '2011-03-11', '2011-03-18', '2011-03-25',
'2011-01-07', '2011-01-14', '2011-01-21', '2011-01-28',
'2011-02-04', '2011-02-11', '2011-02-18', '2011-02-25',
'2011-03-04', '2011-03-11', '2011-03-18', '2011-03-25',
'2011-01-07', '2011-01-14', '2011-01-21', '2011-01-28',
'2011-02-04', '2011-02-11', '2011-02-18', '2011-02-25',
'2011-03-04', '2011-03-11', '2011-03-18', '2011-03-25',
'2011-01-07', '2011-01-14', '2011-01-21', '2011-01-28',
'2011-02-04', '2011-02-11', '2011-02-18', '2011-02-25',
'2011-03-04', '2011-03-11', '2011-03-18', '2011-03-25',
'2011-01-07', '2011-01-14', '2011-01-21', '2011-01-28'],
dtype='datetime64[ns]', name='date', freq=None)
'''
print(stockdata.index.day)
'''
Int64Index([ 7, 14, 21, 28, 4, 11, 18, 25, 4, 11, 18, 25, 7, 14, 21, 28, 4,
11, 18, 25, 4, 11, 18, 25, 7, 14, 21, 28, 4, 11, 18, 25, 4, 11,
18, 25, 7, 14, 21, 28, 4, 11, 18, 25, 4, 11, 18, 25, 7, 14, 21,
28, 4, 11, 18, 25, 4, 11, 18, 25, 7, 14, 21, 28, 4, 11, 18, 25,
4, 11, 18, 25, 7, 14, 21, 28, 4, 11, 18, 25, 4, 11, 18, 25, 7,
14, 21, 28, 4, 11, 18, 25, 4, 11, 18, 25, 7, 14, 21, 28],
dtype='int64', name='date')
'''
print(stockdata.index.month)
'''
Int64Index([1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3,
3, 3, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 1, 1, 1, 1, 2, 2, 2, 2,
3, 3, 3, 3, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 1, 1, 1, 1, 2, 2,
2, 2, 3, 3, 3, 3, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 1, 1, 1, 1,
2, 2, 2, 2, 3, 3, 3, 3, 1, 1, 1, 1],
dtype='int64', name='date')
'''
print(stockdata.index.year)
'''
Int64Index([2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,
2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,
2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,
2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,
2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,
2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,
2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,
2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,
2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,
2011],
dtype='int64', name='date')
'''
print(stockdata.resample('M').apply(np.sum))
'''
quarter volume percent_change_price \
date
2011-01-31 36 6779916771 19.637287
2011-02-28 32 5713027799 28.553732
2011-03-31 32 5535580114 -7.317345
percent_change_volume_over_last_wk previous_weeks_volume \
date
2011-01-31 165.675299 5.057285e+09
2011-02-28 279.247846 6.077730e+09
2011-03-31 -23.774935 5.596445e+09
percent_change_next_weeks_price days_to_next_dividend \
date
2011-01-31 34.302458 2618
2011-02-28 -4.583387 1637
2011-03-31 3.263918 1560
percent_return_next_dividend
date
2011-01-31 18.519712
2011-02-28 13.819996
2011-03-31 13.930990
'''
# 8.3.3 Column transformation
# drop a column (note: drop returns a new DataFrame; stockdata itself is left unchanged here)
stockdata.drop(["percent_change_volume_over_last_wk"], axis = 1)
stockdata_new = pd.DataFrame(stockdata, columns = ['stock', 'open', 'high', "low", "close", "volume"])
print(stockdata_new.head())
'''
stock open high low close volume
date
2011-01-07 AA $15.82 $16.72 $15.78 $16.42 239655616
2011-01-14 AA $16.71 $16.71 $15.64 $15.97 242963398
2011-01-21 AA $16.19 $16.38 $15.60 $15.79 138428495
2011-01-28 AA $15.87 $16.63 $15.82 $16.13 151379173
2011-02-04 AA $16.18 $17.39 $16.18 $17.14 154387761
'''
stockdata["previous_weeks_volume"] = 0
# 8.3.4 Noisy data
#print(stockdata.head())
print(stockdata.dropna().head(2))
'''
quarter stock open high low close volume \
date
2011-01-14 1 AA $16.71 $16.71 $15.64 $15.97 242963398
2011-01-21 1 AA $16.19 $16.38 $15.60 $15.79 138428495
percent_change_price percent_change_volume_over_last_wk \
date
2011-01-14 -4.42849 1.380223
2011-01-21 -2.47066 -43.024959
previous_weeks_volume next_weeks_open next_weeks_close \
date
2011-01-14 0 $16.19 $15.79
2011-01-21 0 $15.87 $16.13
percent_change_next_weeks_price days_to_next_dividend \
date
2011-01-14 -2.47066 19
2011-01-21 1.63831 12
percent_return_next_dividend
date
2011-01-14 0.187852
2011-01-21 0.189994
'''
print(stockdata_new.open.describe())
'''
count 100
unique 99
top $43.86
freq 2
Name: open, dtype: object
'''
stockdata_new.open = pd.to_numeric(stockdata_new.open.str.replace('$', ''))
stockdata_new.close = pd.to_numeric(stockdata_new.close.str.replace('$', ''))
print(stockdata_new.open.describe())
'''
count 100.000000
mean 51.286800
std 32.154889
min 13.710000
25% 17.705000
50% 46.040000
75% 72.527500
max 106.900000
Name: open, dtype: float64
'''
stockdata_new['newopen'] = stockdata_new.open.apply(lambda x: 0.8*x)
print(stockdata_new.newopen.head(5))
'''
date
2011-01-07 12.656
2011-01-14 13.368
2011-01-21 12.952
2011-01-28 12.696
2011-02-04 12.944
Name: newopen, dtype: float64
'''
stockAA = stockdata_new.query('stock=="AA"')
print(stockAA.head())
'''
stock open high low close volume newopen
date
2011-01-07 AA 15.82 $16.72 $15.78 16.42 239655616 12.656
2011-01-14 AA 16.71 $16.71 $15.64 15.97 242963398 13.368
2011-01-21 AA 16.19 $16.38 $15.60 15.79 138428495 12.952
2011-01-28 AA 15.87 $16.63 $15.82 16.13 151379173 12.696
2011-02-04 AA 16.18 $17.39 $16.18 17.14 154387761 12.944
'''
# 8.4 matplotlib
import matplotlib.pyplot as plt
from matplotlib import figure
stockCSCO = stockdata_new.query('stock=="CSCO"')
stockCSCO.head()
plt.figure()
plt.scatter(stockdata_new.index.date, stockdata_new.volume)
plt.xlabel('day')
plt.ylabel('trading volume')
plt.title('title')
plt.savefig("nltkplot01.png")
# 8.4.1 Subplots
plt.subplot(2, 2, 1)
plt.plot(stockAA.index.weekofyear, stockAA.open, 'r--')
plt.subplot(2, 2, 2)
plt.plot(stockCSCO.index.weekofyear, stockCSCO.open, 'g-*')
plt.subplot(2, 2, 3)
plt.plot(stockAA.index.weekofyear, stockAA.open, 'g--')
plt.subplot(2, 2, 4)
plt.plot(stockCSCO.index.weekofyear, stockCSCO.open, 'r-*')
plt.savefig('nltkplot02.png')
# quick test: plot the same data on two side-by-side axes
x = [1, 3, 4, 5, 8, 14]
y = [0, 2, 4, 7, 9, 19]
fig, axes = plt.subplots(nrows = 1, ncols = 2)
for ax in axes:
ax.plot(x, y, 'r')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('title')
#plt.show(ax)
plt.savefig("nltkplot03.png")
# 8.4.2 Adding axes
fig = plt.figure()
axes = fig.add_axes([0.1, 0.1, 0.8, 0.8])
axes.plot(x, y, 'r')
#plt.show(axes)
plt.savefig("nltkplot04.png")
fig = plt.figure()
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
ax.plot(stockAA.index.weekofyear, stockAA.open, label="AA")
ax.plot(stockCSCO.index.weekofyear, stockCSCO.open, label="CSCO")
ax.set_xlabel("weekofyear")
ax.set_ylabel("stock value")
ax.set_title('Weekly change in stock price')
ax.legend(loc = 2)
plt.savefig("nltkplot05.png")
# 8.4.3 Scatter plots
plt.scatter(stockAA.index.weekofyear, stockAA.open)
plt.savefig("nltkplot06.png")
# 8.4.4 Bar charts
n = 12
X = np.arange(n)
Y1 = np.random.uniform(0.5, 1.0, n)
Y2 = np.random.uniform(0.5, 1.0, n)
plt.bar(X, +Y1, facecolor='#9999ff', edgecolor='white')
plt.bar(X, -Y2, facecolor='#ff9999', edgecolor='white')
plt.savefig("nltkplot07.png")
# 8.4.5 3D plotting
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = Axes3D(fig)
X = np.arange(-4, 4, 0.25)
Y = np.arange(-4, 4, 0.25)
X, Y = np.meshgrid(X, Y)
R = np.sqrt(X**2 + Y**2)
Z = np.sin(R)
ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap='hot')
#plt.show(ax)
plt.savefig("nltkplot08.png")