Notes on Python's re module:
- import re
- # returns a Match object (truthy) if the pattern occurs in the string, else None
- re.search(regex, string)
- # returns a list of all non-overlapping matches in the string
- re.findall(regex, string)
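A quick check of both calls (the pattern and test string here are made up for illustration):
- import re
- print(bool(re.search(r'\d+', 'order 66 and 99')))   # True: at least one match exists
- print(re.findall(r'\d+', 'order 66 and 99'))        # ['66', '99']: every match as a list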
Replacing empty values with '0':
- import numpy as np
- matrix = np.genfromtxt("....csv", dtype='U75', skip_header=1, delimiter=',')
- for i in range(matrix.shape[1]):        # iterate over the columns of the matrix
-     column = (matrix[:, i] == '')       # boolean mask of the empty cells in column i
-     matrix[column, i] = '0'
- # data type conversion: cast a column (e.g. vector = matrix[:, 0]) to numbers
- vector = vector.astype(float)
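The same masking trick on a small in-memory array, so it can be checked without a CSV file (toy data made up here):
- import numpy as np
- demo = np.array([['1', ''], ['', '4']], dtype='U75')
- for i in range(demo.shape[1]):
-     demo[demo[:, i] == '', i] = '0'     # replace empty strings column by column
- print(demo.astype(float))              # [[1. 0.] [0. 4.]]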
- import nltk
- nltk.download('gutenberg')
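Note: the sentence and word tokenizers used later also need the punkt model, so it is worth downloading it here as well:
- nltk.download('punkt')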
Ubuntu ships with both Python 2 and Python 3. To set the default Python version and switch between them:
- Just run these two commands:
- sudo update-alternatives --install /usr/bin/python python /usr/bin/python2 100
- sudo update-alternatives --install /usr/bin/python python /usr/bin/python3 150
- To switch to Python 2, run:
- sudo update-alternatives --config python
- Type the number of the desired choice at the prompt and press Enter.
The pip that ships with Ubuntu is too old to install nltk, so force-reinstall pip:
- curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
- sudo python get-pip.py --force-reinstall
Then run:
pip install nltk
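A quick sanity check that the install worked (nltk exposes its version string):
- import nltk
- print(nltk.__version__)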
gedit error: gedit is a graphical editor and cannot be opened on a server without a GUI, so the gedit editor is unavailable.
In that case, edit the configuration file directly:
- JAVA_HOME=/usr/lib/jvm/java1.8
- JRE_HOME=/usr/lib/jvm/java1.8/jre
- PATH=$JAVA_HOME/bin:$PATH
- CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
- export JAVA_HOME
- export JRE_HOME
- export CLASSPATH
- export PATH
Append these same lines, then check with java -version (a single dash); if the version prints, the setup succeeded.
Location of the nltk package on Linux: /usr/local/lib/python3.6/dist-packages
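The install location can also be confirmed from inside Python (this works for any installed package):
- import nltk, site
- print(nltk.__file__)            # where the imported package actually lives
- print(site.getsitepackages())   # directories Python searches for site packages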
Using Python NLTK with the Stanford NLP toolkit for tokenization, POS tagging, and syntactic parsing
Some points that were unclear:
For example, where is the stanfordNLTK directory the article mentions?
The author's resource link seems to be mislabeled, and I doubt I'll get a reply any time soon, so I downloaded the resources one by one and organized them myself:
- import nltk
- from nltk.corpus import gutenberg
- from pprint import pprint
- import numpy as np
- alice = gutenberg.raw(fileids='carroll-alice.txt')
- default_st = nltk.sent_tokenize
- alice_sentences = default_st(text=alice)
-
- print('\nTotal sentences in alice:', len(alice_sentences))
- print('First 5 sentences in alice:-')
- print(np.array(alice_sentences[0:5]))
- punkt_st = nltk.tokenize.PunktSentenceTokenizer()
- sample_sentences = punkt_st.tokenize(sample_text)  # sample_text: a raw text string defined earlier in the article
- print(np.array(sample_sentences))
Sentence tokenization with a regular expression:
- SENTENCE_TOKENS_PATTERN = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\.|\?|\!)\s'
- regex_st = nltk.tokenize.RegexpTokenizer(
-     pattern=SENTENCE_TOKENS_PATTERN,
-     gaps=True)
- sample_sentences = regex_st.tokenize(sample_text)
- print(np.array(sample_sentences))
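A quick check of what the lookbehinds buy, reusing regex_st from above (the test sentence is made up here): abbreviations like "U.S." and titles like "Mr." no longer end a sentence:
- demo = "He said U.S. markets fell. Mr. Brown disagreed! Did he?"
- print(regex_st.tokenize(demo))
- # ['He said U.S. markets fell.', 'Mr. Brown disagreed!', 'Did he?']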
Default word tokenizer:
- default_wt = nltk.word_tokenize
- words = default_wt(sample_text)
- np.array(words)
Treebank tokenizer:
- treebank_wt = nltk.TreebankWordTokenizer()
- words = treebank_wt.tokenize(sample_text)
- np.array(words)
Regex word tokenizer: with pattern=r'\w+' it would match the tokens themselves; the version below splits on whitespace gaps instead:
- GAP_PATTERN = r'\s+'
- regex_wt = nltk.RegexpTokenizer(pattern=GAP_PATTERN,
-                                 gaps=True)
- words = regex_wt.tokenize(sample_text)
- np.array(words)
- word_indices = list(regex_wt.span_tokenize(sample_text))
- print(word_indices)  # spans (start, end) of each token in the original text
- print(np.array([sample_text[start:end] for start, end in word_indices]))
- # prints each token recovered from its span
- def tokenize_text(text):
-     sentences = nltk.sent_tokenize(text)
-     word_tokens = [nltk.word_tokenize(sentence) for sentence in sentences]
-     return word_tokens
-
- sents = tokenize_text(sample_text)
- np.array(sents)
-
- words = [word for sentence in sents for word in sentence]
- np.array(words)
Faster sentence and word tokenization with spaCy:
- import spacy
- # requires a concrete English model: python -m spacy download en_core_web_sm
- # (older spaCy accepted spacy.load('en_core', parse=True, tag=True, entity=True))
- nlp = spacy.load('en_core_web_sm')
-
- text_spacy = nlp(sample_text)
- sents = np.array(list(text_spacy.sents))
- sent_words = [[word.text for word in sent] for sent in sents]
- np.array(sent_words)
- words = [word.text for word in text_spacy]
- np.array(words)
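The title also promises POS tagging; the spaCy pipeline above has already tagged every token (assuming the en_core_web_sm model loaded earlier):
- print([(token.text, token.pos_) for token in text_spacy][:10])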
Removing accented characters:
- import unicodedata
-
- def remove_accented_chars(text):
-     text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
-     return text
-
- remove_accented_chars('Sómě Áccěntěd těxt')
Removing special characters:
- import re
-
- def remove_special_characters(text, remove_digits=False):
-     # a-zA-Z, not a-z plus A-z: the range A-z also matches [, \, ], ^, _ and backtick
-     pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
-     text = re.sub(pattern, '', text)
-     return text
-
- remove_special_characters("Well this was fun! What do you think? 123#@!",
- remove_digits=True)
Expanding contractions:
- from contractions import CONTRACTION_MAP
- import re
-
- def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
-
-     contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
-                                       flags=re.IGNORECASE|re.DOTALL)
-     def expand_match(contraction):
-         match = contraction.group(0)
-         first_char = match[0]
-         # look the match up as-is first, then lowercased, keeping the original first letter
-         expanded_contraction = contraction_mapping.get(match)\
-             if contraction_mapping.get(match)\
-             else contraction_mapping.get(match.lower())
-         expanded_contraction = first_char+expanded_contraction[1:]
-         return expanded_contraction
-
-     expanded_text = contractions_pattern.sub(expand_match, text)
-     expanded_text = re.sub("'", "", expanded_text)
-     return expanded_text
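A minimal usage sketch, with a tiny inline map standing in for the full CONTRACTION_MAP from the download below:
- CONTRACTION_MAP = {"can't": "cannot", "i'd": "i would", "y'all": "you all"}
- print(expand_contractions("Y'all can't say I'd skip it.", CONTRACTION_MAP))
- # You all cannot say I would skip it.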
Contraction list: link: https://pan.baidu.com/s/1qu44acyb6pwMuUtfBqimig extraction code: 5rnf