import pymarkdown
import codecs
# Read the source Markdown file as UTF-8 text.
with codecs.open('example.md', 'r', encoding='utf-8') as file:
    markdown_text = file.read()

# Process the Chinese text: replace every occurrence of '中文'.
processed_text = markdown_text.replace('中文', 'Python中文文本处理')

# Write the processed text to a new Markdown file.
# The original snippet opened the output file but never wrote to it,
# which left an empty `with` body (a syntax error) and an empty file.
with codecs.open('processed_example.md', 'w', encoding='utf-8') as file:
    file.write(processed_text)
import jieba
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# English tokenization: split the sentence into word tokens.
text = "This is an example sentence."
tokens = word_tokenize(text)

# Chinese tokenization: jieba.lcut returns the segments directly as a list.
text = "中文文本处理很有趣。"
tokens = jieba.lcut(text)
import jieba
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK stop-word corpus used below.
nltk.download('stopwords')

# Remove English stop words (comparison is case-insensitive).
text = "This is an example sentence."
tokens = word_tokenize(text)
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print(filtered_tokens)

# Remove Chinese stop words.
text = "中文文本处理很有趣。"
tokens = list(jieba.cut(text))
# A set gives O(1) membership tests; the contents are unchanged.
stop_words = {'的', '很', '有趣'}
filtered_tokens = [word for word in tokens if word not in stop_words]
print(filtered_tokens)
import jieba.posseg as pseg
import nltk

# English part-of-speech tagging: tokenize, then tag each token.
# word_tokenize comes from the `from nltk.tokenize import ...` line earlier in the file.
text = "This is an example sentence."
tokens = word_tokenize(text)
tags = nltk.pos_tag(tokens)

# Chinese part-of-speech tagging: pseg.cut yields pair objects that carry
# the segmented word and its POS flag.
text = "中文文本处理很有趣。"
tokens = pseg.cut(text)
tags = [(pair.word, pair.flag) for pair in tokens]
import re

import jieba
import jieba.posseg as pseg
import nltk
from nltk.tokenize import word_tokenize

# Download the resources requested by the original snippet.
nltk.download('maxent_ne_chunker')
nltk.download('words')

# English named-entity recognition: tokenize -> POS-tag -> NE-chunk.
text = "Barack Obama was born in Hawaii."
tokens = word_tokenize(text)
tags = nltk.pos_tag(tokens)
entities = nltk.chunk.ne_chunk(tags)
print(entities)

# Chinese entity extraction based on jieba POS flags.
text = "没有什么比机场远避世外桃源更让人向往"
# Load a user dictionary, intended to keep person names, place names and
# similar entities together as single tokens.
jieba.load_userdict('user_dict.txt')
tokens = pseg.cut(text)
# The original tested '^n', '^nr', '^ns' and '^nt' with `or`; every one of
# those is already covered by "flag starts with 'n'", so a single prefix
# check selects exactly the same words.
entities = [word for word, tag in tokens if tag.startswith('n')]
print(entities)
from snownlp import SnowNLP
from textblob import TextBlob

# English sentiment analysis: classify by the sign of TextBlob's polarity.
text = "This is an awful movie!"
blob = TextBlob(text)
polarity = blob.sentiment.polarity
if polarity > 0:
    print("Positive")
elif polarity < 0:
    print("Negative")
else:
    print("Neutral")

# Chinese sentiment analysis: SnowNLP's score is compared against 0.5.
text = "这家饭店的菜很好吃!"
s = SnowNLP(text)
if s.sentiments > 0.5:
    print("Positive")
else:
    print("Negative")
import os

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# ---------------------------------------------------------------------------
# English text classification
# ---------------------------------------------------------------------------
train_data = [
    "This is a good movie.",
    "This is a bad movie.",
    "The acting in this movie is great.",
]
train_labels = ["positive", "negative", "positive"]
test_data = ["I love this film very much."]

vectorizer = TfidfVectorizer(stop_words="english")
train_vectors = vectorizer.fit_transform(train_data)
test_vectors = vectorizer.transform(test_data)

classifier = MultinomialNB().fit(train_vectors, train_labels)
predicted_label = classifier.predict(test_vectors)
print(predicted_label)

# ---------------------------------------------------------------------------
# Chinese text classification
# ---------------------------------------------------------------------------
train_path = "./data/train/"
test_path = "./data/test/"


def _load_labeled_corpus(root):
    """Read a corpus laid out as ``root/<label>/<file>``.

    Hidden entries (names starting with ".") are skipped, as are top-level
    entries that are not directories.

    Returns:
        A ``(texts, labels)`` pair of equal-length lists, where ``labels[i]``
        is the name of the sub-directory that ``texts[i]`` was read from.
    """
    texts = []
    labels = []
    for label in os.listdir(root):
        if label.startswith("."):
            continue
        dir_path = os.path.join(root, label)
        if not os.path.isdir(dir_path):
            continue
        for file_name in os.listdir(dir_path):
            if file_name.startswith("."):
                continue
            file_path = os.path.join(dir_path, file_name)
            with open(file_path, "r", encoding="utf-8") as f:
                texts.append(f.read())
            labels.append(label)
    return texts, labels


# Load the training and test sets.
train_data, train_labels = _load_labeled_corpus(train_path)
test_data, test_labels = _load_labeled_corpus(test_path)

# Segment and vectorize the documents. Chinese text has no spaces between
# words, so the vectorizer is given jieba as its tokenizer; the original
# used the default regex tokenizer with English stop words and never
# segmented the text. token_pattern=None because a custom tokenizer is set.
vectorizer = TfidfVectorizer(tokenizer=jieba.lcut, token_pattern=None)
train_vectors = vectorizer.fit_transform(train_data)
test_vectors = vectorizer.transform(test_data)

# Train the classifier and predict labels for the test set.
classifier = MultinomialNB().fit(train_vectors, train_labels)
predicted_labels = classifier.predict(test_vectors)
print(predicted_labels)
import os

import gensim
import jieba
from gensim import corpora

# ---------------------------------------------------------------------------
# English topic modeling
# ---------------------------------------------------------------------------
documents = [
    "Machine learning is useful in data analysis.",
    "Python is a popular programming language.",
]
# Lower-case each document and split it on whitespace.
texts = [document.lower().split() for document in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda = gensim.models.ldamodel.LdaModel(
    corpus=corpus, num_topics=3, id2word=dictionary, passes=10
)
for topic in lda.print_topics(num_topics=3, num_words=3):
    print(topic)

# ---------------------------------------------------------------------------
# Chinese topic modeling
# ---------------------------------------------------------------------------
data_path = "./data/"

# Load the corpus: one document per regular file directly under data_path.
documents = []
for file_name in os.listdir(data_path):
    if file_name.startswith("."):
        continue
    file_path = os.path.join(data_path, file_name)
    # os.listdir also returns sub-directories (this file's classification
    # snippet uses ./data/train/ and ./data/test/); open() would fail on them.
    if not os.path.isfile(file_path):
        continue
    with open(file_path, "r", encoding="utf-8") as f:
        documents.append(f.read())

# A set gives O(1) membership tests; the contents are unchanged.
stop_words = {"的", "是", "有", "在", "可", "到", "就", "也"}

# Segment each document with jieba and drop the stop words.
texts = [
    [word for word in jieba.cut(document, cut_all=False) if word not in stop_words]
    for document in documents
]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Train the topic model and print the topics.
lda = gensim.models.ldamodel.LdaModel(
    corpus=corpus, num_topics=3, id2word=dictionary, passes=10
)
for topic in lda.print_topics(num_topics=3, num_words=3):
    print(topic)
# Tag a Chinese sentence with the trained sequence-labeling model.
# `word_to_index`, `pad_sequences`, `MAX_LEN`, `model`, `index_to_tag` and
# `np` are not defined in this part of the file and must already be in scope.

# The sentence is pre-segmented with spaces so that split(" ") yields the
# six tokens shown in the sample output that follows this snippet; the
# original had typographic quotes (a syntax error) and no spaces.
test_text = "约翰·福布斯兰顿是 美国 德克萨斯州 的 一位 政治家和商人。"
test_text = test_text.split(" ")

# Map each token to its vocabulary index; tokens missing from the
# vocabulary fall back to index 1.
test = [[word_to_index.get(word, 1) for word in test_text]]
test = pad_sequences(test, maxlen=MAX_LEN)

# Take the prediction for the single input sequence and pick the
# highest-scoring tag at each position.
y_pred = model.predict(test)[0]
y_pred = [index_to_tag[i] for i in np.argmax(y_pred, axis=1)]

# NOTE(review): zip pairs the tokens with the FIRST len(test_text)
# predictions. If pad_sequences pads at the front, those positions are
# padding rather than the tokens - confirm the padding side used here.
print(list(zip(test_text, y_pred)))
[('约翰·福布斯兰顿是', 'O'),
('美国', 'B-LOC'),
('德克萨斯州', 'I-LOC'),
('的', 'O'),
('一位', 'O'),
('政治家和商人。', 'O')]
pip install spacy
python -m spacy download en_core_web_sm
import spacy

# Load the small English pipeline (installed via
# `python -m spacy download en_core_web_sm`).
nlp = spacy.load('en_core_web_sm')

text = "John Forbes-Robertson was born in London, England. He was a British actor and director."
doc = nlp(text)

# Print every recognized entity together with its label.
for ent in doc.ents:
    print(ent.text, ent.label_)
John Forbes-Robertson PERSON
London GPE
England GPE
British NORP
同样,模型成功将“John Forbes-Robertson”识别为人物实体(PERSON),“London”和“England”被标注为地缘政治实体(GPE,即国家、城市等)。同时,“British”被标注为 NORP(民族、宗教或政治团体)实体。其他不属于命名实体的词则没有被标注。
import spacy
from spacy.tokens import Span

# `nlp` is the pipeline loaded earlier in the file with
# spacy.load('en_core_web_sm').

# Define the new entity type MAGIC_ITEM on the pipeline's NER component.
# add_label takes the label *string*; the original passed a hash through
# the removed `nlp.entity` attribute.
nlp.get_pipe("ner").add_label("MAGIC_ITEM")

# Intern the label in the vocabulary and keep its ID. StringStore.add both
# stores the string and returns its hash, so the ID resolves back to text.
MAGIC_ITEM = nlp.vocab.strings.add("MAGIC_ITEM")
mitem_id = nlp.vocab.strings["MAGIC_ITEM"]

# Mark "Sword of Gryffindor" as MAGIC_ITEM.
text = "Harry and his friends went to Gringotts to retrieve the Sword of Gryffindor from the goblins."
doc = nlp(text)

# Locate the phrase by character offset rather than a hard-coded token
# index (the original used doc[11], which is the single token "of").
phrase_start = text.index("Sword of Gryffindor")
located = doc.char_span(phrase_start, phrase_start + len("Sword of Gryffindor"))
mitem_span = Span(doc, located.start, located.end, label=mitem_id)

# doc.ents rejects overlapping spans, so drop any predicted entity that
# overlaps the new span before adding it.
kept_ents = [
    ent for ent in doc.ents
    if ent.end <= mitem_span.start or ent.start >= mitem_span.end
]
doc.ents = kept_ents + [mitem_span]

# Print the annotation result.
for ent in doc.ents:
    print(ent.text, ent.label_)
Sword of Gryffindor MAGIC_ITEM
goblins ORG
这里,“Sword of Gryffindor”被标注为“MAGIC_ITEM”,而“哥布林”被当做普通实体“ORG”(组织机构)处理了。需要注意的是,新标签会被后续的pipelines和训练过程使用。
