
NLP Part-of-Speech Tagging Demo (traindata.txt)

A part-of-speech (POS) tagging demo written in Python: a hidden Markov model is estimated from traindata.txt and decoded with the Viterbi algorithm.

The code and the training data are on GitHub:

https://github.com/howard789/nlp_pos_tag

Test sentence: trying to keep pace with rival Time magazine

The output is as follows:

word: trying
result: VBG
explain: Verb, gerund or present participle 动名词和现在分词
-----------------------------
word: to
result: TO
explain: to 作为介词或不定式格式
-----------------------------
word: keep
result: VB
explain: Verb, base form 动词基本形式
-----------------------------
word: pace
result: NN
explain: Noun, singular or mass 常用名词 单数形式
-----------------------------
word: with
result: IN
explain: Preposition or subordinating conjunction 介词或从属连词
-----------------------------
word: rival
result: JJ
explain: Adjective 形容词或序数词
-----------------------------
word: Time
result: NNP
explain: Proper noun, singular 专有名词,单数形式
-----------------------------
word: magazine
result: NN
explain: Noun, singular or mass 常用名词 单数形式
-----------------------------
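
Behind these tags, the demo estimates a first-order hidden Markov model from traindata.txt: pi is the distribution of sentence-initial tags, A is the tag-to-tag transition matrix, and B is the tag-to-word emission matrix. The Viterbi decoder then searches for the tag sequence maximizing the log-scored path below (a sketch of the objective the code computes; zero probabilities are smoothed to a small constant before taking the log):

$$\hat{t}_{1:n} = \arg\max_{t_1,\dots,t_n}\ \Big[\log \pi(t_1) + \log B(t_1, w_1) + \sum_{i=2}^{n}\big(\log A(t_{i-1}, t_i) + \log B(t_i, w_i)\big)\Big]$$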

Code:

import pandas as pd
import numpy as np


def get_explain_dic():
    # Penn Treebank tag -> human-readable explanation
    dic = {}
    dic['CC'] = 'Coordinating conjunction 连接词'
    dic['CD'] = 'Cardinal number 基数词'
    dic['DT'] = 'Determiner 限定词'
    dic['EX'] = 'Existential there 存在句'
    dic['FW'] = 'Foreign word 外来词'
    dic['IN'] = 'Preposition or subordinating conjunction 介词或从属连词'
    dic['JJ'] = 'Adjective 形容词或序数词'
    dic['JJR'] = 'Adjective, comparative 形容词比较级'
    dic['JJS'] = 'Adjective, superlative 形容词最高级'
    dic['LS'] = 'List item marker 列表标示'
    dic['MD'] = 'Modal 情态助动词'
    dic['NN'] = 'Noun, singular or mass 常用名词 单数形式'
    dic['NNS'] = 'Noun, plural 常用名词 复数形式'
    dic['NNP'] = 'Proper noun, singular 专有名词,单数形式'
    dic['NNPS'] = 'Proper noun, plural 专有名词,复数形式'
    dic['PDT'] = 'Predeterminer 前位限定词'
    dic['POS'] = 'Possessive ending 所有格结束词'
    dic['PRP'] = 'Personal pronoun 人称代词'
    dic['PRP$'] = 'Possessive pronoun 所有格代名词'
    dic['RB'] = 'Adverb 副词'
    dic['RBR'] = 'Adverb, comparative 副词比较级'
    dic['RBS'] = 'Adverb, superlative 副词最高级'
    dic['RP'] = 'Particle 小品词'
    dic['SYM'] = 'Symbol 符号'
    dic['TO'] = 'to 作为介词或不定式格式'
    dic['UH'] = 'Interjection 感叹词'
    dic['VB'] = 'Verb, base form 动词基本形式'
    dic['VBD'] = 'Verb, past tense 动词过去式'
    dic['VBG'] = 'Verb, gerund or present participle 动名词和现在分词'
    dic['VBN'] = 'Verb, past participle 过去分词'
    dic['VBP'] = 'Verb, non-3rd person singular present 动词非第三人称单数'
    dic['VBZ'] = 'Verb, 3rd person singular present 动词第三人称单数'
    dic['WDT'] = 'Wh-determiner 限定词'
    dic['WP'] = 'Wh-pronoun 代词'
    dic['WP$'] = 'Possessive wh-pronoun 所有格代词'
    dic['WRB'] = 'Wh-adverb 疑问代词'
    return dic


def get_data(data_size=0):
    # each line of traindata.txt holds a single "word/TAG" pair
    df = pd.read_table('traindata.txt', header=None, encoding='gb2312', sep='\n', index_col=None)
    df['word'] = df[0].apply(lambda x: str(x).split('/')[0].strip())
    df['tag'] = df[0].apply(lambda x: str(x).split('/')[1].strip())
    df.drop(columns=[0], inplace=True)
    if data_size > 0:
        # keep only the first data_size rows
        del_list = list(range(data_size, df.shape[0]))
        df.drop(index=del_list, inplace=True)
    return df


def get_transfer_mat(df):
    words_uniq = df['word'].unique()
    tags_uniq = df['tag'].unique()
    # pi: distribution of sentence-initial tags
    pi_series = pd.Series(data=0, index=tags_uniq)
    # A: state transition matrix, probability of one tag following another (hidden states)
    A_tag_tag_mat = pd.DataFrame(data=0, index=tags_uniq, columns=tags_uniq)
    # B: emission matrix, probability of observing a word given a tag
    B_tag_word_mat = pd.DataFrame(data=0, index=tags_uniq, columns=words_uniq)
    words = df['word']
    tags = df['tag']
    pre_tag = None
    for i in range(df.shape[0]):
        word = words[i]
        tag = tags[i]
        B_tag_word_mat.loc[tag, word] += 1
        if pre_tag is None:
            # first word of a sentence counts towards pi
            pi_series[tag] += 1
        else:
            A_tag_tag_mat.loc[pre_tag, tag] += 1
        if word == '.':
            # a full stop ends the sentence; the next word starts a new one
            pre_tag = None
        else:
            pre_tag = tag
    # normalize pi
    pi_sum = pi_series.sum()
    pi_series = pi_series / pi_sum
    # normalize A and B, each row by its own total count
    A_tag_tag_mat['sum'] = 0
    B_tag_word_mat['sum'] = 0
    for tag in tags_uniq:
        A_tag_tag_mat.loc[tag, 'sum'] = A_tag_tag_mat.loc[tag, :].sum()
        B_tag_word_mat.loc[tag, 'sum'] = B_tag_word_mat.loc[tag, :].sum()
    for tag in tags_uniq:
        A_tag_tag_mat[tag] = A_tag_tag_mat[tag] / A_tag_tag_mat['sum']
    for word in words_uniq:
        B_tag_word_mat[word] = B_tag_word_mat[word] / B_tag_word_mat['sum']
    A_tag_tag_mat.drop(columns='sum', inplace=True)
    B_tag_word_mat.drop(columns='sum', inplace=True)
    print('finish get_transfer_mat')
    return pi_series, A_tag_tag_mat, B_tag_word_mat


def check_sen(words, B_tag_word_mat):
    # return the first word missing from the training vocabulary, or '' if all words are known
    dics = B_tag_word_mat.columns.tolist()
    for word in words:
        if word not in dics:
            return word
    return ''


def sentence_to_words(sen):
    # split on spaces and separate a trailing comma into its own token
    words = str(sen).split(" ")
    rtn_words = []
    for word in words:
        word = word.strip()
        if len(word) > 0:
            if len(word) > 1 and word.endswith(','):
                rtn_words.append(word[0:len(word) - 1])
                rtn_words.append(',')
            else:
                rtn_words.append(word)
    return rtn_words


def log(value):
    # avoid log(0): treat zero (or missing) probabilities as a very small constant
    if value == 0 or np.isnan(value):
        return np.log(0.000001)
    return np.log(value)


def viterbi(words, pi_series, A_tag_tag_mat, B_tag_word_mat):
    tags = pi_series.index
    T = list(range(len(words)))
    # each dp cell stores (best previous tag, log-probability of the best path ending in this tag)
    dp = pd.DataFrame(data=None, columns=tags, index=T)
    for tag in tags:
        prob = log(pi_series[tag]) + log(B_tag_word_mat.loc[tag, words[0]])
        dp.loc[0, tag] = ("na", prob)
    for i in range(1, dp.shape[0]):
        word = words[i]
        for tag in tags:
            max_prob_tag = None
            max_prob = -np.inf
            for pre_tag in tags:
                prob = dp.loc[i - 1, pre_tag][1] + log(A_tag_tag_mat.loc[pre_tag, tag]) + log(B_tag_word_mat.loc[tag, word])
                if prob > max_prob:
                    max_prob_tag = pre_tag
                    max_prob = prob
            dp.loc[i, tag] = (max_prob_tag, max_prob)
    result = []
    # pick the best tag for the last position
    max_prob_tag = None
    max_prob = -np.inf
    final_index = dp.shape[0] - 1
    pre_tag = None
    for tag in tags:
        pre_tag_tmp = dp.loc[final_index, tag][0]
        prob = float(dp.loc[final_index, tag][1])
        if max_prob_tag is None or prob > max_prob:
            max_prob_tag = tag
            max_prob = prob
            pre_tag = pre_tag_tmp
    result.append(max_prob_tag)
    result.append(pre_tag)
    # follow the back-pointers to recover the earlier tags
    na = False
    index = final_index - 1
    while not na:
        pre_tag = dp.loc[index, pre_tag][0]
        if pre_tag == 'na':
            na = True
        else:
            result.append(pre_tag)
            index -= 1
    result.reverse()
    return result


def print_verb_exp(words, result):
    dic = get_explain_dic()
    for i in range(len(words)):
        print("word:", words[i])
        print("result:", result[i])
        print("explain:", dic[result[i]])
        print("-----------------------------")


if __name__ == '__main__':
    # load the training set (only the first 1000 word/tag pairs, to keep the demo fast)
    df = get_data(data_size=1000)
    # estimate pi, A and B from the data
    pi_series, A_tag_tag_mat, B_tag_word_mat = get_transfer_mat(df)
    # test sentence (an alternative one is kept commented out)
    # sen = "Social Security number , passport number and details about the services provided for the payment"
    sen = "trying to keep pace with rival Time magazine"
    words = sentence_to_words(sen)
    # make sure every test word appears in the training vocabulary
    result = check_sen(words, B_tag_word_mat)
    if result == '':
        # Viterbi decoding
        print("viterbi start")
        result = viterbi(words, pi_series, A_tag_tag_mat, B_tag_word_mat)
        # print the tags with explanations
        print_verb_exp(words, result)
    else:
        print("word '{}' is not in the training vocabulary".format(result))
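
A note on the input format: get_data expects every line of traindata.txt to hold a single word/TAG pair separated by a slash, and the training loop treats a '.' token as the end of a sentence, so the next word counts towards pi rather than A. The lines below only illustrate that format and are not actual lines from the training file:

    trying/VBG
    to/TO
    keep/VB
    pace/NN
    ./.

Also note that there is no handling of unknown words: check_sen simply rejects any test sentence containing a word that never appears in the training vocabulary.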