当前位置:   article > 正文

基于Python的简易文本分析程序_python文本分析代码实现

python文本分析代码实现
"""
项目介绍

情景假设:一位考生根据题目要求写出了一份作文,考生略做修改后提交第二份文档,
        考生将题目、两份作文输入程序后得到一份评分报告。评分报告给出以百分数为形式的数据为考生提供修改建议。
         
实现功能:该项目主要关注以下几点:主题契合度、词汇高级度、查重率与体检报告。
        考生输入文本后,评分报告以pyGUI的形式呈现给用户,显示各类指标、图表和建议。
"""
import functools
import hashlib
import os
import re
import string
import time
from tkinter import *
from tkinter import scrolledtext

import matplotlib

matplotlib.use("TkAgg")  # the backend must be selected before pyplot is imported
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pds
from sklearn.feature_extraction.text import TfidfVectorizer
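# Project overview
#
# Scenario: a student writes an essay for a given prompt, revises it slightly
# and submits a second version. The prompt and both essays are fed to the
# program, which produces a scoring report; the report expresses its findings
# as percentages to suggest where the essay should be revised.
#
# Features: the project focuses on topic relevance, vocabulary level,
# duplication rate and a "health check" report. Once the texts are supplied,
# the report is presented in a GUI showing the metrics, charts and suggestions.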
  35. # 数据库输入
  36. word_storage = pds.read_excel('词库.xlsx')
  37. """
  38. 从不同词库中提取单词
  39. 词库有四个,分别为:初中词汇,高中词汇,四六级词汇,托福词
  40. """
  41. """
  42. TF-IDF计算词频以体现主题契合度,徐铭阳提供
  43. """
  44. def TF_IDF():
  45. # 函数封装:对词频进行统计
  46. def word_tfidf_calculating(the_address_of_text):
  47. # 读取文本
  48. with open(the_address_of_text) as f:
  49. text=f.read()
  50. # 提取关键词
  51. tfidf=TfidfVectorizer(stop_words='english', tokenizer=None)
  52. tfs=tfidf.fit_transform([text])
  53. # 获取关键词和对应的得分
  54. feature_names=tfidf.get_feature_names_out()
  55. scores=tfs.todense().tolist()[0]
  56. # 将关键词和对应的得分组成元组,然后根据得分进行排序
  57. # 将关键词和对应的得分组成元组,然后根据得分进行排序
  58. keywords_in_list=sorted(zip(feature_names, scores), key=lambda x: x[1], reverse=True)
  59. keywords_in_dic=dict(keywords_in_list)
  60. return keywords_in_list
  61. # 调用TF-IDF函数,实现对题目、文本对词频统计
  62. keyword_topic=word_tfidf_calculating("topic.txt")
  63. keyword_text1=word_tfidf_calculating("text1.txt")
  64. keyword_text2=word_tfidf_calculating("text2.txt")
  65. # keyword_topic为一个字典,其键为关键词,值为得分,找出得分前十的关键词,以字典形式输出
  66. # 函数封装:输出得分前十的关键词,以字典形式输出 # 函数封装:输出得分前十的关键词,以字典形式输出
  67. def the_top_word_output(which_text):
  68. the_top_word={}
  69. for item in which_text[:10]:
  70. the_top_word[item[0]]=item[1]
  71. return the_top_word
  72. top_word_topic = the_top_word_output(keyword_topic)
  73. top_word_text1 = the_top_word_output(keyword_text1)
  74. top_word_topi2 = the_top_word_output(keyword_text2)
  75. # 函数封装: 关键词词频得分计算
  76. def the_total_score_of_keyword(which_text):
  77. set1=set([item[0] for item in keyword_topic])
  78. set2=set(item[0] for item in which_text)
  79. keyword_common_elements=list(set1.intersection(set2))
  80. # 函数封装:从列表中抓取元素
  81. def extract_element1_in_list(the_list, element0):
  82. global element1
  83. for item in the_list:
  84. if item[0] == element0:
  85. element1=item[1]
  86. return element1
  87. # 利用自定义数学公式计算得分
  88. the_total_score_of_keyword_in_circle=0
  89. for marks_of_keys in keyword_common_elements:
  90. the_total_score_of_keyword_in_circle+=extract_element1_in_list(keyword_topic, marks_of_keys) \
  91. * extract_element1_in_list(which_text, marks_of_keys)
  92. return the_total_score_of_keyword_in_circle
  93. # 调用函数:求得text1与text2的得分
  94. score_of_text1=the_total_score_of_keyword(keyword_text1)
  95. score_of_text2=the_total_score_of_keyword(keyword_text2)
  96. # 得出关键词词频结论
  97. the_ratio_after_modified=1 - score_of_text2 / score_of_text1
  98. print("您的文章修改后,关键词复现词频得分提高了{:%}".format(the_ratio_after_modified))
  99. return [top_word_topic, top_word_text1, top_word_topi2]
  100. x = TF_IDF()
# SimHash + Hamming distance: compares text similarity in order to compute
# the duplication rate (contributed by Xu Mingyang).
  104. def CALC_repetition(text1_address, text2_address):
  105. def CALC_SimHash_Hamming(text1_address1, text2_address1): # 徐铭阳提供
  106. """
  107. 计算出两个文本的哈希值,返回汉明距离
  108. """
  109. global distance_between_texts
  110. with open(text1_address, 'r') as f:
  111. text_topic=f.read()
  112. with open(text2_address, 'r') as f:
  113. text1=f.read()
  114. class Simhash:
  115. def __init__(self, text, hashbits=64):
  116. self.hashbits=hashbits
  117. self.hash=self.simhash(text)
  118. def __str__(self):
  119. return str(self.hash)
  120. def simhash(self, tokens):
  121. # 初始化一个64位的列表,用于存储特征哈希值
  122. v=[0] * self.hashbits
  123. # 遍历每一个单词
  124. for t in [self._string_hash(x) for x in tokens]:
  125. # 对每一个哈希值的每一个位进行更新
  126. for i in range(self.hashbits):
  127. bitmask=1 << i
  128. if t & bitmask:
  129. v[i]+=1 # 如果该位为1,则该位的计数器加1
  130. else:
  131. v[i]-=1 # 如果该位为0,则该位的计数器减1
  132. fingerprint=0
  133. for i in range(self.hashbits):
  134. if v[i] >= 0:
  135. fingerprint|=1 << i # 如果计数器大于等于0,则该位设为1
  136. return fingerprint
  137. def _string_hash(self, v):
  138. # 将字符串进行hash,产生一个64位的整数
  139. return int(hashlib.md5(v.encode('utf-8')).hexdigest(), 16)
  140. def hamming_distance(self, other):
  141. # 计算两个整数二进制表示的汉明距离
  142. x=(self.hash ^ other.hash) & ((1 << self.hashbits) - 1)
  143. d=0
  144. while x:
  145. d+=1
  146. x&=x - 1
  147. return d
  148. def get_words(text):
  149. # 按照空格进行分词,同时过滤掉长度小于等于1的词语
  150. words=re.compile('\w+').findall(text.lower())
  151. return [w for w in words if len(w) > 1]
  152. def get_stopwords():
  153. # 获取停用词表
  154. stop_words=set()
  155. with open("停用词.txt", "r", encoding="utf-8") as f:
  156. for line in f:
  157. stop_words.add(line.strip())
  158. return stop_words
  159. if __name__ == '__main__':
  160. text_a=text_topic
  161. text_b=text1
  162. words1=get_words(text_a)
  163. words2=get_words(text_b)
  164. stopwords=get_stopwords()
  165. words1=[w for w in words1 if w not in stopwords]
  166. words2=[w for w in words2 if w not in stopwords]
  167. sh1=Simhash(words1)
  168. sh2=Simhash(words2)
  169. distance_between_texts=sh1.hamming_distance(sh2)
  170. return distance_between_texts
  171. def text_split(text):
  172. """
  173. 将文本按句子划分
  174. """
  175. with open(text, 'r') as f:
  176. text_after_split=f.read()
  177. sentences=text_after_split.split(". ")
  178. sentences=[s.strip() for s in sentences]
  179. return sentences
  180. def text_compared_based_on_sentences(list_sentence_of_text1,
  181. list_sentence_of_text2):
  182. """
  183. 以两个被分割的文本进行比较,返回相同的句子数目、总句子数所组成的列表
  184. 其中text1作为文本库,text2作为待查重的文本
  185. 返回「重复的句子的单词数,待查重的文本的单词数」
  186. """
  187. def CALC_words_in_sentences(list1):
  188. word_lst=' '.join(list1).split()
  189. return len(word_lst)
  190. list1=text_split(list_sentence_of_text1)
  191. list2=text_split(list_sentence_of_text2)
  192. the_words_amount_of_list2=CALC_words_in_sentences(list2)
  193. amount_of_words_of_similar_sentence=0 # 双列表遍历后找哈希近似,统计重复的句子数和句子词数
  194. for element1 in list1:
  195. for element2 in list2:
  196. if CALC_SimHash_Hamming(element1, element2) <= 1000:
  197. the_words_amount_similar=CALC_words_in_sentences(element2)
  198. amount_of_words_of_similar_sentence+=the_words_amount_similar
  199. info_text_compared_based_on_sentences=[amount_of_words_of_similar_sentence, the_words_amount_of_list2]
  200. return info_text_compared_based_on_sentences
  201. a=text_compared_based_on_sentences(text1_address, text2_address)
  202. def CALC_ratio(info_text_compared_based_on_sentences):
  203. """
  204. 输入被计算后的文本比较信息,其格式为:「重复的句子的单词数,待查重的文本的单词数」
  205. 输出目标文本的重复比例
  206. 其中text1作为文本库,text2作为待查重的文本
  207. """
  208. the_amount_of_sentence_in_texts=int(info_text_compared_based_on_sentences[0])
  209. the_amount_of_sentence_similar=int(info_text_compared_based_on_sentences[1])
  210. the_ratio_of_repetition=the_amount_of_sentence_similar / the_amount_of_sentence_in_texts
  211. print(f"您这篇文章的重复率为:{the_ratio_of_repetition:.2%}")
  212. return the_ratio_of_repetition
  213. b=CALC_ratio(a)
  214. return b
  215. rating1_of_repetition = CALC_repetition("topic.txt",
  216. "text1.txt")
  217. rating2_of_repetition = CALC_repetition("topic.txt",
  218. "text2.txt")
  219. ratio_of_repetition = rating2_of_repetition/rating1_of_repetition
# Advanced-vocabulary statistics and scoring.
  223. class Storage_level: # 定义Storage_level类
  224. def __init__(self, text):
  225. import string
  226. with open(text, 'r') as fp: # 打开目标作文
  227. self.text = fp.read() # 读取作文并保存在self.text中
  228. words = self.text.split() # 读取作文单词,并保存在一个列表中
  229. self.words = [word.lower().strip(string.punctuation + string.whitespace) for word in words]
  230. def word_level(self, excel, sheet): # 定义word_level函数,对词库进行处理
  231. read = pds.read_excel(excel, sheet_name=sheet) # 读取词库,将单词储存在列表中
  232. data = read.values.tolist()
  233. words_set = set(tuple(i) for i in data) # 将列表化为集合
  234. words_level = {str(word[0]).replace(',', '') for word in words_set}
  235. return words_level
  236. def word_storage(self, storage, junior, high, universe, toefl): # 定义word_storage函数,将词库划分为四个等级
  237. # 运用word_level函数处理词库
  238. words_junior = self.word_level(storage, junior)
  239. words_high = self.word_level(storage, high)
  240. words_universe = self.word_level(storage, universe)
  241. words_toefl = self.word_level(storage, toefl)
  242. # 用集合的方法将词汇进行分类,共分为四个等级,等级越高,词汇越高级
  243. word_level1 = words_junior - (words_high | words_universe | words_toefl)
  244. word_level2 = words_high - words_junior - words_universe - words_toefl
  245. word_level3 = words_universe - words_high - words_toefl - words_junior
  246. word_level4 = words_toefl - words_junior - words_universe - words_high
  247. return [word_level1, word_level2, word_level3, word_level4]
  248. def analysis(self, storage, junior, high, universe, toefl): # 定义analysis函数分析作文
  249. level1 = self.word_storage(storage, junior, high, universe, toefl)[0]
  250. level2 = self.word_storage(storage, junior, high, universe, toefl)[1]
  251. level3 = self.word_storage(storage, junior, high, universe, toefl)[2]
  252. level4 = self.word_storage(storage, junior, high, universe, toefl)[3]
  253. quantity_of_word = len(self.words)
  254. quantity_of_advanced_word = 0
  255. total_goals = 0
  256. quantity_of_level2 = 0
  257. quantity_of_level3 = 0
  258. quantity_of_level4 = 0
  259. for word in self.words: # 运用循环结构对词汇进行赋分
  260. if word in level2:
  261. quantity_of_level2 += 1
  262. quantity_of_advanced_word += 1
  263. total_goals += 60
  264. elif word in level3:
  265. quantity_of_level3 += 1
  266. quantity_of_advanced_word += 1
  267. total_goals += 80
  268. elif word in level4:
  269. quantity_of_level4 += 1
  270. quantity_of_advanced_word += 1
  271. total_goals += 100
  272. # 计算普通词汇和四个等级的高级词汇占比
  273. ordinary_word = 1 - quantity_of_advanced_word/quantity_of_word
  274. advanced_word = quantity_of_advanced_word/ quantity_of_word
  275. level2_word = quantity_of_level2/quantity_of_word
  276. level3_word = quantity_of_level3/quantity_of_word
  277. level4_word = quantity_of_level4/quantity_of_word
  278. average_advanced_score = total_goals/quantity_of_advanced_word
  279. # 返回计算结果
  280. try:
  281. return [ordinary_word, advanced_word, level2_word, level3_word, level4_word, average_advanced_score]
  282. except:
  283. print('请重新输入数据')
  284. # 定义get_result函数,输入原文和修改后作文
  285. def get_result(text_name1, text_name2):
  286. text1 = Storage_level(text_name1)
  287. text2 = Storage_level(text_name2)
  288. text1_analysis = text1.analysis("词库.xlsx", 'junior', 'high', 'universe', 'toefl')
  289. text2_analysis = text2.analysis("词库.xlsx", 'junior', 'high', 'universe', 'toefl')
  290. # 将分析数据存储在result字典中
  291. result={'Proportion of Common Words': text1_analysis[0],
  292. 'Proportion of Advanced Words': text1_analysis[1],
  293. 'Proportion of Level 2 Advanced Words': text1_analysis[2],
  294. 'Proportion of Level 3 Advanced Words': text1_analysis[3],
  295. 'Proportion of Level 4 Advanced Words': text1_analysis[4],
  296. 'Average Score of Advanced Words': text1_analysis[5],
  297. 'Proportion of Common Words in Modified Text': text2_analysis[0],
  298. 'Proportion of Advanced Words in Modified Text': text2_analysis[1],
  299. 'Proportion of Level 2 Advanced Words in Modified Text': text2_analysis[2],
  300. 'Proportion of Level 3 Advanced Words in Modified Text': text2_analysis[3],
  301. 'Proportion of Level 4 Advanced Words in Modified Text': text2_analysis[4],
  302. 'Average Score of Advanced Words in Modified Text': text2_analysis[5]}
  303. return result
  304. words_level= get_result('text1.txt', 'text2.txt')
  305. selected_keys = ['Proportion of Common Words', 'Proportion of Level 2 Advanced Words', 'Proportion of Level 3 Advanced Words', 'Proportion of Level 4 Advanced Words', 'Proportion of Common Words in Modified Text', 'Proportion of Level 2 Advanced Words in Modified Text', 'Proportion of Level 3 Advanced Words in Modified Text', 'Proportion of Level 4 Advanced Words in Modified Text']
  306. selected_words_level = {k: words_level[k] for k in selected_keys}
  307. words_level = get_result('text1.txt', 'text2.txt')
  308. selected_keys = ['Proportion of Common Words', 'Proportion of Level 2 Advanced Words', 'Proportion of Level 3 Advanced Words', 'Proportion of Level 4 Advanced Words', 'Proportion of Common Words in Modified Text', 'Proportion of Level 2 Advanced Words in Modified Text', 'Proportion of Level 3 Advanced Words in Modified Text', 'Proportion of Level 4 Advanced Words in Modified Text']
  309. selected_words_level1 = {k: words_level[k] for k in selected_keys[:4]}
  310. words_level = get_result('text1.txt', 'text2.txt')
  311. selected_keys = ['Proportion of Common Words', 'Proportion of Level 2 Advanced Words', 'Proportion of Level 3 Advanced Words', 'Proportion of Level 4 Advanced Words', 'Proportion of Common Words in Modified Text', 'Proportion of Level 2 Advanced Words in Modified Text', 'Proportion of Level 3 Advanced Words in Modified Text', 'Proportion of Level 4 Advanced Words in Modified Text']
  312. selected_words_level2 = {k: words_level[k] for k in selected_keys[-4:]}
  313. # 计算原文和修改后作文的'Average Score of Advanced Words'的比值
  314. ratio_of_words_level = words_level['Average Score of Advanced Words in Modified Text']/words_level['Average Score of Advanced Words']
  315. # 输出结果
  316. print(get_result('text1.txt', 'text2.txt'))
# "Health check" report.

# Basic punctuation and spacing corrections.
  321. def correct_text(text):
  322. # 将中文标点替换为英文标点
  323. text=text.replace(",", ",")
  324. text=text.replace("。", ".")
  325. text=text.replace("!", "!")
  326. text=text.replace("?", "?")
  327. text=text.replace(";", ";")
  328. text=text.replace(":", ":")
  329. # 如果标点后忘记加空格,则在标点后面补上一个空格
  330. for i in range(len(text)):
  331. if text[i] in [",", ".", "!", "?", ";", ":"] and i < len(text) - 1 and text[i + 1] not in [" ", "\n"]:
  332. text=text[:i + 1] + " " + text[i + 1:]
  333. return text
  334. text1 = correct_text(open("text1.txt", "r").read())
  335. text2 = correct_text(open("text2.txt", "r").read())
  336. # 体检报告
  337. def get_info(text):
  338. # 段落数初始化为1
  339. num_paragraphs=1
  340. # 句子数初始化为0
  341. num_sentences=0
  342. for i in range(len(text)):
  343. if text[i] == "\n":
  344. num_paragraphs+=1
  345. if text[i] in [".", "?", "!"]:
  346. num_sentences+=1
  347. # 获取词汇列表
  348. totalwords=text.split()
  349. word_list=[word.lower().strip(string.punctuation + string.whitespace) for word in totalwords]
  350. # 统计形符数(词数)
  351. num_of_words=len(word_list)
  352. # 统计类符数(不重复出现的形符数)
  353. num_of_nonrep_words=len(set(word_list))
  354. # 统计平均词长
  355. avg_length=sum(map(len, word_list)) / len(word_list)
  356. # 统计词长标准差
  357. standard_diviation=sum(map(lambda x: (x - avg_length) ** 2, map(len, word_list))) / len(word_list)
  358. return num_paragraphs, num_sentences, num_of_words, num_of_nonrep_words, avg_length, standard_diviation
# Visual output: charts and tables.
  362. my_dict = {"key1": 0.25, "key2": 0.25, "key3": 0.25, "key4": 0.25}
  363. workDir = os.getcwd()
  364. def zhifangtu(a_dict):
  365. import matplotlib.pyplot as plt
  366. import matplotlib.font_manager as fm
  367. plt.rcParams['font.sans-serif']=['Times New Roman']
  368. keys=list(a_dict.keys())
  369. values=list(a_dict.values())
  370. # 设置直方图边界和颜色等属性
  371. plt.bar(keys, values, align='center', alpha=0.5, edgecolor='black', linewidth=1.2)
  372. for i, v in enumerate(values):
  373. plt.text(i, v + 1, str(v), ha='center', va='bottom', fontweight='bold')
  374. # 设置标题和标签等属性
  375. title_font={'fontsize': 50, 'fontweight': 'bold', 'fontstyle': 'italic', 'color': 'blue'}
  376. title_font1={'fontsize': 40}
  377. plt.title('TF–IDF Word Frequency Chart', fontdict=title_font)
  378. plt.xlabel('Words', fontdict=title_font1)
  379. plt.ylabel('TF–IDF Word Frequency', fontdict=title_font1)
  380. fig=plt.gcf()
  381. fig.set_size_inches(20, 30)
  382. fontprop=fm.FontProperties(size=20)
  383. fontprop1=fm.FontProperties(size=30)
  384. plt.xticks(rotation=45, ha='right', fontproperties=fontprop)
  385. plt.yticks(ha='right', fontproperties=fontprop1)
  386. # 显示图像
  387. plt.savefig('TF–IDF Word Frequency Chart 1')
  388. def zhifangtu1(a_dict):
  389. import matplotlib.pyplot as plt
  390. import matplotlib.font_manager as fm
  391. plt.rcParams['font.sans-serif']=['Times New Roman']
  392. keys=list(a_dict.keys())
  393. values=list(a_dict.values())
  394. # 设置直方图边界和颜色等属性
  395. plt.bar(keys, values, align='center', alpha=0.5, edgecolor='black', linewidth=1.2)
  396. for i, v in enumerate(values):
  397. plt.text(i, v + 1, str(v), ha='center', va='bottom', fontweight='bold')
  398. # 设置标题和标签等属性
  399. title_font={'fontsize': 50, 'fontweight': 'bold', 'fontstyle': 'italic', 'color': 'blue'}
  400. title_font1={'fontsize': 40}
  401. plt.title('TF–IDF Word Frequency Chart', fontdict=title_font)
  402. plt.xlabel('Words', fontdict=title_font1)
  403. plt.ylabel('TF–IDF Word Frequency', fontdict=title_font1)
  404. fig=plt.gcf()
  405. fig.set_size_inches(20, 30)
  406. fontprop=fm.FontProperties(size=20)
  407. fontprop1=fm.FontProperties(size =30)
  408. plt.xticks(rotation=45, ha='right', fontproperties=fontprop)
  409. plt.yticks(ha='right', fontproperties=fontprop1)
  410. # 显示图像
  411. plt.savefig('TF–IDF Word Frequency Chart 2')
  412. '''fic_of_TFIDF1 = zhifangtu(x[0])
  413. fic_of_TFIDF2 = zhifangtu(x[1])
  414. fic_of_TFIDF3 = zhifangtu(x[2])'''
  415. def bing(b):
  416. import matplotlib.pyplot as plt
  417. plt.rcParams['font.sans-serif']=['Times New Roman']
  418. keys=list(b.keys())[:4] # 仅选择前四个键
  419. values=list(b.values())[:4] # 仅选择前四个值
  420. colors=['pink', 'red', 'red', 'purple', 'blue']
  421. alpha=[0.3, 0.3, 0.9, 0.8]
  422. title_font1={'fontsize': 15, 'color': 'black'}
  423. plt.title('Chart of Advanced Words1', fontdict=title_font1)
  424. pie_chart=plt.pie(values, labels=keys, autopct='%1.1f%%', startangle=90, counterclock=False)
  425. for i in range(len(pie_chart[0])):
  426. pie_chart[0][i].set_color(colors[i])
  427. pie_chart[0][i].set_alpha(alpha[i])
  428. plt.savefig(os.path.join(workDir, 'Chart_of_Advanced_Words1''.png'))
  429. fig=plt.gcf()
  430. fig.set_size_inches(5, 5)
  431. fontprop=fm.FontProperties(size=16)
  432. plt.legend(prop=fontprop)
  433. bing(selected_words_level1)
  434. def bing1(b):
  435. import matplotlib.pyplot as plt
  436. plt.rcParams['font.sans-serif']=['Times New Roman']
  437. keys=list(b.keys())[:4] # 仅选择前四个键
  438. values=list(b.values())[:4] # 仅选择前四个值
  439. colors=['pink', 'red', 'red', 'purple', 'blue']
  440. alpha=[0.3, 0.3, 0.9, 0.8]
  441. title_font1={'fontsize': 15, 'color': 'black'}
  442. plt.title('Chart of Advanced Words2', fontdict=title_font1)
  443. pie_chart=plt.pie(values, labels=keys, autopct='%1.1f%%', startangle=90, counterclock=False)
  444. for i in range(len(pie_chart[0])):
  445. pie_chart[0][i].set_color(colors[i])
  446. pie_chart[0][i].set_alpha(alpha[i])
  447. plt.savefig(os.path.join(workDir, 'Chart_of_Advanced_Words2'+'.png'))
  448. fig.set_size_inches(5, 5)
  449. fontprop=fm.FontProperties(size=16)
  450. plt.legend(prop=fontprop)
  451. bing(selected_words_level2)
  452. def get_info(text):
  453. # 段落数初始化为1
  454. num_paragraphs=1
  455. # 句子数初始化为0
  456. num_sentences=0
  457. for i in range(len(text)):
  458. if text[i] == "\n":
  459. num_paragraphs+=1
  460. if text[i] in [".", "?", "!"]:
  461. num_sentences+=1
  462. # 获取词汇列表
  463. totalwords=text.split()
  464. word_list=[word.lower().strip(string.punctuation + string.whitespace) for word in totalwords]
  465. # 统计形符数(词数)
  466. num_of_words=len(word_list)
  467. # 统计类符数(不重复出现的形符数)
  468. num_of_nonrep_words=len(set(word_list))
  469. # 统计平均词长
  470. avg_length=sum(map(len, word_list)) / len(word_list)
  471. # 统计词长标准差
  472. standard_diviation=sum(map(lambda x: (x - avg_length) ** 2, map(len, word_list))) / len(word_list)
  473. return [[num_paragraphs], [num_sentences], [num_of_words], [num_of_nonrep_words], [round(avg_length, 2)],
  474. [round(standard_diviation, 2)]]
  475. def blocki(f):
  476. import matplotlib.pyplot as plt
  477. headers=['Result']
  478. table=plt.table(cellText=f, colLabels=headers,
  479. rowLabels=['Paragraph Num', 'Sentence Num', 'Word Num', 'Nonrep word Num', 'Average word Length',
  480. 'Word Standard Divation'], colWidths=[0.2], cellLoc='center', loc='center')
  481. plt.axis('off')
  482. plt.savefig(os.path.join(workDir, 'ff.png'))
  483. blocki(get_info(text1))
  484. def calculate_time(func):
  485. def wrapper(*args, **kwargs):
  486. start_time = time.time()
  487. result = func(*args, **kwargs)
  488. end_time = time.time()
  489. print(f"程序运行时间为 {end_time - start_time} 秒")
  490. return result
  491. return wrapper
  492. @calculate_time
  493. def my_function():
  494. # 这里放你要计算时间的代码
  495. pass
  496. my_function() # 程序运行时间为 X 秒 ··
  497. from tkinter import *
  498. from tkinter import scrolledtext
  499. from PIL import Image, ImageTk
  500. Img1Open = Image.open(os.path.join(workDir, 'Chart_of_Advanced_Words1.png'))
  501. Img2Open = Image.open(os.path.join(workDir, 'Chart_of_Advanced_Words1.png'))
  502. Img3Open = Image.open(os.path.join(workDir, 'Chart_of_Advanced_Words1.png'))
  503. Img4Open = Image.open(os.path.join(workDir, 'Chart_of_Advanced_Words1.png'))
  504. Img5Open = Image.open(os.path.join(workDir, 'Chart_of_Advanced_Words1.png'))
  505. checkReportPng = Image.open(os.path.join(workDir, 'ff.png'))
  506. windowTitle = "作文评分系统"
  507. # 用format方法给windowPrompt1和windowPrompt2传入参数text1、2
  508. windowPrompt1 = str.format("文本1:{}", text1)
  509. windowPrompt2 = str.format("文本2(已为您纠正基本的标点与空格错误):{}", text2)
  510. similarityDecPercent = str(ratio_of_repetition * 100) + "%"
  511. wordLevelIncPercent = str(ratio_of_words_level * 100) + "%"
  512. themeFitIncPercent = ""
  513. # 用format方法给requirementText数据:打开题目文件,读取题目文件内容
  514. requirementText = "题目:\n" + str.format("{}", open("text1.txt", "r").read())
  515. img1Png = None
  516. img2Png = None
  517. img3Png = None
  518. img4Png = None
  519. img5Png = None
  520. checkReportPng = None
  521. # 窗口参数
  522. # 窗口宽度
  523. windowWidth = 1200
  524. # 窗口高度
  525. windowHeight = 600
  526. # 子窗口百分比宽度
  527. subWindowWidth = 0.3
  528. # 子窗口百分比高度
  529. subWindowHeight = 0.8
  530. # 按钮百分比宽度
  531. buttonWidth = 0.25
  532. # 按钮百分比高度
  533. buttonHeight = 0.2
  534. # 控件类
  535. class mainWindow(Frame):
  536. def __init__(self, master = None):
  537. super().__init__(master)
  538. # 创建窗口
  539. self.rootFrame = Frame(root, width = windowWidth, height = windowHeight)
  540. self.rootFrame.pack(side = "top", fill = "both", expand = 1)
  541. self.createMainWindow(self.rootFrame)
  542. # 创建主窗体
  543. def createMainWindow(self, mainFrame):
  544. # 创建左子窗体
  545. mainFrame.leftSubWindow = scrolledtext.ScrolledText(mainFrame)
  546. mainFrame.leftSubWindow.insert(END,windowPrompt1)
  547. mainFrame.leftSubWindow.place(relheight = subWindowHeight, relwidth = subWindowWidth, relx = 0.1, rely = 0.05)
  548. # 创建中子窗体
  549. mainFrame.midSubWindow = scrolledtext.ScrolledText(mainFrame)
  550. mainFrame.midSubWindow.insert(END,windowPrompt2)
  551. mainFrame.midSubWindow.place(relheight = subWindowHeight, relwidth = subWindowWidth, relx = 0.4, rely = 0.05)
  552. # 创建右子窗体
  553. mainFrame.rightSubWindow = Frame(mainFrame)
  554. self.createRightWindow(mainFrame.rightSubWindow)
  555. mainFrame.rightSubWindow.place(relheight = subWindowHeight, relwidth = subWindowWidth, relx = 0.7, rely = 0.05)
  556. # 创建退出按钮
  557. mainFrame.quit = Button(mainFrame, text = "退出", bg = "red", command = self.master.destroy)
  558. mainFrame.quit.place(anchor = S, relheight = 0.05, relwidth = 0.2, relx = 0.5, rely = 0.95)
  559. # 填充右子窗体内容
  560. def createRightWindow(self, rightWindow):
  561. # 创建显示子窗体
  562. rightWindow.showWindow = Frame(rightWindow)
  563. rightWindow.showWindow.place(relheight = 0.7, relwidth = 0.8, relx = 0.1, rely = 0.25)
  564. # 创建评分比较按钮
  565. rightWindow.scoreCompare = Button(rightWindow, text = "评分比较", command = lambda : self.showScore(rightWindow.showWindow))
  566. rightWindow.scoreCompare.place(relheight = buttonHeight, relwidth = buttonWidth, relx = 0.1, rely = 0)
  567. # 创建检测报告按钮
  568. rightWindow.checkReport = Button(rightWindow, text = "检测报告", command = lambda : self.showCheckReport(rightWindow.showWindow))
  569. rightWindow.checkReport.place(relheight = buttonHeight, relwidth = buttonWidth, relx = 0.4, rely = 0)
  570. # 创建题目要求按钮
  571. rightWindow.requirement = Button(rightWindow, text = "题目要求", command = lambda : self.showRequirement(rightWindow.showWindow))
  572. rightWindow.requirement.place(relheight = buttonHeight, relwidth = buttonWidth, relx = 0.7, rely = 0)
  573. # 显示评分
  574. def showScore(self, subWindow):
  575. # 清空子窗体内控件
  576. for widget in subWindow.winfo_children() :
  577. widget.destroy()
  578. # 将子窗体分为两部分
  579. subWindow.rightUpSubWindow = Frame(subWindow)
  580. subWindow.rightUpSubWindow.place(relheight = 0.5, relwidth = 1, relx = 0, rely = 0)
  581. subWindow.rightDownSubWindow = Frame(subWindow)
  582. subWindow.rightDownSubWindow.place(relheight = 0.5, relwidth = 1, relx = 0, rely = 0.5)
  583. # 初始化要显示的图片
  584. global img1Png
  585. img1Png = ImageTk.PhotoImage(Img1Open,master = subWindow.rightUpSubWindow)
  586. global img2Png
  587. img2Png = ImageTk.PhotoImage(Img2Open,master = subWindow.rightUpSubWindow)
  588. global img3Png
  589. img3Png = ImageTk.PhotoImage(Img3Open, master = subWindow.rightDownSubWindow)
  590. global img4Png
  591. img4Png = ImageTk.PhotoImage(Img4Open, master = subWindow.rightDownSubWindow)
  592. global img5Png
  593. img5Png = ImageTk.PhotoImage(Img5Open, master = subWindow.rightDownSubWindow)
  594. # 显示相似度降低值
  595. subWindow.rightUpSubWindow.similarityDec = Label(subWindow.rightUpSubWindow, text = "您的文章相似度降低了:" + similarityDecPercent)
  596. subWindow.rightUpSubWindow.similarityDec.place(relheight = 0.1, relwidth = 0.9, relx = 0.05, rely = 0.05)
  597. # 显示高级度提升值
  598. subWindow.rightUpSubWindow.wordLevelInc = Label(subWindow.rightUpSubWindow, text = "您的文章词汇高级度提升了:" + wordLevelIncPercent)
  599. subWindow.rightUpSubWindow.wordLevelInc.place(relheight = 0.1, relwidth = 0.9, relx = 0.05, rely = 0.2)
  600. # 显示fig1
  601. subWindow.rightUpSubWindow.fig1 = Label(subWindow.rightUpSubWindow, image = img1Png)
  602. subWindow.rightUpSubWindow.fig1.place(relheight = 0.4, relwidth = 0.4, relx = 0.05, rely = 0.5)
  603. # 显示fig2
  604. subWindow.rightUpSubWindow.fig2 = Label(subWindow.rightUpSubWindow, image = img2Png)
  605. subWindow.rightUpSubWindow.fig2.place(relheight = 0.4, relwidth = 0.4, relx = 0.55, rely = 0.5)
  606. # 显示文章主题契合度提升值
  607. subWindow.rightDownSubWindow.themeFit = Label(subWindow.rightDownSubWindow, text = "您的文章主题契合度提升了:" + themeFitIncPercent)
  608. subWindow.rightDownSubWindow.themeFit.place(relheight = 0.1, relwidth = 0.9, relx = 0.05, rely = 0.05)
  609. # 显示关键词分布
  610. subWindow.rightDownSubWindow.keyword = Label(subWindow.rightDownSubWindow, text = "具体关键词分布如下:")
  611. subWindow.rightDownSubWindow.keyword.place(relheight = 0.1, relwidth = 0.9, relx = 0.05, rely = 0.2)
  612. # 显示fig3
  613. subWindow.rightDownSubWindow.fig3 = Label(subWindow.rightDownSubWindow, image = img3Png)
  614. subWindow.rightDownSubWindow.fig3.place(relheight = 0.4, relwidth = 0.3, relx = 0, rely = 0.5)
  615. # 显示fig4
  616. subWindow.rightDownSubWindow.fig4 = Label(subWindow.rightDownSubWindow, image = img4Png)
  617. subWindow.rightDownSubWindow.fig4.place(relheight = 0.4, relwidth = 0.3, relx = 0.35, rely = 0.5)
  618. # 显示fig5
  619. subWindow.rightDownSubWindow.fig5 = Label(subWindow.rightDownSubWindow, image = img5Png)
  620. subWindow.rightDownSubWindow.fig5.place(relheight = 0.4, relwidth = 0.3, relx = 0.7, rely = 0.5)
  621. # 显示检测报告
  622. def showCheckReport(self, subWindow) :
  623. # 清空子窗体内控件
  624. for widget in subWindow.winfo_children() :
  625. widget.destroy()
  626. # 初始化报告图片
  627. global checkReportPng
  628. checkReportPng = ImageTk.PhotoImage(Img1Open,master = subWindow)
  629. # 显示报告
  630. subWindow.checkReport = Label(subWindow, image = checkReportPng)
  631. subWindow.checkReport.place(relheight = 0.9, relwidth = 0.9, relx = 0.05, rely = 0.05)
  632. # 显示题目要求
  633. def showRequirement(self, subWindow) :
  634. # 清空子窗体内控件
  635. for widget in subWindow.winfo_children() :
  636. widget.destroy()
  637. # 显示题目要求
  638. subWindow.requirementTextBox = scrolledtext.ScrolledText(subWindow)
  639. subWindow.requirementTextBox.insert(END,requirementText)
  640. subWindow.requirementTextBox.place(relheight = 0.9, relwidth = 0.9, relx = 0.05, rely = 0.05)
  641. root = Tk()
  642. root.title(windowTitle)
  643. window1 = mainWindow(master = root)
  644. window1.mainloop()
'
运行

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/我家自动化/article/detail/819369
推荐阅读
相关标签
  

闽ICP备14008679号