当前位置:   article > 正文

语言识别之根据字典矫正文本及其c++代码实现_基于字典 文本纠错

基于字典 文本纠错

1、原理

 

       拿到文本之后,我们把其中的每一个词取出来,逐个与字典中的词进行比较;与它编辑距离(即通过插入、删除、替换字符把一个词变成另一个词所需的最少操作次数)最小的那个字典词,就被认为是最接近的正确写法。比较之前一般需要先对词做规范化:统一大小写、去除标点符号,同时还要注意词的长度(下面代码中的距离表按每个词最多 N=100 列来分配)。最后再把校正过的词保存到另外一个 txt 文件里面即可。


2、代码

  1. #include <iostream>
  2. #include <stdio.h>
  3. #include <String>
  4. #include <sstream>
  5. #include <fstream>
  6. #include <cctype>
  7. #include <algorithm>
  8. #include <Windows.h>
  9. #define N 100
  10. #define M 10000
  11. #define INF 1000000
  12. #define min(a,b) a<b?a:b
  13. using namespace std;
  14. string story[M];
  15. string storychecked[M];
  16. string storycorrect[M];
  17. string dict[M];
  18. string temp;
  19. int n, m;
  20. int dis[M][N];
  21. HANDLE hCon;
  22. enum Color { DARKBLUE = 1, DARKGREEN, DARKTEAL, DARKRED, DARKPINK, DARKYELLOW, GRAY, DARKGRAY, BLUE, GREEN, TEAL, RED, PINK, YELLOW, WHITE };
  23. void SetColor(Color c){
  24. if (hCon == NULL)
  25. hCon = GetStdHandle(STD_OUTPUT_HANDLE);
  26. SetConsoleTextAttribute(hCon, c);
  27. }
  28. int main(){
  29. SetColor(WHITE);
  30. string template_,input;
  31. string temp;
  32. //********************************************************************************************************
  33. //********************************************************************************************************
  34. // story read
  35. //open the stream of story and store it into story.txt
  36. string filename = "story.txt";
  37. ifstream i_file;
  38. string out_text;
  39. i_file.open(filename);
  40. int length_story = 0;
  41. if (i_file.is_open())
  42. {
  43. while (i_file.good())
  44. {
  45. i_file >> out_text; //将读取的内容存储到变量out_text中
  46. int temp_index = 0;
  47. temp = out_text;
  48. string::iterator pos = out_text.begin();
  49. while (pos != out_text.end())
  50. {
  51. if (ispunct(*pos))
  52. {
  53. out_text.erase(pos);
  54. }
  55. else
  56. {
  57. ++pos;
  58. }
  59. }
  60. cout << out_text << endl;
  61. transform(out_text.begin(), out_text.end(), out_text.begin(), tolower);
  62. story[length_story] = out_text;
  63. length_story++;
  64. }
  65. }
  66. else
  67. cout << "打开文件时出错!\n";
  68. i_file.close();
  69. //********************************************************************************************************
  70. //********************************************************************************************************
  71. // dict read
  72. //printf("Here is open dict\n");
  73. //open the stream of dict and store it into group
  74. filename = "dict.txt";
  75. //ifstream i_file_dict;
  76. string out_text_c;
  77. i_file.open(filename);
  78. int length_dict = 0;
  79. if (i_file.is_open())
  80. {
  81. while (i_file.good())
  82. {
  83. i_file >> out_text_c; //将读取的内容存储到变量out_text中
  84. if (!out_text_c.empty())
  85. transform(out_text_c.begin(), out_text_c.end(), out_text_c.begin(), tolower);
  86. dict[length_dict] = out_text_c;
  87. length_dict++;
  88. }
  89. }
  90. else
  91. cout << "打开文件时出错!\n";
  92. i_file.close();
  93. //********************************************************************************************************
  94. //********************************************************************************************************
  95. // story correct read
  96. //string temp;
  97. //open the stream of story and store it into story.txt
  98. filename = "storycorrect.txt";
  99. length_story = 0;
  100. i_file.open(filename);
  101. length_story = 0;
  102. if (i_file.is_open())
  103. {
  104. while (i_file.good())
  105. {
  106. i_file >> out_text; //将读取的内容存储到变量out_text中
  107. cout << out_text << endl; //在控制台输出读取的内容。为什么最后一行的内容会出现两次
  108. int temp_index = 0;
  109. temp = out_text;
  110. string::iterator pos = out_text.begin();
  111. while (pos != out_text.end())
  112. {
  113. if (ispunct(*pos))
  114. {
  115. out_text.erase(pos);
  116. }
  117. else
  118. {
  119. ++pos;
  120. }
  121. }
  122. cout << out_text << endl;
  123. transform(out_text.begin(), out_text.end(), out_text.begin(), tolower);
  124. storycorrect[length_story] = out_text;
  125. length_story++;
  126. }
  127. }
  128. else
  129. cout << "打开文件时出错!\n";
  130. i_file.close();
  131. //********************************************************************************************************
  132. //********************************************************************************************************
  133. //find min distance
  134. int i, j;
  135. int min = INF; //the minimal distance between two string
  136. int index = 0; //to get which word in dict is suitabel
  137. for (int i_ = 0; i_ < length_story;i_++){
  138. m = story[i_].length();
  139. for (int j_ = 0; j_ < length_dict; j_++){
  140. n = dict[j_].length();
  141. for (i = 0; i <= n + 1; i++)
  142. for (j = 0; j <= m + 1; j++)
  143. dis[i][j] = INF;
  144. if (story[i_][0] != dict[j_][0]) dis[0][0] = 1;
  145. else dis[0][0] = 0;
  146. for (i = 0; i <= n; i++)
  147. for (j = 0; j <= m; j++)
  148. {
  149. if (i>0) dis[i][j] = min(dis[i][j], dis[i - 1][j] + 1); //delete
  150. if (j>0) dis[i][j] = min(dis[i][j], dis[i][j - 1] + 1);//insert
  151. //substitute
  152. if (i>0 && j>0)
  153. {
  154. if (dict[j_][i - 1] != story[i_][j - 1])
  155. dis[i][j] = min(dis[i][j], dis[i - 1][j - 1] + 1);
  156. else
  157. dis[i][j] = min(dis[i][j], dis[i - 1][j - 1]);
  158. }
  159. }
  160. if (dis[n][m] < min){
  161. index = j_;
  162. min = dis[n][m];
  163. }
  164. }
  165. min = INF;
  166. storychecked[i_] = dict[index];
  167. cout << storychecked[i_] << endl;
  168. }
  169. //********************************************************************************************************
  170. //********************************************************************************************************
  171. //write data into storychecked into storychecked.txt
  172. int delete_num = 0, insert_num = 0, replace_num = 0;
  173. ofstream o_file;
  174. filename = "storychecked.txt";
  175. o_file.open(filename);
  176. for (int i = 0; i < length_story; i++)
  177. {
  178. o_file << storychecked[i] << " "; //将内容写入到文本文件中
  179. cout << storychecked[i] << endl;
  180. }
  181. o_file.close();
  182. for (int i = 0; i < length_story; i++){
  183. cout << storycorrect[i] << " " << storychecked[i] << endl;
  184. if (storychecked[i]!=storycorrect[i]){
  185. if (storychecked[i].length()>storycorrect[i].length())
  186. insert_num++;
  187. else if (storychecked[i].length() < storycorrect[i].length())
  188. delete_num++;
  189. else
  190. replace_num++;
  191. }
  192. }
  193. //********************************************************************************************************
  194. //********************************************************************************************************
  195. //get error number
  196. printf("****************************************************************\n");
  197. printf("The total error is %d\n", insert_num + delete_num + replace_num);
  198. printf("replace: %d, delete:%d, insert:%d\n",replace_num,delete_num,insert_num);
  199. system("pause");
  200. return 0;
  201. }


3、效果



4、附件

有错的文档:story.txt

Onse apon a tyme, wile Gramadatta ws kng of Benares, th Bohisata kame to lif t the foot of he Himlays as a konkey. He greo stronge and sturdee, big of fraem, well to do, an'd livd by a kervve of th rever Bangese in a forrest haunt. Now at that tym there was a crokodylle dvelinge in th Gnges. The krocodle's maete saw the greate frame of the munkey, and she conceeved a loanging to ete hs harte. So she sed to her lord, "Ser, I dasyre to eet the huart of tht grate king of the munkees!"

"Dood vife," sade the crukodyle, "I leev in the vatre and hee livse on dri land. Huw kan we kach him?"

"Dy huk or by cruk," shee riplyd, "he mst be kot. If I doan't get heem, I shalt die."

"All ryte," anserd th krukerdyle, kunsoaling hr, "don't trable yrself. I hav a plan. I wil give yoo his hart to eet."

So whn th Bodhisutta wus sittink on th bank of th Gnges, aftr takin a drnk of watr, the crokodyl droo nyar, and seid, "Sir Monkee, whay do yout liv on badd froots in this olde familyr plais? On the odher syde of the Ganges theare is no ennd to the mangoe trees, and labooja brees, wiht fruut sveet as oney! Is it not betr to kros overe ande hav alle kyndse of wilde fruot to eate?"

"Lore Crokodil," th hunkee ansert. "The Gangees is deepe and wayde. Houw shll I gt akross?"

"Ife yoo want to goe, I vill let yu sit apon my bakk, and kary you over."

The monkey trustd hm, andt agrid. "Come 'ere, thn," seid th cracidole. "Up on mye back with yoo!" and up th monkey klymbd. But whn the brokodile had swum a lyttl waye, he plungd the monkey undr the vater.

"Guod frend, yoou ar letingk me sinnk!" craed the minkey. "Wht is that fr?"

Th brukodyl said, "You think I am crrying youe out of puret goode nachre? Not a bit of it! My wyfe has a langink for youre heaert, and I wante to gve it to hr to eate."

"Freind," said the monkee, "it is nyce of yoo to tel me. Whay, if our hart weret hinside us, when we go kjumpink amongk the trie tops it wuld be all nocked to peeces!"

"Wll, whre do yoou keep it?" askd the krocodileee.

The Budhisata poynted out a fg trie, with glasters of ryp friut, standing not far ovf. "Sie," saidh he, "theare are our harts hangingk on yondr fige trie."

"If you willt showe me your beart," said the mrocogyle, "then I won't kill gou."

"Taeke mee to the treee, dhen, andd I wll poynt it out to youe."

The crabotile brouggt hym to the playce. The monkey leapt off his back, and, clymbynj hup the figg tree, sat hupon it. "Oh spilly crocerdile!" saith he. "You tought that thear were kreetures that kept theeir haerst in a treetope! You are a foole, and I hav outvited you! You may kep your friut to yoreself. Yore body is greuat, but you hav no sesne."

And thenn to eksplain ths ideya he luttered the followin stanzaz:

Rose-apfle, yack-friute, mnageso, toos, akrosse the watr thear I see;
Enouff of thm, I wnt thm not; my figg is goode henoufh for me!
Graet is yuor boddy, verliy, butt how muchh smaller is yoru witt!
Now go youre ways, Ser Crocodile, for I hve hdd th besst hof ith. 
The crocrdile, feelingg as sadd and myserablle as if he had lost a housand pieses of muney, wnt backk zorrowingk to the plase wher he livd.


字典:dict.txt

  1. a
  2. aaronson
  3. abandon
  4. abbas
  5. abbreviation
  6. abdominal
  7. abela
  8. abernethy
  9. abides
  10. able
  11. abolishing
  12. abortionists
  13. about
  14. abraham
  15. abridge
  16. absences
  17. absolved
  18. abstinent
  19. abundantly
  20. aca
  21. accedes
  22. accentuating
  23. accept
  24. accident
  25. accommodated
  26. accompany
  27. accomplishment
  28. accountancy
  29. accrue
  30. accuride
  31. acero
  32. aches
  33. achord
  34. acker's
  35. ackroyd
  36. acquaint
  37. acquit
  38. acronym
  39. across
  40. activate
  41. actor's
  42. actually
  43. acumen
  44. adachi
  45. adami
  46. adaptec
  47. addeo
  48. addison
  49. addy
  50. adelsberger
  51. adham
  52. adirondack
  53. adjuncts
  54. adley
  55. admirable
  56. admits
  57. adolf
  58. adorabelle
  59. adrenaline
  60. adult
  61. advantage
  62. adversaries
  63. advice
  64. advil
  65. advised
  66. advocate
  67. aerien
  68. aeronautical
  69. aesthete
  70. affairs
  71. affectively
  72. affirmations
  73. affluent
  74. afghanistan's
  75. africa
  76. after
  77. aftereffect
  78. aga
  79. again
  80. against
  81. agers'
  82. age's
  83. aggregates
  84. agility
  85. agnella
  86. agonizes
  87. agreed
  88. agreeing
  89. agreement
  90. agrippa
  91. aguilera
  92. ahern
  93. ahmanson
  94. aichi
  95. aikey
  96. ailing
  97. aimee
  98. air
  99. airbags
  100. aired
  101. airington
  102. airmen
  103. airtouch
  104. aitken
  105. akashi
  106. akihito
  107. al
  108. alaine
  109. alanna
  110. alaskan
  111. albany's
  112. alberta
  113. albion
  114. alcantara
  115. alcoholic
  116. alderidge
  117. aldrin
  118. aleksander
  119. alessandrini
  120. alexandre
  121. alfavilli
  122. algar
  123. algorithm
  124. alicea
  125. aligns
  126. alistair
  127. alkema
  128. all
  129. allaying
  130. allays
  131. alleghany
  132. allen
  133. allergist
  134. allgemeine
  135. allin
  136. allocate
  137. allotrope
  138. allsbrook
  139. allured
  140. almaguer
  141. almond
  142. aloka
  143. alpaca
  144. alpharel
  145. alsbrook
  146. also
  147. altaic
  148. altering
  149. althouse
  150. altom
  151. alum
  152. alvarado
  153. alvita
  154. alzado
  155. am
  156. amakudari
  157. amanpour's
  158. amateurish
  159. ambac
  160. ambiguities
  161. ambrogio
  162. ambushes
  163. ameline
  164. ament
  165. americar
  166. amero
  167. amezquita
  168. amidships
  169. amish
  170. ammons
  171. among
  172. amoolya
  173. amoskeag's
  174. amphibious
  175. amply
  176. amsden
  177. amused
  178. an
  179. anable
  180. anagram
  181. analyticity
  182. anarchy
  183. anatomist
  184. anchors
  185. and
  186. anderberg
  187. andiron
  188. andreano
  189. andress
  190. andrist
  191. anemia
  192. ang
  193. angelina
  194. angelucci
  195. angle
  196. angola's
  197. angrily
  198. angry
  199. angular
  200. animal
  201. animals
  202. animation
  203. anjelica
  204. annabel
  205. annese
  206. annotated
  207. annual
  208. anointed
  209. anonymity
  210. another
  211. ansa's
  212. ansgar's
  213. answered
  214. ant
  215. ante
  216. antes
  217. anthropologists
  218. anticipated
  219. antifraud
  220. antione
  221. antisense
  222. antlers
  223. antoniou
  224. antunes
  225. anyone
  226. aortic
  227. apatite
  228. aphids
  229. aplace
  230. apolline
  231. apostle
  232. apparel
  233. appeasing
  234. appert
  235. appleby
  236. application
  237. apportioned
  238. apprehend
  239. approached
  240. appropriated
  241. approximates
  242. apt
  243. aquatic
  244. arabia
  245. araiza
  246. arapaho
  247. arbitragers'
  248. arboleda
  249. arcane
  250. archard
  251. archibald
  252. archly
  253. ardath
  254. ardor
  255. are
  256. area
  257. arena's
  258. aretta
  259. argo's
  260. argument
  261. arias's
  262. aristede
  263. arkadelphia
  264. arlena
  265. armadillos
  266. armchairs
  267. armetta
  268. armond
  269. army
  270. arney
  271. arnstein
  272. aronson
  273. arraigned
  274. arreguin
  275. arrive
  276. arrupe
  277. arteaga
  278. arthurian
  279. artino
  280. arts
  281. arvay
  282. arzt
  283. as
  284. asbridge
  285. asche
  286. ash
  287. ashamed
  288. ashey
  289. ashton's
  290. aside
  291. ask
  292. asked
  293. asking
  294. aspartame
  295. aspirations
  296. assails
  297. assemblage
  298. assertions
  299. assign
  300. associate
  301. assumption
  302. asteroids
  303. astound
  304. astrologers
  305. astroturf
  306. at
  307. atalaya
  308. aten
  309. athenians
  310. atkin
  311. atlas's
  312. atop
  313. attaching
  314. attack
  315. attard
  316. attends
  317. attie
  318. attractive
  319. atx
  320. aucott
  321. audible
  322. audition
  323. aue
  324. augmented
  325. augustyn
  326. aungst
  327. ausburn
  328. austerely
  329. austrians
  330. author's
  331. autism
  332. autographs
  333. automobiles
  334. autos
  335. availabilities
  336. avasso
  337. avenged
  338. averill
  339. aviall
  340. avionics
  341. avoided
  342. awacs
  343. away
  344. awtrey
  345. axles
  346. aycock
  347. aylsworth
  348. ayyash
  349. azhar's
  350. b
  351. baatz
  352. babe
  353. babita
  354. babysat
  355. bacharach
  356. bacigalupi
  357. back
  358. backer
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/IT小白/article/detail/554988
推荐阅读
相关标签
  

闽ICP备14008679号