当前位置:   article > 正文

Google原生输入法LatinIME词库构建流程分析(三)--N-gram信息构建_latinime 词库更新

latinime 词库更新

N-gram信息的构建在ngram.cpp中进行构建:

  1. bool NGram::build_unigram(LemmaEntry *lemma_arr, size_t lemma_num,
  2. LemmaIdType next_idx_unused) {
  3. ...
  4. //1、初始化freqs数组,lemma_arr数组元素的idx_by_hz字段对应freqs的索引
  5. //2、计算total_freq
  6. for (size_t pos = 0; pos < lemma_num; pos++) {
  7. if (lemma_arr[pos].idx_by_hz == idx_now)
  8. continue;
  9. idx_now++;
  10. assert(lemma_arr[pos].idx_by_hz == idx_now);
  11. freqs[idx_now] = lemma_arr[pos].freq;
  12. if (freqs[idx_now] <= 0)
  13. freqs[idx_now] = 0.3;
  14. total_freq += freqs[idx_now];
  15. }
  16. ...
  17. //更新freqs数组并计算最大freq,计算方式为freq/total_freq
  18. for (size_t pos = 0; pos < idx_num_; pos++) {
  19. freqs[pos] = freqs[pos] / total_freq;
  20. assert(freqs[pos] > 0);
  21. if (freqs[pos] > max_freq)
  22. max_freq = freqs[pos];
  23. }
  24. // calculate the code book
  25. ...
  26. //初始化freq_codes_df_数组,元素个数为256
  27. for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) {
  28. bool found = true;
  29. while (found) {
  30. found = false;
  31. double cand = freqs[freq_pos];
  32. for (size_t i = 0; i < code_pos; i++)
  33. if (freq_codes_df_[i] == cand) {
  34. found = true;
  35. break;
  36. }
  37. if (found)
  38. freq_pos++;
  39. }
  40. freq_codes_df_[code_pos] = freqs[freq_pos];
  41. freq_pos++;
  42. }
  43. //对freq_codes_df_进行排序,共256个元素
  44. myqsort(freq_codes_df_, kCodeBookSize, sizeof(double), comp_double);
  45. ...
  46. iterate_codes(freqs, idx_num_, freq_codes_df_, lma_freq_idx_);
  47. delete [] freqs;
  48. ...
  49. for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) {
  50. double log_score = log(freq_codes_df_[code_pos]);
  51. float final_score = convert_psb_to_score(freq_codes_df_[code_pos]);
  52. if (kPrintDebug0) {
  53. printf("code:%ld, probability:%.9f, log score:%.3f, final score: %.3f\n",
  54. code_pos, freq_codes_df_[code_pos], log_score, final_score);
  55. }
  56. freq_codes_[code_pos] = static_cast<LmaScoreType>(final_score);
  57. }
  58. initialized_ = true;
  59. return true;
  60. }

目前对LatinIME中的ngram还没有一个整体的了解,因此针对ngram信息构建流程分析目前先按照代码执行次序进行分析,在分析中进一步增加了解,先来看下第一个for循环:

  1. //1、初始化freqs数组,lemma_arr数组元素的idx_by_hz字段对应freqs的索引
  2. //2、计算total_freq
  3. for (size_t pos = 0; pos < lemma_num; pos++) {
  4. if (lemma_arr[pos].idx_by_hz == idx_now)
  5. continue;
  6. idx_now++;
  7. assert(lemma_arr[pos].idx_by_hz == idx_now);
  8. freqs[idx_now] = lemma_arr[pos].freq;
  9. if (freqs[idx_now] <= 0)
  10. freqs[idx_now] = 0.3;
  11. total_freq += freqs[idx_now];
  12. }

lemma_num是一个unsigned long类型变量,它的值在这里等于65101,也就是lemma_arr的长度,第八行可以看到,从lemma_arr数组中取出freq字段赋值给freqs数组,同时计算total_freq,第零个元素的freq是=0.3的(第十行),这个for循环实际上是为了初始化freqs数组,此时的freqs数组元素都还是lemma_arr数组中的freq字段,无序也未经抓换的,但是紧接着往下就是对freqs数组进行转换了:

  1. //更新freqs数组并计算最大freq,计算方式为freq/total_freq
  2. for (size_t pos = 0; pos < idx_num_; pos++) {
  3. freqs[pos] = freqs[pos] / total_freq;
  4. assert(freqs[pos] > 0);
  5. if (freqs[pos] > max_freq)
  6. max_freq = freqs[pos];
  7. }

在这个循环中使用上一个for循环计算出来的total_freq来更新freqs数组内容,同时找出max_freq变量,经过这一步转换freqs数组内容为(前十个元素为例):

  1. (gdb) p *freqs@10
  2. $19 = {3.1514010662811756e-09, 2.6102481776047228e-06, 0.0014117510264284483, 4.2135427367586289e-05, 3.5443016051193335e-08, 6.6320132654683491e-05, 5.0999042085147344e-08, 0.00027250210157388183,
  3. 3.2141248599444556e-06, 0.00028112528687696588}

然后下一个for循环中取出freqs数组256个不重复的元素并初始化给了数组freq_codes_df_:

  1. for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) {
  2. bool found = true;
  3. while (found) {
  4. found = false;
  5. double cand = freqs[freq_pos];
  6. for (size_t i = 0; i < code_pos; i++)
  7. if (freq_codes_df_[i] == cand) {
  8. found = true;
  9. break;
  10. }
  11. if (found)
  12. freq_pos++;
  13. }
  14. freq_codes_df_[code_pos] = freqs[freq_pos];
  15. freq_pos++;
  16. }

初始化完成后又对该数组进行了排序:

  myqsort(freq_codes_df_, kCodeBookSize, sizeof(double), comp_double);

排序完成后调用了iterator_code方法:

  1. void iterate_codes(double freqs[], size_t num, double code_book[],
  2. CODEBOOK_TYPE *code_idx) {
  3. size_t iter_num = 0;
  4. double delta_last = 0;
  5. do {
  6. size_t changed = update_code_idx(freqs, num, code_book, code_idx);
  7. double delta = recalculate_kernel(freqs, num, code_book, code_idx);
  8. if (kPrintDebug0) {
  9. printf("---Unigram codebook iteration: %ld : %ld, %.9f\n",
  10. iter_num, changed, delta);
  11. }
  12. iter_num++;
  13. if (iter_num > 1 &&
  14. (delta == 0 || fabs(delta_last - delta)/fabs(delta) < 0.000000001))
  15. break;
  16. delta_last = delta;
  17. } while (true);
  18. }

这个方法主要是调用了update_code_idx和recalculate_kernel方法,先来看update_code_idx方法:

  1. size_t update_code_idx(double freqs[], size_t num, double code_book[],
  2. CODEBOOK_TYPE *code_idx) {
  3. //使用code_book数组中的元素索引更新code_idx指针数组
  4. size_t changed = 0;
  5. for (size_t pos = 0; pos < num; pos++) {
  6. CODEBOOK_TYPE idx;
  7. idx = qsearch_nearest(code_book, freqs[pos], 0, kCodeBookSize - 1);
  8. if (idx != code_idx[pos])
  9. changed++;
  10. code_idx[pos] = idx;
  11. }
  12. return changed;
  13. }

根据函数名称就大概能了解这个函数做了什么,更新code_idx数组,这个数组类型是CODEBOOK_TYPE,定义为:

typedef unsigned char CODEBOOK_TYPE;

数组code_idx中的元素值来自一个二分查找方法qsearch_nearest(),传入的参数为code_book和freqs数组,freqs数组前面说过了是经过freq/total_freq转化过的数组,那code_book数组根据调用栈可以知道它就是前面的freq_codes_df_数组并且是经过排序的,那么他们是如何进行二分查找呢?

  1. // Find the index of the code value which is nearest to the given freq
  2. int qsearch_nearest(double code_book[], double freq, int start, int end) {
  3. //根据freq查找元素,离给定freq最近的freq对应的元素索引进行返回
  4. if (start == end)
  5. return start;
  6. //查找到了,但是根据distance来判断返回前一个还是后一个
  7. if (start + 1 == end) {
  8. if (distance(freq, code_book[end]) > distance(freq, code_book[start]))
  9. return start;
  10. return end;
  11. }
  12. int mid = (start + end) / 2;
  13. //继续二分查找
  14. if (code_book[mid] > freq)
  15. return qsearch_nearest(code_book, freq, start, mid);
  16. else
  17. return qsearch_nearest(code_book, freq, mid, end);
  18. }

start和end分别是0和255,查找的对象是freq变量也就是freqs[pos],查找的范围是code_book数组,如果start+1 != end就一直二分查找,直到落到两个相邻元素的区间内,判断离哪个元素比较近就返回较近的哪个元素的索引,最终code_idx数组中存储的是freqs数组中freq离code_book数组中每个freq比较近的元素索引,这样code_idx数组就初始化完成了,iterator_codes中还调用了recalculate_kernel方法:

  1. double recalculate_kernel(double freqs[], size_t num, double code_book[],
  2. CODEBOOK_TYPE *code_idx) {
  3. double ret = 0;
  4. //初始化容器
  5. size_t *item_num = new size_t[kCodeBookSize];
  6. assert(item_num);
  7. memset(item_num, 0, sizeof(size_t) * kCodeBookSize);
  8. double *cb_new = new double[kCodeBookSize];
  9. assert(cb_new);
  10. memset(cb_new, 0, sizeof(double) * kCodeBookSize);
  11. //
  12. for (size_t pos = 0; pos < num; pos++) {
  13. ret += distance(freqs[pos], code_book[code_idx[pos]]);
  14. cb_new[code_idx[pos]] += freqs[pos];
  15. item_num[code_idx[pos]] += 1;
  16. }
  17. for (size_t code = 0; code < kCodeBookSize; code++) {
  18. assert(item_num[code] > 0);
  19. code_book[code] = cb_new[code] / item_num[code];
  20. }
  21. delete [] item_num;
  22. delete [] cb_new;
  23. return ret;
  24. }


整体上看,这个函数最终也是循环遍历kCodeBookSize(256)次,使用cb_new和item_num两个数组来跟新code_book数组,具体过程就不说了,最终更新后的code_book数组为(前十个元素为例):

  1. (gdb) p *code_book@10
  2. $1 = {5.635799479070388e-09, 1.1536144437805393e-08, 1.8915161279992252e-08, 2.345806003473422e-08, 2.6750659323917793e-08, 2.9549793110303631e-08, 3.2228569987532223e-08, 3.4500060174875951e-08,
  3. 3.5988886505094894e-08, 3.7613863559902464e-08}

综上,iterator_codes方法主要是更新code_book和code_idx数组的内容。最后这个for循环:

  1. for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) {
  2. double log_score = log(freq_codes_df_[code_pos]);
  3. float final_score = convert_psb_to_score(freq_codes_df_[code_pos]);
  4. if (kPrintDebug0) {
  5. printf("code:%ld, probability:%.9f, log score:%.3f, final score: %.3f\n",
  6. code_pos, freq_codes_df_[code_pos], log_score, final_score);
  7. }
  8. freq_codes_[code_pos] = static_cast<LmaScoreType>(final_score);
  9. }

这个for循环遍历freq_codes_df_数组,计算每个元素的自然对数(然而并未用到),调用convert_psb_to_score方法计算final_score更新到freq_codes_数组中:

  1. (gdb) p *freq_codes_@256
  2. $8 = {14978, 14732, 14555, 14403, 14276, 14166, 14069, 13976, 13895, 13831, 13772, 13720, 13670, 13622, 13586, 13557, 13537, 13516, 13493, 13469, 13450, 13435, 13422, 13406, 13386, 13367, 13349, 13334,
  3. 13317, 13303, 13289, 13273, 13256, 13235, 13208, 13181, 13156, 13132, 13108, 13088, 13075, 13060, 13042, 13024, 13007, 12990, 12967, 12944, 12917, 12890, 12863, 12834, 12799, 12763, 12728, 12693,
  4. 12661, 12625, 12598, 12564, 12520, 12468, 12416, 12353, 12289, 12231, 12184, 12140, 12105, 12068, 12034, 12005, 11981, 11954, 11920, 11880, 11837, 11795, 11750, 11699, 11658, 11624, 11588, 11541,
  5. 11501, 11456, 11412, 11369, 11332, 11298, 11270, 11245, 11222, 11199, 11178, 11151, 11123, 11091, 11063, 11019, 10958, 10925, 10894, 10865, 10836, 10808, 10783, 10758, 10734, 10707, 10679, 10651,
  6. 10621, 10590, 10556, 10520, 10481, 10440, 10398, 10353, 10306, 10261, 10212, 10163, 10115, 10068, 10020, 9970, 9919, 9867, 9816, 9767, 9717, 9669, 9621, 9578, 9538, 9501, 9465, 9429, 9391, 9350, 9308,
  7. 9266, 9225, 9190, 9155, 9126, 9102, 9080, 9063, 9048, 9033, 9018, 8997, 8978, 8956, 8934, 8911, 8887, 8857, 8827, 8789, 8750, 8715, 8685, 8656, 8625, 8601, 8582, 8563, 8545, 8525, 8506, 8493, 8479,
  8. 8467, 8455, 8444, 8431, 8418, 8403, 8382, 8357, 8328, 8295, 8263, 8232, 8204, 8174, 8145, 8110, 8081, 8056, 8025, 7990, 7956, 7922, 7891, 7868, 7846, 7823, 7805, 7789, 7773, 7756, 7735, 7709, 7688,
  9. 7667, 7640, 7614, 7588, 7561, 7533, 7506, 7474, 7435, 7391, 7342, 7293, 7245, 7204, 7157, 7104, 7056, 6997, 6941, 6896, 6852, 6801, 6737, 6632, 6510, 6378, 6228, 6077, 5950, 5795, 5663, 5512, 5432,
  10. 5341, 5236, 5107, 5030, 4934, 4883, 4794, 4761, 4640, 4526, 4397, 4044, 3522, 2385}

到此ngram信息构建就完成了,最终也就是初始化了一个freq_codes_数组,此数组在预测逻辑中会使用,ngram这块单独拿出来分析没什么意义,还是要结合在预测输入的过程才能体现其价值所在,本片文章在分析latinIME输入预测等整体流程的过程中将会持续更新,欢迎批评指正!
 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/繁依Fanyi0/article/detail/600983
推荐阅读
相关标签
  

闽ICP备14008679号