赞
踩
N-gram信息的构建在ngram.cpp中进行构建:
- bool NGram::build_unigram(LemmaEntry *lemma_arr, size_t lemma_num,
- LemmaIdType next_idx_unused) {
- ...
- //1、初始化freqs数组,lemma_arr数组元素的idx_by_hz字段对应freqs的索引
- //2、计算total_freq
- for (size_t pos = 0; pos < lemma_num; pos++) {
- if (lemma_arr[pos].idx_by_hz == idx_now)
- continue;
- idx_now++;
- assert(lemma_arr[pos].idx_by_hz == idx_now);
- freqs[idx_now] = lemma_arr[pos].freq;
- if (freqs[idx_now] <= 0)
- freqs[idx_now] = 0.3;
- total_freq += freqs[idx_now];
- }
- ...
-
- //更新freqs数组并计算最大freq,计算方式为freq/total_freq
- for (size_t pos = 0; pos < idx_num_; pos++) {
- freqs[pos] = freqs[pos] / total_freq;
- assert(freqs[pos] > 0);
- if (freqs[pos] > max_freq)
- max_freq = freqs[pos];
- }
- // calculate the code book
- ...
- //初始化freq_codes_df_数组,元素个数为256个
- for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) {
- bool found = true;
-
- while (found) {
- found = false;
- double cand = freqs[freq_pos];
- for (size_t i = 0; i < code_pos; i++)
- if (freq_codes_df_[i] == cand) {
- found = true;
- break;
- }
- if (found)
- freq_pos++;
- }
-
- freq_codes_df_[code_pos] = freqs[freq_pos];
- freq_pos++;
- }
-
- //对freq_codes_df_进行排序,共256个元素
- myqsort(freq_codes_df_, kCodeBookSize, sizeof(double), comp_double);
-
- ...
- iterate_codes(freqs, idx_num_, freq_codes_df_, lma_freq_idx_);
-
- delete [] freqs;
-
- ...
- for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) {
- double log_score = log(freq_codes_df_[code_pos]);
- float final_score = convert_psb_to_score(freq_codes_df_[code_pos]);
- if (kPrintDebug0) {
- printf("code:%ld, probability:%.9f, log score:%.3f, final score: %.3f\n",
- code_pos, freq_codes_df_[code_pos], log_score, final_score);
- }
- freq_codes_[code_pos] = static_cast<LmaScoreType>(final_score);
- }
-
- initialized_ = true;
- return true;
- }

目前对LatinIME中的ngram还没有一个整体的了解,因此针对ngram信息构建流程分析目前先按照代码执行次序进行分析,在分析中进一步增加了解,先来看下第一个for循环:
- //1、初始化freqs数组,lemma_arr数组元素的idx_by_hz字段对应freqs的索引
- //2、计算total_freq
- for (size_t pos = 0; pos < lemma_num; pos++) {
- if (lemma_arr[pos].idx_by_hz == idx_now)
- continue;
- idx_now++;
- assert(lemma_arr[pos].idx_by_hz == idx_now);
- freqs[idx_now] = lemma_arr[pos].freq;
- if (freqs[idx_now] <= 0)
- freqs[idx_now] = 0.3;
- total_freq += freqs[idx_now];
- }
lemma_num是一个unsigned long类型变量,它的值在这里等于65101,也就是lemma_arr的长度,第八行可以看到,从lemma_arr数组中取出freq字段赋值给freqs数组,同时计算total_freq,第零个元素的freq是=0.3的(第十行),这个for循环实际上是为了初始化freqs数组,此时的freqs数组元素都还是lemma_arr数组中的freq字段,无序也未经抓换的,但是紧接着往下就是对freqs数组进行转换了:
- //更新freqs数组并计算最大freq,计算方式为freq/total_freq
- for (size_t pos = 0; pos < idx_num_; pos++) {
- freqs[pos] = freqs[pos] / total_freq;
- assert(freqs[pos] > 0);
- if (freqs[pos] > max_freq)
- max_freq = freqs[pos];
- }
在这个循环中使用上一个for循环计算出来的total_freq来更新freqs数组内容,同时找出max_freq变量,经过这一步转换freqs数组内容为(前十个元素为例):
- (gdb) p *freqs@10
- $19 = {3.1514010662811756e-09, 2.6102481776047228e-06, 0.0014117510264284483, 4.2135427367586289e-05, 3.5443016051193335e-08, 6.6320132654683491e-05, 5.0999042085147344e-08, 0.00027250210157388183,
- 3.2141248599444556e-06, 0.00028112528687696588}
然后下一个for循环中取出freqs数组256个不重复的元素并初始化给了数组freq_codes_df_:
- for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) {
- bool found = true;
-
- while (found) {
- found = false;
- double cand = freqs[freq_pos];
- for (size_t i = 0; i < code_pos; i++)
- if (freq_codes_df_[i] == cand) {
- found = true;
- break;
- }
- if (found)
- freq_pos++;
- }
-
- freq_codes_df_[code_pos] = freqs[freq_pos];
- freq_pos++;
- }

初始化完成后又对该数组进行了排序:
myqsort(freq_codes_df_, kCodeBookSize, sizeof(double), comp_double);
排序完成后调用了iterator_code方法:
- void iterate_codes(double freqs[], size_t num, double code_book[],
- CODEBOOK_TYPE *code_idx) {
- size_t iter_num = 0;
- double delta_last = 0;
- do {
- size_t changed = update_code_idx(freqs, num, code_book, code_idx);
-
- double delta = recalculate_kernel(freqs, num, code_book, code_idx);
-
- if (kPrintDebug0) {
- printf("---Unigram codebook iteration: %ld : %ld, %.9f\n",
- iter_num, changed, delta);
- }
- iter_num++;
-
- if (iter_num > 1 &&
- (delta == 0 || fabs(delta_last - delta)/fabs(delta) < 0.000000001))
- break;
- delta_last = delta;
- } while (true);
- }

这个方法主要是调用了update_code_idx和recalculate_kernel方法,先来看update_code_idx方法:
- size_t update_code_idx(double freqs[], size_t num, double code_book[],
- CODEBOOK_TYPE *code_idx) {
- //使用code_book数组中的元素索引更新code_idx指针数组
- size_t changed = 0;
- for (size_t pos = 0; pos < num; pos++) {
- CODEBOOK_TYPE idx;
- idx = qsearch_nearest(code_book, freqs[pos], 0, kCodeBookSize - 1);
- if (idx != code_idx[pos])
- changed++;
- code_idx[pos] = idx;
- }
- return changed;
- }
根据函数名称就大概能了解这个函数做了什么,更新code_idx数组,这个数组类型是CODEBOOK_TYPE,定义为:
typedef unsigned char CODEBOOK_TYPE;
数组code_idx中的元素值来自一个二分查找方法qsearch_nearest(),传入的参数为code_book和freqs数组,freqs数组前面说过了是经过freq/total_freq转化过的数组,那code_book数组根据调用栈可以知道它就是前面的freq_codes_df_数组并且是经过排序的,那么他们是如何进行二分查找呢?
- // Find the index of the code value which is nearest to the given freq
- int qsearch_nearest(double code_book[], double freq, int start, int end) {
- //根据freq查找元素,离给定freq最近的freq对应的元素索引进行返回
- if (start == end)
- return start;
- //查找到了,但是根据distance来判断返回前一个还是后一个
- if (start + 1 == end) {
- if (distance(freq, code_book[end]) > distance(freq, code_book[start]))
- return start;
- return end;
- }
-
- int mid = (start + end) / 2;
- //继续二分查找
- if (code_book[mid] > freq)
- return qsearch_nearest(code_book, freq, start, mid);
- else
- return qsearch_nearest(code_book, freq, mid, end);
- }

start和end分别是0和255,查找的对象是freq变量也就是freqs[pos],查找的范围是code_book数组,如果start+1 != end就一直二分查找,直到落到两个相邻元素的区间内,判断离哪个元素比较近就返回较近的哪个元素的索引,最终code_idx数组中存储的是freqs数组中freq离code_book数组中每个freq比较近的元素索引,这样code_idx数组就初始化完成了,iterator_codes中还调用了recalculate_kernel方法:
- double recalculate_kernel(double freqs[], size_t num, double code_book[],
- CODEBOOK_TYPE *code_idx) {
- double ret = 0;
- //初始化容器
- size_t *item_num = new size_t[kCodeBookSize];
- assert(item_num);
- memset(item_num, 0, sizeof(size_t) * kCodeBookSize);
-
- double *cb_new = new double[kCodeBookSize];
- assert(cb_new);
- memset(cb_new, 0, sizeof(double) * kCodeBookSize);
- //
- for (size_t pos = 0; pos < num; pos++) {
- ret += distance(freqs[pos], code_book[code_idx[pos]]);
-
- cb_new[code_idx[pos]] += freqs[pos];
- item_num[code_idx[pos]] += 1;
- }
-
- for (size_t code = 0; code < kCodeBookSize; code++) {
- assert(item_num[code] > 0);
- code_book[code] = cb_new[code] / item_num[code];
- }
-
- delete [] item_num;
- delete [] cb_new;
-
- return ret;
- }

整体上看,这个函数最终也是循环遍历kCodeBookSize(256)次,使用cb_new和item_num两个数组来跟新code_book数组,具体过程就不说了,最终更新后的code_book数组为(前十个元素为例):
- (gdb) p *code_book@10
- $1 = {5.635799479070388e-09, 1.1536144437805393e-08, 1.8915161279992252e-08, 2.345806003473422e-08, 2.6750659323917793e-08, 2.9549793110303631e-08, 3.2228569987532223e-08, 3.4500060174875951e-08,
- 3.5988886505094894e-08, 3.7613863559902464e-08}
综上,iterator_codes方法主要是更新code_book和code_idx数组的内容。最后这个for循环:
- for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) {
- double log_score = log(freq_codes_df_[code_pos]);
- float final_score = convert_psb_to_score(freq_codes_df_[code_pos]);
- if (kPrintDebug0) {
- printf("code:%ld, probability:%.9f, log score:%.3f, final score: %.3f\n",
- code_pos, freq_codes_df_[code_pos], log_score, final_score);
- }
- freq_codes_[code_pos] = static_cast<LmaScoreType>(final_score);
- }
这个for循环遍历freq_codes_df_数组,计算每个元素的自然对数(然而并未用到),调用convert_psb_to_score方法计算final_score更新到freq_codes_数组中:
- (gdb) p *freq_codes_@256
- $8 = {14978, 14732, 14555, 14403, 14276, 14166, 14069, 13976, 13895, 13831, 13772, 13720, 13670, 13622, 13586, 13557, 13537, 13516, 13493, 13469, 13450, 13435, 13422, 13406, 13386, 13367, 13349, 13334,
- 13317, 13303, 13289, 13273, 13256, 13235, 13208, 13181, 13156, 13132, 13108, 13088, 13075, 13060, 13042, 13024, 13007, 12990, 12967, 12944, 12917, 12890, 12863, 12834, 12799, 12763, 12728, 12693,
- 12661, 12625, 12598, 12564, 12520, 12468, 12416, 12353, 12289, 12231, 12184, 12140, 12105, 12068, 12034, 12005, 11981, 11954, 11920, 11880, 11837, 11795, 11750, 11699, 11658, 11624, 11588, 11541,
- 11501, 11456, 11412, 11369, 11332, 11298, 11270, 11245, 11222, 11199, 11178, 11151, 11123, 11091, 11063, 11019, 10958, 10925, 10894, 10865, 10836, 10808, 10783, 10758, 10734, 10707, 10679, 10651,
- 10621, 10590, 10556, 10520, 10481, 10440, 10398, 10353, 10306, 10261, 10212, 10163, 10115, 10068, 10020, 9970, 9919, 9867, 9816, 9767, 9717, 9669, 9621, 9578, 9538, 9501, 9465, 9429, 9391, 9350, 9308,
- 9266, 9225, 9190, 9155, 9126, 9102, 9080, 9063, 9048, 9033, 9018, 8997, 8978, 8956, 8934, 8911, 8887, 8857, 8827, 8789, 8750, 8715, 8685, 8656, 8625, 8601, 8582, 8563, 8545, 8525, 8506, 8493, 8479,
- 8467, 8455, 8444, 8431, 8418, 8403, 8382, 8357, 8328, 8295, 8263, 8232, 8204, 8174, 8145, 8110, 8081, 8056, 8025, 7990, 7956, 7922, 7891, 7868, 7846, 7823, 7805, 7789, 7773, 7756, 7735, 7709, 7688,
- 7667, 7640, 7614, 7588, 7561, 7533, 7506, 7474, 7435, 7391, 7342, 7293, 7245, 7204, 7157, 7104, 7056, 6997, 6941, 6896, 6852, 6801, 6737, 6632, 6510, 6378, 6228, 6077, 5950, 5795, 5663, 5512, 5432,
- 5341, 5236, 5107, 5030, 4934, 4883, 4794, 4761, 4640, 4526, 4397, 4044, 3522, 2385}
到此ngram信息构建就完成了,最终也就是初始化了一个freq_codes_数组,此数组在预测逻辑中会使用,ngram这块单独拿出来分析没什么意义,还是要结合在预测输入的过程才能体现其价值所在,本片文章在分析latinIME输入预测等整体流程的过程中将会持续更新,欢迎批评指正!
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。