赞
踩
层次短语模型是David Chiang在短语模型基础之上提出来的模型,该模型属于形式化句法翻译模型。它将普通的短语拓展成了带有变量的层次化短语,例如“X1 和 X2”。
本文着重讲述层次短语模型的短语规则抽取模块,也就是如何从双语句对的训练集中抽取出“短语表”。
我们可以参看如下图,系统的总体框架图:
将系统中出现的物体都抽象成对象。大家可以通过命名就知道此点。
下面我们将对重要的子过程进行讲述:
1、LexTranslator词到词的翻译
底层的数据结构如下:
- // One row of a lexical table: maps a word to a Float score.
- typedef map<Word, Float> LexTableItem;
- // Full lexical table: for each word, the row of words it is paired with.
- typedef map<Word, LexTableItem> LexTable;
- LexTable f2e_table_; // prob(0.0-1.0) from f to e
- LexTable e2f_table_; // presumably the reverse direction (e to f) -- confirm
2、对齐一致性的抽取
对齐一致性的抽取采用了前缀和数组:分别统计源端和目标端每个位置上的对齐链接数并求前缀和,然后比较源端跨度内的链接数与其对应的目标端跨度内的链接数是否相等,若相等,则该跨度满足对齐一致性。
- // Fills m_consist_spans with every source span that is "tightly" consistent
- // with the word alignment, mapped to the target span it covers.
- //
- // src_limit: maximum number of source positions a span may cover.
- // trg_limit: maximum number of target positions the covered span may have.
- //
- // A source span [begin, end] is recorded when
- //   * at least one of its positions is aligned,
- //   * the number of links leaving it equals the number of links entering the
- //     target range [min, max] those links reach, and
- //   * both boundary positions begin and end are themselves aligned (the
- //     "tight" requirement).
- void Alignment::CreateTightConsistSpan(int src_limit, int trg_limit)
- {
-     // Link count per position; converted to inclusive prefix sums below so
-     // the number of links inside any range costs two lookups.
-     vector<int> src_prefix(m_src_size, 0);
-     vector<int> trg_prefix(m_trg_size, 0);
-
-     for (int s = 0; s < m_src_size; ++s)
-     {
-         const size_t link_num = m_wa[s].size();
-         src_prefix[s] += static_cast<int>(link_num);
-         for (size_t k = 0; k < link_num; ++k)
-             ++trg_prefix[m_wa[s][k]];
-     }
-
-     for (size_t s = 1; s < src_prefix.size(); ++s)
-         src_prefix[s] = src_prefix[s] + src_prefix[s - 1];
-     for (size_t t = 1; t < trg_prefix.size(); ++t)
-         trg_prefix[t] = trg_prefix[t] + trg_prefix[t - 1];
-
-     for (int begin = 0; begin < m_src_size; ++begin)
-     {
-         // Target range reached so far from [begin, end]; starts empty
-         // (first > second) and only ever grows as end advances.
-         Alignment::Span trg(MAX_INT, MIN_INT);
-
-         for (int end = begin; end < m_src_size && end - begin < src_limit; ++end)
-         {
-             for (size_t k = 0; k < m_wa[end].size(); ++k)
-             {
-                 if (m_wa[end][k] < trg.first)
-                     trg.first = m_wa[end][k];
-                 if (m_wa[end][k] > trg.second)
-                     trg.second = m_wa[end][k];
-             }
-
-             // Nothing aligned yet, or the target side is already too wide.
-             // The emptiness test must come first: it keeps the width
-             // expression from being evaluated on the MAX_INT/MIN_INT sentinels.
-             if (trg.first > trg.second || trg.second - trg.first + 1 > trg_limit)
-                 continue;
-
-             int src_links = src_prefix[end];
-             if (begin != 0)
-                 src_links -= src_prefix[begin - 1];
-
-             int trg_links = trg_prefix[trg.second];
-             if (trg.first != 0)
-                 trg_links -= trg_prefix[trg.first - 1];
-
-             // Unequal counts mean some link touches the target range from
-             // outside the source span, so the pair is not consistent.
-             if (src_links != trg_links)
-                 continue;
-
-             // Tight spans only: both boundary words must have alignments.
-             if (!m_wa[begin].empty() && !m_wa[end].empty())
-                 m_consist_spans[Alignment::Span(begin, end)] = trg;
-         }
-     }
- }
3、Extractor的抽取规则模块讲解
- // Extracts rules from a parallel corpus given as three line-aligned files.
- //
- // src_file: source-language sentences, one per line.
- // trg_file: target-language sentences, one per line.
- // wa_file:  word alignments, one line per sentence pair.
- //
- // The three files are read in lock-step. Rules of each sentence pair are merged
- // into an in-memory cache via LocalCombine; whenever the cache holds more
- // entries than StaticData::Instance().Capacity() it is handed to OutCache
- // together with the next part-file id. A final OutCache call is made after the
- // last sentence.
- // NOTE(review): OutCache presumably writes the cache to a part file and empties
- // it -- confirm against its definition.
- void Extractor::Extract(const string& src_file, const string& trg_file, const string& wa_file)
- {
-     ifstream in_src, in_trg, in_wa;
-     ReadFile(src_file, in_src);
-     ReadFile(trg_file, in_trg);
-     ReadFile(wa_file, in_wa);
-
-     Log::Instance().Out() << "Starting to extract rule!" << endl;
-     Log::Instance().TimeStart();
-
-     map<string, Rule *> sent_rules; // rules extracted from the current sentence pair
-     map<string, Rule *> rule_map;   // rules extracted but not yet written to a part file
-     string src, trg, wa;
-
-     int part_file_id = 0;
-     int sent_id = 0; // number of sentence pairs read so far
-
-     // Reading stops as soon as any one of the three files runs out of lines.
-     while (getline(in_src, src)
-            && getline(in_trg, trg)
-            && getline(in_wa, wa))
-     {
-         sent_id++;
-
-         SentPair sent;
-         sent.SetSentId(sent_id - 1); // ids handed to SentPair are zero-based
-
-         // A pair that fails to initialise contributes no rules; as before,
-         // it also skips the cache check and the progress message below.
-         if (!sent.Init(src, trg, wa))
-             continue;
-
-         sent.ExtractRules(sent_rules);
-         LocalCombine(sent_rules, rule_map);
-
-         if ((int) rule_map.size() > StaticData::Instance().Capacity())
-         {
-             OutCache(m_part_file, part_file_id, e2f, rule_map);
-             part_file_id++;
-         }
-
-         if (sent_id % 10000 == 0)
-         {
-             Log::Instance().Out() << "cur sent_id:" << sent_id << endl;
-         }
-     }
-     // Hand over whatever is still cached after the last sentence.
-     OutCache(m_part_file, part_file_id, e2f, rule_map);
-
-     in_src.close();
-     in_trg.close();
-     in_wa.close();
-
-     Log::Instance().Out() << "end extracted rule in time (s):"
-                           << Log::Instance().TimeEnd() << endl;
- }
4、规则概率估算
1)合并所有的临时文件->一个e2f的文件A
2)对A进行排序
3)计算f2e的概率,并且生成f2e文件B
4)对B进行排序
5)计算e2f的概率,并且生成最终规则文件
5、抽取一个句对中所有的规则
- // Extracts every rule of this sentence pair into rule_map.
- //
- // Source spans are visited shortest first, up to
- // StaticData::Instance().SrcSpanLimit() words, and each one is passed to
- // GetRule. The feature set is given Prepare() before the enumeration and
- // Final() on every rule found in rule_map afterwards.
- void SentPair::ExtractRules(std::map<string, Rule *>& rule_map)
- {
-     SentenceMeta sm;
-     sm.src_ = &src_;
-     sm.trg_ = &trg_;
-     sm.sent_id_ = this->sent_id_;
-     StaticData::Instance().GetFeatureSet().Prepare(sm);
-
-     // CKY-style sweep: outer loop over span length, inner loop over start.
-     for (int len = 1; len <= StaticData::Instance().SrcSpanLimit(); ++len)
-     {
-         for (size_t first = 0; first + len - 1 < src_.size(); ++first)
-         {
-             const pair<int,int> span(static_cast<int>(first),
-                                      static_cast<int>(first + len - 1));
-
-             if (Log::Instance().IsVerbose(3))
-             {
-                 Log::Instance().Out() << "\n deal span ("
-                                       << span.first << ", " << span.second << ")" << endl;
-             }
-
-             GetRule(span, rule_map);
-         }
-     }
-
-     // Let the feature set finish every rule currently held in the map.
-     for (map<string, Rule *>::const_iterator it = rule_map.begin();
-          it != rule_map.end(); ++it)
-     {
-         StaticData::Instance().GetFeatureSet().Final(sm, *(it->second));
-     }
- }
- // Stores rule in rule_map under rule->Key(). If a rule with the same key is
- // already present, the new rule is folded into it with Add() and deleted.
- // Either way the caller must not touch rule afterwards.
- static void AddOrMergeRule(Rule *rule, map<string, Rule *>& rule_map)
- {
-     // insert() performs one lookup and reports whether the key already existed.
-     pair<map<string, Rule *>::iterator, bool> slot =
-         rule_map.insert(map<string, Rule *>::value_type(rule->Key(), rule));
-     if (!slot.second)
-     {
-         slot.first->second->Add(*rule);
-         delete rule;
-     }
- }
-
- // Extracts all rules rooted at one source span and adds them to rule_map.
- //
- // span:     inclusive source span (first, second). Nothing is extracted when
- //           the span is not in the alignment's consistent-span table.
- // rule_map: receives the rules, keyed by Rule::Key(); rules with an existing
- //           key are merged into the stored rule.
- //
- // Two kinds of rules are produced:
- //   1. the fully lexical rule, when the span is no longer than
- //      StaticData::Instance().InitPhraseLimit();
- //   2. one rule per variable configuration returned by EnumerateVar, skipping
- //      configurations whose alignment link count equals the number of
- //      variables (the "must have lexical alignment" check).
- // Each rule of kind 2 is traversed with weight 1 / (number of configurations).
- void SentPair::GetRule(const pair<int,int>& span, map<string ,Rule *>& rule_map)
- {
-     // current span must be consist
-     const Alignment::SpanAlign& cs = wa_->GetConsistSpans();
-     Alignment::SpanAlign::const_iterator citer = cs.find(span);
-     if (citer == cs.end())
-         return;
-
-     // TODO support extract boundary expansion
-     // full lexical rule trg_span shall be small than limit
-     SentenceMeta sm;
-     sm.sent_id_ = this->sent_id_;
-     sm.src_ = &src_;
-     sm.trg_ = &trg_;
-     Context context;
-     context.src_span_ = span;
-     context.trg_span_ = citer->second;
-
-     // Fully lexical rule (no variables).
-     if (span.second - span.first + 1 <= StaticData::Instance().InitPhraseLimit())
-     {
-         vector<pair<int,int> > empty;
-         Rule *rule = new Rule();
-         CreateSrcTrg(span, empty, citer->second, empty, rule->src_rhs_, rule->trg_rhs_, rule->wa_);
-         StaticData::Instance().GetFeatureSet().Traverse(sm, context, 1.0, *rule);
-         AddOrMergeRule(rule, rule_map);
-     }
-
-     // Rules with variables: one candidate per configuration of sub-spans.
-     vector<vector<pair<int,int> > > var_span;
-     EnumerateVar(span, var_span);
-     vector<pair<int,int> > trg_childs_span;
-     for (size_t i = 0; i < var_span.size(); i++)
-     {
-         // Look up the target side of every variable. A sub-span missing from
-         // the consistent-span table cannot be projected, so the whole
-         // configuration is skipped rather than dereferencing end().
-         trg_childs_span.resize(var_span[i].size());
-         bool all_found = true;
-         for (size_t j = 0; j < var_span[i].size(); j++)
-         {
-             Alignment::SpanAlign::const_iterator child = cs.find(var_span[i][j]);
-             if (child == cs.end())
-             {
-                 all_found = false;
-                 break;
-             }
-             trg_childs_span[j] = child->second;
-         }
-         if (!all_found)
-             continue;
-
-         Rule *rule = new Rule();
-         CreateSrcTrg(span, var_span[i], citer->second, trg_childs_span, rule->src_rhs_, rule->trg_rhs_, rule->wa_);
-
-         // must have lexical alignment: drop the rule when its link count
-         // equals the number of variables.
-         if (rule->AlignLinkCount() == var_span[i].size())
-         {
-             delete rule;
-             continue;
-         }
-
-         context.src_var_spans_ = var_span[i];
-         context.trg_var_spans_ = trg_childs_span;
-         // The weight is divided by the number of enumerated configurations,
-         // including any that were skipped above.
-         StaticData::Instance().GetFeatureSet().Traverse(sm, context, static_cast<Float>(1.0) / var_span.size(), *rule);
-
-         AddOrMergeRule(rule, rule_map);
-     }
- }
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。