当前位置:   article > 正文

层次短语模型之短语规则抽取_chiang的层次短语模型详细介绍

chiang的层次短语模型详细介绍

层次短语模型是David Chiang在短语模型基础之上提出来的模型,该模型属于形式化句法翻译模型。将普通短语模型拓展成了层次化的短语。例如“X1 和 X2”。

本文着重讲述层次短语模型的短语规则抽取模块,也就是如何从双语句对的训练集中抽取出“短语表”。


我们可以参看如下图,系统的总体框架图:

 

将系统中出现的物体都抽象成对象。大家可以通过命名就知道此点。


下面我们将对重要的子过程进行讲述:

1、LexTranslator词到词的翻译

底层的数据结构如下:

// One row of the lexical table: target word -> lexical translation probability.
typedef map<Word, Float> LexTableItem;
// Full word-to-word translation table: source word -> (target word -> probability).
typedef map<Word, map<Word, Float> > LexTable;
LexTable f2e_table_; // prob(0.0-1.0) from f to e (foreign-to-english direction)
LexTable e2f_table_; // reverse direction: e-to-f lexical probabilities

基本流程如下:



2、对齐一致性的抽取

对齐一致性的抽取采用了前缀和数组:通过比较源端区间与目标端投影区间内对齐点数目是否相等,判断该短语对是否满足对齐一致性。

// Enumerate every "tight" alignment-consistent source span up to the given
// length limits and record it (with its target projection) in m_consist_spans.
//
// Consistency is tested with prefix sums: a (src span, trg span) pair is
// consistent iff the number of alignment links inside the source span equals
// the number of links inside the projected target span, i.e. no link leaves
// the box. "Tight" additionally requires both source boundary words to carry
// at least one alignment link.
//
// src_limit / trg_limit: maximum source / target span lengths considered.
void Alignment::CreateTightConsistSpan(int src_limit, int trg_limit)
{
    //count the size of alignment of prefix
    // first pass: number of alignment links per source / target position
    vector<int> src_count, trg_count;
    src_count.resize(m_src_size, 0);
    trg_count.resize(m_trg_size, 0);
    for (size_t i = 0; i < (size_t)m_src_size; i++)
    {
        for (size_t j = 0; j < m_wa[i].size(); j++)
        {
            src_count[i]++;
            trg_count[m_wa[i][j]]++;
        }//end for j
    }//end for i
    // second pass: turn per-position counts into prefix sums, so the link
    // count of any span becomes a difference of two array entries
    for (size_t i = 1; i < src_count.size(); i++)
        src_count[i] += src_count[i - 1];
    for (size_t i = 1; i < trg_count.size(); i++)
        trg_count[i] += trg_count[i - 1];
    Alignment::Span trg;
    for (int begin = 0; begin < m_src_size; begin++)
    {
        // trg is widened incrementally as the source span grows to the right,
        // so it always holds the target projection of [begin, end]
        trg.first = MAX_INT;
        trg.second = MIN_INT;
        for (int dist = 1; dist <= src_limit && dist + begin - 1 < m_src_size; dist++)
        {
            int end = begin + dist - 1;
            // fold the links of the newly added source word into trg
            for (size_t i = 0; i < m_wa[end].size(); i++)
            {
                if (trg.first > m_wa[end][i])
                    trg.first = m_wa[end][i];
                if (trg.second < m_wa[end][i])
                    trg.second = m_wa[end][i];
            }
            if (trg.first > trg.second) //null alignment: no link seen yet in this span
                continue;
            if (trg.second - trg.first + 1 > trg_limit)
                continue;
            // f = (links inside source span) - (links inside target span);
            // zero means no link crosses the box boundary
            int f = src_count[end];
            if (begin != 0)
                f -= src_count[begin - 1];
            f -= trg_count[trg.second];
            if (trg.first != 0)
                f += trg_count[trg.first - 1];
            if (f == 0) //consistent to align
            {
                //tight consist, boundary words must have alignments
                if (m_wa[begin].size() != 0 && m_wa[end].size() != 0)
                    m_consist_spans[Alignment::Span(begin, end)] = trg;
            }
        }
    }
}

仔细研究代码,此段代码很高效!


3、Extractor的抽取规则模块讲解

  1. void Extractor::Extract(const string& src_file, const string& trg_file, const string& wa_file)
  2. {
  3. ifstream in_src, in_trg, in_wa;
  4. ReadFile(src_file, in_src);
  5. ReadFile(trg_file, in_trg);
  6. ReadFile(wa_file, in_wa);
  7. Log::Instance().Out() << "Starting to extract rule!" << endl;
  8. Log::Instance().TimeStart();
  9. map<string, Rule *> sent_rules;//store the rules extracted from a sentence
  10. map<string, Rule *> rule_map; //cache for store extracted but not yet output file
  11. string src, trg, wa;
  12. int part_file_id = 0;
  13. int sent_id = 0;
  14. int rule_count = 0;
  15. while (getline(in_src, src)
  16. && getline(in_trg, trg)
  17. && getline(in_wa, wa))
  18. {
  19. sent_id ++;
  20. SentPair sent;
  21. sent.SetSentId(sent_id - 1);
  22. if (sent.Init(src, trg, wa))
  23. sent.ExtractRules(sent_rules);
  24. else
  25. continue;
  26. rule_count += sent_rules.size();
  27. LocalCombine(sent_rules, rule_map);
  28. if ((int) rule_map.size() > StaticData::Instance().Capacity())
  29. {
  30. OutCache(m_part_file, part_file_id, e2f, rule_map);
  31. part_file_id++;
  32. }
  33. if (sent_id % 10000 == 0)
  34. {
  35. Log::Instance().Out() << "cur sent_id:" << sent_id <<endl;;
  36. }
  37. }
  38. OutCache(m_part_file, part_file_id, e2f, rule_map);
  39. in_src.close();
  40. in_trg.close();
  41. in_wa.close();
  42. Log::Instance().Out() << "end extracted rule in time (s):"
  43. << Log::Instance().TimeEnd() << endl;
  44. }

不断的对每一句话进行提取规则,然后加入到规则表中,如果规则表的数目超过了设定的值,将输出到临时文件中,并且清空规则表。经过这一步的处理之后,就得到了很多临时文件。

4、规则概率估算

1)合并所有的临时文件->一个e2f的文件A

2)对A进行排序

3)计算f2e的概率,并且生成f2e文件B

4)对B进行排序

5)计算e2f的概率,并且生成最终规则文件

5、抽取一个句对中所有的规则

  1. void SentPair::ExtractRules(std::map<string, Rule *>& rule_map)
  2. {
  3. SentenceMeta sm;
  4. sm.sent_id_ = this->sent_id_;
  5. sm.src_ = &src_;
  6. sm.trg_ = &trg_;
  7. StaticData::Instance().GetFeatureSet().Prepare(sm);
  8. // use cky-style algorithm to find all consistent rule
  9. for (int dist = 1; dist <= StaticData::Instance().SrcSpanLimit(); dist++)
  10. {
  11. for (size_t begin = 0; begin + dist - 1 < src_.size(); begin++)
  12. {
  13. pair<int,int> span;
  14. span.first = begin;
  15. span.second = begin + dist - 1;
  16. if (Log::Instance().IsVerbose(3))
  17. {
  18. Log::Instance().Out() << "\n deal span ("
  19. << span.first << ", " << span.second << ")" <<endl;
  20. }
  21. GetRule(span, rule_map);
  22. } //end begin
  23. } //end dist
  24. map<string, Rule *>::const_iterator citer;
  25. for (citer = rule_map.begin(); citer != rule_map.end(); citer++)
  26. StaticData::Instance().GetFeatureSet().Final(sm, *citer->second);
  27. }

抽取某一个span范围内的规则

// Extract all rules rooted at the given source span and merge them into
// rule_map (keyed by the rule string; duplicate rules accumulate counts via
// Rule::Add). Produces the fully lexical phrase pair (when the span is short
// enough) plus every rule obtained by replacing consistent sub-spans with
// variables.
void SentPair::GetRule(const pair<int,int>& span, map<string ,Rule *>& rule_map)
{
    // current span must be consist
    Alignment::SpanAlign::const_iterator citer;
    const Alignment::SpanAlign& cs = wa_->GetConsistSpans();
    map<string, Rule *>::iterator iter;
    citer = cs.find(span);
    if (citer == cs.end())
        return; // span is not alignment-consistent: nothing to extract
    // TODO support extract boundary expansion
    // full lexical rule trg_span shall be small than limit
    SentenceMeta sm;
    sm.sent_id_ = this->sent_id_;
    sm.src_ = &src_;
    sm.trg_ = &trg_;
    Context context;
    context.src_span_ = span;
    context.trg_span_ = citer->second; // projected target span of this source span
    //extract bp
    // fully lexical base phrase, only if the span fits the initial phrase limit
    if (span.second - span.first + 1 <= StaticData::Instance().InitPhraseLimit())
    {
        vector<pair<int,int> > empty; // no variable sub-spans for a base phrase
        Rule * rule = new Rule();
        CreateSrcTrg(span, empty, citer->second, empty, rule->src_rhs_, rule->trg_rhs_, rule->wa_);
        StaticData::Instance().GetFeatureSet().Traverse(sm, context, 1.0, *rule);
        //cout << "rule->fract_count_: " << rule->fract_count_ << endl;
        // merge into the map: first occurrence keeps the pointer,
        // duplicates fold their counts in and are freed
        iter = rule_map.find(rule->Key());
        if (iter == rule_map.end())
        {
            rule_map[rule->Key()] = rule;
        }
        else
        {
            iter->second ->Add(*rule);
            delete rule;
        }
    }
    //extract rules with variable
    // each element of var_span is one way of choosing variable sub-spans
    vector<vector<pair<int,int> > > var_span;
    EnumerateVar(span, var_span);
    vector<pair<int,int> > trg_childs_span;
    for (size_t i = 0; i < (int)var_span.size(); i++)
    {
        // project every chosen source sub-span to its target span
        trg_childs_span.resize(var_span[i].size());
        for (size_t j = 0; j < var_span[i].size(); j++)
            trg_childs_span[j] = cs.find(var_span[i][j])->second;
        Rule *rule = new Rule();
        CreateSrcTrg(span, var_span[i], citer->second, trg_childs_span, rule->src_rhs_, rule->trg_rhs_, rule->wa_);
        //cout << "rule->fract_count_: " << rule->fract_count_ << endl;
        //if (rule->m_wa.size() == var_span[i].size()) {//must have lexical alignment
        // discard rules where every alignment link belongs to a variable,
        // i.e. no lexical (terminal) alignment remains
        if (rule->AlignLinkCount() == var_span[i].size()) //must have lexical alignment
        {
            delete rule;
            continue;
        }
        context.src_var_spans_ = var_span[i];
        context.trg_var_spans_ = trg_childs_span;
        // fractional count: each variable configuration shares the span's unit count
        StaticData::Instance().GetFeatureSet().Traverse(sm, context, (Float) 1.0/var_span.size(), *rule);
        iter = rule_map.find(rule->Key());
        if (iter == rule_map.end())
        {
            rule_map[rule->Key()] = rule;
        }
        else
        {
            iter->second->Add(*rule);
            delete rule;
        }
    }
}


声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/Monodyee/article/detail/372759
推荐阅读
相关标签
  

闽ICP备14008679号