Goal: compute the ROUGE-L score between pairs of sentences (the scripts below also compute BLEU).
#!/usr/bin/env python
#
# File Name : rouge.py
#
# Description : Computes ROUGE-L metric as described by Lin and Hovy (2004)
#
# Creation Date : 2015-01-07 06:03
# Author : Ramakrishna Vedantam <vrama91@vt.edu>

import numpy as np

def my_lcs(string, sub):
    """
    Calculates longest common subsequence for a pair of tokenized strings
    :param string : list of str : tokens from a string split using whitespace
    :param sub : list of str : shorter string, also split using whitespace
    :returns: length (int): length of the longest common subsequence between the two strings

    Note: my_lcs only gives the length of the longest common subsequence, not the actual LCS
    """
    if len(string) < len(sub):
        sub, string = string, sub

    lengths = [[0 for i in range(0, len(sub) + 1)] for j in range(0, len(string) + 1)]

    for j in range(1, len(sub) + 1):
        for i in range(1, len(string) + 1):
            if string[i - 1] == sub[j - 1]:
                lengths[i][j] = lengths[i - 1][j - 1] + 1
            else:
                lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1])

    return lengths[len(string)][len(sub)]

class Rouge():
    '''
    Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set
    '''
    def __init__(self):
        # vrama91: updated the value below based on discussion with Hovy
        self.beta = 1.2

    def calc_score(self, candidate, refs):
        """
        Compute ROUGE-L score given one candidate and references for an image
        :param candidate: list of str : single candidate sentence to be evaluated
        :param refs: list of str : COCO reference sentences for the particular image to be evaluated
        :returns score: float (ROUGE-L score for the candidate evaluated against references)
        """
        assert(len(candidate) == 1)
        assert(len(refs) > 0)
        prec = []
        rec = []

        # split into tokens
        token_c = candidate[0].split(" ")

        for reference in refs:
            # split into tokens
            token_r = reference.split(" ")
            # compute the longest common subsequence
            lcs = my_lcs(token_r, token_c)
            prec.append(lcs / float(len(token_c)))
            rec.append(lcs / float(len(token_r)))

        prec_max = max(prec)
        rec_max = max(rec)

        if prec_max != 0 and rec_max != 0:
            score = ((1 + self.beta**2) * prec_max * rec_max) / float(rec_max + self.beta**2 * prec_max)
        else:
            score = 0.0
        return score

    def compute_score(self, gts, res):
        """
        Computes ROUGE-L score given a set of reference and candidate sentences for the dataset
        Invoked by evaluate_captions.py
        :param gts: dict : reference sentences, "image name" key and list of tokenized sentences as value
        :param res: dict : candidate / test sentences, "image name" key and list of tokenized sentences as value
        :returns: np.array of per-sentence ROUGE-L scores (the original code returned the mean as well)
        """
        assert(gts.keys() == res.keys())
        imgIds = gts.keys()

        score = []
        for id in imgIds:
            hypo = res[id]
            ref = gts[id]
            score.append(self.calc_score(hypo, ref))

            # Sanity check.
            assert(type(hypo) is list)
            assert(len(hypo) == 1)
            assert(type(ref) is list)
            assert(len(ref) > 0)

        average_score = np.mean(np.array(score))
        # return average_score, np.array(score)
        # average_score is the mean ROUGE-L over all pairs; np.array(score) holds each pair's ROUGE-L
        return np.array(score)

    def method(self):
        return "Rouge"
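With beta = 1.2, calc_score is the LCS-based F-measure F = (1 + beta^2) * P * R / (R + beta^2 * P), where P = LCS / len(candidate tokens) and R = LCS / len(reference tokens), maximized over the references. As a quick sanity check, here is a minimal usage sketch (my own, not part of the original scripts): it borrows the toy dict from the Readfile comment further down and assumes the class above is saved as rougel.py, which is the module name the driver script imports.

from rougel import Rouge

gts = {0: ["i love china"], 1: ["today is a good day"]}      # references
res = {0: ["i love china too"], 1: ["today is a nice day"]}  # candidates

rouge_scorer = Rouge()
per_sentence = rouge_scorer.compute_score(gts, res)  # per-sentence ROUGE-L array
print(per_sentence)  # roughly [0.8798, 0.8]

For pair 0, the LCS is "i love china", so P = 3/4, R = 3/3, and F works out to about 0.88; for pair 1 both P and R are 0.8, and the F-measure equals them.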
#!/usr/bin/env python

# bleu_scorer.py
# David Chiang <chiang@isi.edu>

# Copyright (c) 2004-2006 University of Maryland. All rights
# reserved. Do not redistribute without permission from the
# author. Not for commercial use.

# Modified by:
# Hao Fang <hfang@uw.edu>
# Tsung-Yi Lin <tl483@cornell.edu>

'''Provides:
cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked().
'''

import copy
import math
from collections import defaultdict
import six
from six.moves import xrange as range

def precook(s, n=4, out=False):
    """Takes a string as input and returns an object that can be given to
    either cook_refs or cook_test. This is optional: cook_refs and cook_test
    can take string arguments as well."""
    words = s.split()
    counts = defaultdict(int)
    for k in range(1, n + 1):
        for i in range(len(words) - k + 1):
            ngram = tuple(words[i:i + k])
            counts[ngram] += 1
    return (len(words), counts)

def cook_refs(refs, eff=None, n=4):  ## lhuang: oracle will call with "average"
    '''Takes a list of reference sentences for a single segment
    and returns an object that encapsulates everything that BLEU
    needs to know about them.'''

    reflen = []
    maxcounts = {}
    for ref in refs:
        rl, counts = precook(ref, n)
        reflen.append(rl)
        for (ngram, count) in six.iteritems(counts):
            maxcounts[ngram] = max(maxcounts.get(ngram, 0), count)

    # Calculate effective reference sentence length.
    if eff == "shortest":
        reflen = min(reflen)
    elif eff == "average":
        reflen = float(sum(reflen)) / len(reflen)

    ## lhuang: N.B.: leave reflen computation to the very end!!
    ## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design)

    return (reflen, maxcounts)

def cook_test(test, reflen_refmaxcounts, eff=None, n=4):
    '''Takes a test sentence and returns an object that
    encapsulates everything that BLEU needs to know about it.'''

    reflen, refmaxcounts = reflen_refmaxcounts
    testlen, counts = precook(test, n, True)

    result = {}

    # Calculate effective reference sentence length.
    if eff == "closest":
        result["reflen"] = min((abs(l - testlen), l) for l in reflen)[1]
    else:  ## i.e., "average" or "shortest" or None
        result["reflen"] = reflen

    result["testlen"] = testlen
    result["guess"] = [max(0, testlen - k + 1) for k in range(1, n + 1)]
    result['correct'] = [0] * n
    for (ngram, count) in six.iteritems(counts):
        result["correct"][len(ngram) - 1] += min(refmaxcounts.get(ngram, 0), count)

    return result

class BleuScorer(object):
    """Bleu scorer.
    """

    __slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen"
    # special_reflen is used in oracle (proportional effective ref len for a node).

    def copy(self):
        ''' copy the refs.'''
        new = BleuScorer(n=self.n)
        new.ctest = copy.copy(self.ctest)
        new.crefs = copy.copy(self.crefs)
        new._score = None
        return new

    def __init__(self, test=None, refs=None, n=4, special_reflen=None):
        ''' singular instance '''
        self.n = n
        self.crefs = []
        self.ctest = []
        self.cook_append(test, refs)
        self.special_reflen = special_reflen

    def cook_append(self, test, refs):
        '''called by constructor and __iadd__ to avoid creating new instances.'''
        if refs is not None:
            self.crefs.append(cook_refs(refs))
            if test is not None:
                cooked_test = cook_test(test, self.crefs[-1])
                self.ctest.append(cooked_test)  ## N.B.: -1
            else:
                self.ctest.append(None)  # lens of crefs and ctest have to match
        self._score = None  ## need to recompute

    def ratio(self, option=None):
        self.compute_score(option=option)
        return self._ratio

    def score_ratio(self, option=None):
        '''return (bleu, len_ratio) pair'''
        return (self.fscore(option=option), self.ratio(option=option))

    def score_ratio_str(self, option=None):
        return "%.4f (%.2f)" % self.score_ratio(option)

    def reflen(self, option=None):
        self.compute_score(option=option)
        return self._reflen

    def testlen(self, option=None):
        self.compute_score(option=option)
        return self._testlen

    def retest(self, new_test):
        if type(new_test) is str:
            new_test = [new_test]
        assert len(new_test) == len(self.crefs), new_test
        self.ctest = []
        for t, rs in zip(new_test, self.crefs):
            self.ctest.append(cook_test(t, rs))
        self._score = None
        return self

    def rescore(self, new_test):
        ''' replace test(s) with new test(s), and returns the new score.'''
        return self.retest(new_test).compute_score()

    def size(self):
        assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
        return len(self.crefs)

    def __iadd__(self, other):
        '''add an instance (e.g., from another sentence).'''
        if type(other) is tuple:
            ## avoid creating new BleuScorer instances
            self.cook_append(other[0], other[1])
        else:
            assert self.compatible(other), "incompatible BLEUs."
            self.ctest.extend(other.ctest)
            self.crefs.extend(other.crefs)
            self._score = None  ## need to recompute
        return self

    def compatible(self, other):
        return isinstance(other, BleuScorer) and self.n == other.n

    def single_reflen(self, option="average"):
        return self._single_reflen(self.crefs[0][0], option)

    def _single_reflen(self, reflens, option=None, testlen=None):
        if option == "shortest":
            reflen = min(reflens)
        elif option == "average":
            reflen = float(sum(reflens)) / len(reflens)
        elif option == "closest":
            reflen = min((abs(l - testlen), l) for l in reflens)[1]
        else:
            assert False, "unsupported reflen option %s" % option
        return reflen

    def recompute_score(self, option=None, verbose=0):
        self._score = None
        return self.compute_score(option, verbose)

    def compute_score(self, option=None, verbose=0):
        n = self.n
        small = 1e-9
        tiny = 1e-15  ## so that if guess is 0 still return 0
        bleu_list = [[] for _ in range(n)]

        if self._score is not None:
            return self._score

        if option is None:
            option = "average" if len(self.crefs) == 1 else "closest"

        self._testlen = 0
        self._reflen = 0
        totalcomps = {'testlen': 0, 'reflen': 0, 'guess': [0] * n, 'correct': [0] * n}

        # for each sentence
        for comps in self.ctest:
            testlen = comps['testlen']
            self._testlen += testlen

            if self.special_reflen is None:  ## need computation
                reflen = self._single_reflen(comps['reflen'], option, testlen)
            else:
                reflen = self.special_reflen

            self._reflen += reflen

            for key in ['guess', 'correct']:
                for k in range(n):
                    totalcomps[key][k] += comps[key][k]

            # append per image bleu score
            bleu = 1.
            for k in range(n):
                bleu *= (float(comps['correct'][k]) + tiny) \
                        / (float(comps['guess'][k]) + small)
                bleu_list[k].append(bleu ** (1. / (k + 1)))
            ratio = (testlen + tiny) / (reflen + small)  ## N.B.: avoid zero division
            if ratio < 1:
                for k in range(n):
                    bleu_list[k][-1] *= math.exp(1 - 1 / ratio)

            if verbose > 1:
                print(comps, reflen)

        totalcomps['reflen'] = self._reflen
        totalcomps['testlen'] = self._testlen

        bleus = []
        bleu = 1.
        for k in range(n):
            bleu *= float(totalcomps['correct'][k] + tiny) \
                    / (totalcomps['guess'][k] + small)
            bleus.append(bleu ** (1. / (k + 1)))
        ratio = (self._testlen + tiny) / (self._reflen + small)  ## N.B.: avoid zero division
        if ratio < 1:
            for k in range(n):
                bleus[k] *= math.exp(1 - 1 / ratio)

        if verbose > 0:
            print(totalcomps)
            print("ratio:", ratio)

        self._score = bleus
        return self._score, bleu_list
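Two implementation details worth noting: tiny and small keep the precision fractions well defined when a count is zero, and the length factor exp(1 - 1/ratio) with ratio ≈ testlen/reflen is the standard brevity penalty exp(1 - reflen/testlen), applied only when the candidate is shorter than the effective reference. A minimal direct-usage sketch (my own, not from the post) for a single sentence pair:

from bleu_scorer import BleuScorer

scorer = BleuScorer(n=4)
scorer += ("today is a nice day", ["today is a good day"])  # (hypothesis, references)

score, scores = scorer.compute_score(option='closest')
print(score)   # corpus-level [BLEU-1, BLEU-2, BLEU-3, BLEU-4]
print(scores)  # per-sentence score lists, one list per n-gram order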
#!/usr/bin/env python
#
# File Name : bleu.py
#
# Description : Wrapper for BLEU scorer.
#
# Creation Date : 06-01-2015
# Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT
# Authors : Hao Fang <hfang@uw.edu> and Tsung-Yi Lin <tl483@cornell.edu>

from bleu_scorer import BleuScorer

class Bleu:
    def __init__(self, n=4):
        # by default, compute BLEU scores up to 4-grams
        self._n = n
        self._hypo_for_image = {}
        self.ref_for_image = {}

    def compute_score(self, gts, res):
        assert(gts.keys() == res.keys())
        imgIds = gts.keys()

        bleu_scorer = BleuScorer(n=self._n)
        for id in imgIds:
            hypo = res[id]
            ref = gts[id]

            # Sanity check.
            assert(type(hypo) is list)
            assert(len(hypo) == 1)
            assert(type(ref) is list)
            assert(len(ref) >= 1)

            bleu_scorer += (hypo[0], ref)

        # score, scores = bleu_scorer.compute_score(option='shortest')
        score, scores = bleu_scorer.compute_score(option='closest', verbose=0)
        # score, scores = bleu_scorer.compute_score(option='average', verbose=1)

        # return score, scores  # score: corpus-level BLEU-1..4; scores: per-sentence lists
        return scores[3]  # scores[3] is the per-sentence BLEU-4 list; this experiment only uses BLEU-4

    def method(self):
        return "Bleu"
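Because of the modified return statement, this wrapper hands back a list with one BLEU-4 value per sentence pair rather than the usual (score, scores) tuple. A short sketch of how it is called (my own, reusing the same toy dicts as the ROUGE-L example above):

from bleu import Bleu

gts = {0: ["i love china"], 1: ["today is a good day"]}
res = {0: ["i love china too"], 1: ["today is a nice day"]}

bleu4_per_sentence = Bleu(4).compute_score(gts, res)
print(bleu4_per_sentence)  # one BLEU-4 value per id, in imgIds order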
from bleu import Bleu
from rougel import Rouge
import json

# Function:
# 1) Compute ROUGE-L and BLEU for the sentence pairs stored line-by-line in two txt files
#    (the line-based txt must first be converted into dict form; see the previous post).
# 2) Sum ROUGE-L + BLEU for each pair and sort the sums from high to low.

bleu_scorer = Bleu(4)
rouge_scorer = Rouge()

# Convert a file stored one sentence per line into dict form,
# e.g. groundtruth = {0: ["i love china"], 1: ["today is a good day"]}
# {}: dict, keys start from 0
def Readfile(file_name):
    line_list = []
    dic = {}
    fr = open(file_name, "r")
    line_number = 0
    for line in fr.readlines():
        line = line.strip()
        if len(line) != 0:
            line_list.append(line)
            dic[line_number] = [line]
            line_number += 1
    fr.close()
    # fw.write(json.dumps(dic))
    return dic

if __name__ == "__main__":
    raw_name = "/home/qtxu/semeval_data/sentences/restaurants_16.train"  # original data
    bt_name = "/home/qtxu/semeval_data/bt/french/sentences/restaurants_16.train"  # back-translated data
    write_raw_name = "/home/qtxu/semeval_data/Ten_Metric/new_order_sentence/french/r16_fr_neworder.txt"
    write_bt_name = "/home/qtxu/semeval_data/Ten_Metric/new_bt_order_sentence/french/r16_bt_fr_neworder.txt"  # reordered back-translated data

    raw_dic = Readfile(raw_name)
    print(raw_dic)
    bt_dic = Readfile(bt_name)

    Bleu_value = bleu_scorer.compute_score(raw_dic, bt_dic)    # per-sentence BLEU-4 list
    Rougle_value = rouge_scorer.compute_score(raw_dic, bt_dic) # per-sentence ROUGE-L array
    # print(Bleu_value)
    # print(Rougle_value)

    list_final = []  # holds the sum of the two metrics for each sentence pair
    for index, item in enumerate(Bleu_value):
        list_final.append(item + Rougle_value[index])  # add the two metrics (BLEU and ROUGE-L)
    # print(list_final)
    # print("sorted result:", sorted(list_final))  # sorted() defaults to ascending order

    # reverse=True gives descending order; sorted_id holds 0-based indices
    sorted_id = sorted(range(len(list_final)), key=lambda k: list_final[k], reverse=True)
    # print(sorted_id)

    result = [raw_dic[i][0] for i in sorted_id]  # sentences from the original txt, in sorted_id order
    # print(result)

    # fw_raw = open(write_raw_name, "w")
    # for i in sorted_id:
    #     fw_raw.write(raw_dic[i][0] + "\n")

    fw_bt = open(write_bt_name, "w")
    for i in sorted_id:
        fw_bt.write(bt_dic[i][0] + "\n")
        # print(bt_dic[i][0])
    fw_bt.close()
    print("write over")
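The ranking step is easier to see in isolation. A toy sketch of the same idiom (the numbers are made up): sort index positions by their combined score in descending order, so the best-matching pair comes first.

list_final = [1.2, 0.4, 1.7]  # pretend BLEU-4 + ROUGE-L sums for 3 sentence pairs
sorted_id = sorted(range(len(list_final)), key=lambda k: list_final[k], reverse=True)
print(sorted_id)  # [2, 0, 1] -> pair 2 has the highest combined score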