The code repository https://github.com/open-compass/human-eval is used to evaluate code-generation ability.
humaneval.py contains the evaluation code OpenCompass uses to score a large model's code generation.
Code path: opencompass/opencompass/datasets/humaneval.py
import os.path as osp
import tempfile
from typing import List

from opencompass.openicl.icl_evaluator import BaseEvaluator


class HumanEvaluator(BaseEvaluator):
    """Evaluator for HumanEval or EvalPlus."""

    def __init__(self,
                 k: List[int] = [1, 10, 100],
                 metric: str = 'HumanEval') -> None:
        self.metric = metric
        assert self.metric in ['HumanEval', 'EvalPlus']
        if self.metric == 'HumanEval':
            try:
                from human_eval.data import HUMAN_EVAL, write_jsonl
                from human_eval.evaluation import \
                    evaluate_functional_correctness
                self.write_jsonl = write_jsonl
                self.HUMAN_EVAL = HUMAN_EVAL
                self.eval = evaluate_functional_correctness
            except ImportError:
                raise ImportError(
                    'Please install human_eval use following steps:\n'
                    'git clone git@github.com:open-compass/human-eval.git\n'
                    'cd human-eval && pip install -e .')
        else:
            try:
                from evalplus.data import write_jsonl
                from evalplus.evaluate import evaluate
                self.write_jsonl = write_jsonl
                self.eval = evaluate
            except ImportError:
                raise ImportError(
                    'Please install evalplus use following steps:\n'
                    'git clone --recurse-submodules git@github.com:open-compass/human-eval.git\n'  # noqa
                    'cd human-eval\n'
                    'pip install -e .\n'
                    'pip install -e evalplus\n')
        self.k = k
        super().__init__()

    def score(self, predictions, references, test_set):
        prompts = [item['prompt'] for item in test_set]
        humaneval_preds = []
        if self.metric == 'HumanEval':
            # create json file in human_eval format
            for preds, refer in zip(predictions, references):
                # suits for two case
                # 1. use repeated dataset
                # 2. use `num_return_sequences` to generate multiple responses
                if not isinstance(preds, list):
                    preds = [preds]
                for pred in preds:
                    humaneval_preds.append({
                        'task_id': refer,
                        'completion': pred
                    })
            with tempfile.TemporaryDirectory() as tmp_dir:
                out_dir = osp.join(tmp_dir, 'human_eval.json')
                self.write_jsonl(out_dir, humaneval_preds)
                score = self.eval(out_dir,
                                  self.k,
                                  n_workers=4,
                                  timeout=3.0,
                                  problem_file=self.HUMAN_EVAL)
                return {f'humaneval_{k}': score[k] * 100 for k in score}
        else:
            for preds, refer, prompt in zip(predictions, references, prompts):
                if not isinstance(preds, list):
                    preds = [preds]
                for pred in preds:
                    humaneval_preds.append({
                        'task_id': refer,
                        'solution': prompt + pred
                    })
            with tempfile.TemporaryDirectory() as tmp_dir:
                out_dir = osp.join(tmp_dir, 'human_eval.jsonl')
                self.write_jsonl(out_dir, humaneval_preds)
                flags = dict(dataset='humaneval',
                             samples=out_dir,
                             base_only=None,
                             parallel=None,
                             i_just_wanna_run=None,
                             test_details=0.2,
                             min_time_limit=0.2,
                             gt_time_limit_factor=4.0,
                             mini=None)
                score = self.eval(flags)
                return {f'humaneval_plus_{k}': score[k] * 100 for k in score}
The HumanEvaluator class evaluates solutions to programming tasks and handles two evaluation datasets: HumanEval and EvalPlus. It inherits from BaseEvaluator, performs some configuration at initialization, and provides a method for processing and scoring prediction results.
The constructor takes two parameters. k defaults to [1, 10, 100] and defines the k values considered during evaluation, i.e. the pass@k levels to compute. metric specifies which dataset to evaluate against, defaulting to 'HumanEval', with 'EvalPlus' as the alternative. Based on the value of metric, the constructor dynamically loads the relevant modules and functions: for 'HumanEval' it imports the necessary components from the human_eval package, and for 'EvalPlus' it imports them from the evalplus package.
The score method takes three arguments: predictions, references, and test_set, which hold the model's prediction results, the reference task identifiers, and the test dataset, respectively. Snapshots of the key variables during a call are shown below.
predictions contains the code snippets generated by the model.
[’ “”" Check if in given list of numbers, are any
two numbers closer to each other than\n given threshold.\n >>>
has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>>
has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n
“”“\n for i in range(len(numbers)):\n for j in range(i+1,
len(numbers)):\n if abs(numbers[i] - numbers[j]) <
threshold:\n return True\n return False’, ’ “””
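To make the data flow concrete, below is a hedged sketch of calling score() directly on that single prediction. pred_0 and prompt_0 are placeholder names for the completion and prompt strings of the snapshot above, not identifiers from the real code. In the HumanEval branch each prediction is wrapped as {'task_id': ..., 'completion': ...}, written to a temporary JSONL file, and handed to evaluate_functional_correctness, whose pass@k results score() multiplies by 100:

pred_0 = '...'    # the has_close_elements completion shown in the snapshot above
prompt_0 = '...'  # the corresponding task prompt; only the EvalPlus branch uses it

predictions = [pred_0]
references = ['HumanEval/0']        # task_id of the has_close_elements problem
test_set = [{'prompt': prompt_0}]

evaluator = HumanEvaluator(k=[1], metric='HumanEval')
result = evaluator.score(predictions, references, test_set)
# e.g. {'humaneval_pass@1': 100.0} if the completion passes that task's unit tests
print(result)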