Submission and Evaluation

This evaluation has two tracks, an open track and a closed track. Participating teams obtain the test set and submit their evaluation files on the Tianchi platform. Each team has 5 submission attempts per day, and the leaderboard is automatically refreshed every day at 8:00 AM. The two tracks use separate submission entry points; the detailed rules are as follows:


In addition, participating teams should note the following:

  1. Teams may submit to both tracks or to only one of them, but must follow the requirements above and submit to the correct track. If a submission is made to the wrong track by mistake, contact us promptly to have it withdrawn.
  2. Training data must be openly obtainable, and its sources must be described in the evaluation report or otherwise disclosed to the organizers; closed-source data may not be used.
  3. Training data must not overlap with the test set.
  4. No team may inflate its ranking by submitting from multiple (alternate) accounts.
  5. The evaluation report must release the complete experimental code of the participating system, including the experiment configuration, logs, data used, and all other information needed to reproduce the results on the leaderboard.
  6. Teams whose results cannot be reproduced due to non-compliance with these rules will be disqualified and their competition results removed.

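As implied by the parsing logic in the scoring code below, each line of a submission file repeats the original text and marks every recognized entity inline as {entity text|TYPE}, where TYPE is one of PER, OFI, or BOOK. The illustration below is hypothetical (the sentence is invented for this example); it shows the raw text and the character-offset spans the scorer would recover from one annotated line:

Annotated line (hypothetical):
    {司马迁|PER}曾任{太史令|OFI},著有{史记|BOOK}。
Recovered raw text:
    司马迁曾任太史令,著有史记。
Recovered spans (begin, end, type), offsets into the raw text:
    PER (0, 3)    OFI (5, 8)    BOOK (11, 13)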
The scoring code used by the evaluation system is as follows:

import json
import sys

def dump_2_json(info, path):
    with open(path, 'w') as output_json_file:
        json.dump(info, output_json_file)

def report_error_msg(detail, showMsg, out_p):
    error_dict = dict()
    error_dict['errorDetail'] = detail
    error_dict['errorMsg'] = showMsg
    error_dict['score'] = 0
    error_dict['scoreJson'] = {}
    error_dict['success'] = False
    dump_2_json(error_dict, out_p)

def report_score(score, score_ref, out_p):
    result = dict()
    result['success'] = True
    result['score'] = score
    result['scoreJson'] = {'score': score, "recall": score_ref["recall"], "precision": score_ref["precision"]}
    dump_2_json(result, out_p)

# Split each line into its raw text and annotated entity spans
def parse(path):
    texts = []
    spans = []

    with open(path, 'r', encoding='utf-8') as f:
        i = 0
        for line in f:
            line = line.strip('\n ')
            line_text = ''
            line_spans = {t: [] for t in ENTITY_TYPES}
            typ = ''
            state = 0
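            # State machine over inline annotations of the form {entity text|TYPE}:
            # state 0 = plain text, state 1 = inside the entity text (after '{'),
            # state 2 = inside the type label (after '|')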

            for c in line:
                if state == 0:
                    if c == '{':
                        state = 1
                        begin = len(line_text)
                    else:
                        line_text += c
                elif state == 1:
                    if c == '|':
                        state = 2
                        end = len(line_text)
                    else:
                        line_text += c
                else:
                    if c == '}':
                        if typ not in ENTITY_TYPES:
                            raise Exception(f'Error in line {i} of file {path} : ' +
                                            'Parsed entity type is invalid. Must be in {' +
                                            ', '.join(ENTITY_TYPES) + '}. ' +
                                            'This might be caused by mishandled labels. Abort scoring.')
                        line_spans[typ].append((begin, end, typ))
                        state = 0
                        typ = ''
                    else:
                        typ += c

            if state != 0:
                raise Exception(f'Error in line {i} of file {path} : ' +
                                'Invalid entity label found. Abort scoring.')

            texts.append(line_text)
            spans.append(line_spans)

            i += 1
    return texts, spans

# Compute recall, precision and F1 from the entity counts
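# Example: get_rpf1(8, 10, 12) -> (80.0, 66.6667, 72.7273), i.e. recall,
# precision and F1 as percentages rounded to four decimal places.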
def get_rpf1(correct, num_gold, num_response):
    if correct == 0:
        return 0.0, 0.0, 0.0
    else:
        r = correct / num_gold          # recall
        p = correct / num_response      # precision
        f1 = 2.0 / (1.0 / r + 1.0 / p)  # F1
        return round(r * 100, 4), round(p * 100, 4), round(f1 * 100, 4)

if __name__ == "__main__":
    '''
      online evaluation
    '''
    in_param_path = sys.argv[1]
    out_path = sys.argv[2]

    # read the submission and answer file paths from the input parameter file (argv[1])
    with open(in_param_path, 'r') as load_f:
        input_params = json.load(load_f)

    # path to the gold-standard answer file
    standard_path = input_params["fileData"]["standardFilePath"]
    print("Read standard from %s" % standard_path)

    # path to the participant's submitted result file
    submit_path = input_params["fileData"]["userFilePath"]
    print("Read user submit file from %s" % submit_path)

    try:
        # evaluation logic
        ENTITY_TYPES = ['PER', 'OFI', 'BOOK']

        # parse each file into raw texts and annotated spans
        gold_texts, gold_spans = parse(standard_path)
        response_texts, response_spans = parse(submit_path)

        # the gold and response files must have the same number of lines
        if len(gold_texts) != len(response_texts):
            raise Exception(
                'Response must have same number of lines as gold. Abort scoring.')

        correct_all = {t: 0 for t in ENTITY_TYPES}
        gold_all = {t: 0 for t in ENTITY_TYPES}
        response_all = {t: 0 for t in ENTITY_TYPES}

        for i in range(len(gold_texts)):
            # the raw text must match line by line
            if gold_texts[i] != response_texts[i]:
                raise Exception(f'Response text is different from gold text at line {i}. ' +
                                'This might be caused by mishandled labels. Abort scoring.')
            for t in ENTITY_TYPES:
                g_span = set(gold_spans[i][t])
                r_span = set(response_spans[i][t])
                correct_all[t] += len(g_span & r_span)
                gold_all[t] += len(g_span)
                response_all[t] += len(r_span)

        r, p, f1 = get_rpf1(sum(correct_all.values()),
                            sum(gold_all.values()),
                            sum(response_all.values()))

        score = f1
        score_ref = {'recall': r, 'precision': p}

        report_score(score, score_ref, out_path)

    except Exception as e:
        report_error_msg(str(e), str(e), out_path)
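
As shown above, the scorer takes its inputs from a JSON parameter file (argv[1]) and writes its score JSON to argv[2]. Below is a minimal sketch of a local dry run, assuming the scoring code is saved as score.py and that you have local gold.txt and submit.txt files to test with (these file names are placeholders; the real gold file is only available on the evaluation platform):

import json
import subprocess

# Hypothetical local files; on the platform these paths are filled in automatically.
params = {
    "fileData": {
        "standardFilePath": "gold.txt",   # gold-standard annotated file
        "userFilePath": "submit.txt"      # your submission file
    }
}
with open("input_param.json", "w", encoding="utf-8") as f:
    json.dump(params, f)

# Run the scorer; it writes its result JSON to the second argument.
subprocess.run(["python", "score.py", "input_param.json", "out.json"], check=True)

with open("out.json", encoding="utf-8") as f:
    print(json.load(f))  # e.g. {'success': True, 'score': ..., 'scoreJson': {...}}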