# Source code for gec_metrics.meta_eval.gjg

import glob
import os
from gec_metrics.metrics.base import MetricBase
from scipy.stats import pearsonr, spearmanr
from dataclasses import dataclass
from .base import MetaEvalBase
import xml.etree.ElementTree as ET
import itertools
from .utils import read_lines

class MetaEvalGJG(MetaEvalBase):
    '''Meta-evaluation against the GJG15 human judgments.

    The human scores come from Table 3 of https://aclanthology.org/D15-1052.pdf
    and the sentence-level rankings from ``meta_eval_data/GJG15/judgments.xml``.
    System outputs and references are read from ``meta_eval_data/conll14/``.
    Both directories are searched for recursively below the current working
    directory when the class is instantiated.
    '''
    MODELS = [
        'AMU', 'RAC', 'CAMB', 'CUUI', 'POST', 'UFC', 'PKU',
        'UMC', 'IITB', 'SJTU', 'INPUT', 'NTHU', 'IPN',
    ]
    SCORE_ID = ['ew', 'ts']

    @dataclass
    class GJGSystemCorrOutput(MetaEvalBase.Output):
        '''The dataclass to store the system-level meta-evaluation results.

        Args:
            ew (MetaEvalBase.Corr): The correlation using
                Expected Wins-based human evaluation.
            ts (MetaEvalBase.Corr): The correlation using
                TrueSkill-based human evaluation.
        '''
        ew: MetaEvalBase.Corr = None
        ts: MetaEvalBase.Corr = None

    @dataclass
    class GJGSentenceCorrOutput(MetaEvalBase.Output):
        '''The dataclass to store the sentence-level meta-evaluation results.

        Args:
            corr (MetaEvalBase.Corr): The correlation with the sentence-level
                human rankings.
        '''
        corr: MetaEvalBase.Corr = None

    @dataclass
    class GJGWindowAnalysisSystemCorrOutput(MetaEvalBase.Output):
        '''The dataclass to store the window-analysis results.

        Args:
            ew (dict): The window-analysis result using
                Expected Wins-based human evaluation.
            ts (dict): The window-analysis result using
                TrueSkill-based human evaluation.

        NOTE(review): the dictionary layout is defined by
        ``MetaEvalBase.window_analysis_system`` and is not visible here.
        '''
        ew: dict = None
        ts: dict = None

    def __init__(self, config: MetaEvalBase.Config = None):
        '''Initialize the base class and eagerly load both datasets.

        Args:
            config (MetaEvalBase.Config): Forwarded unchanged to
                ``MetaEvalBase.__init__``.
        '''
        super().__init__(config)
        self.system_data = self.load_system_data()
        self.sentence_data = self.load_sentence_data()

    @staticmethod
    def _find_data_dir(pattern: str) -> str:
        '''Return the first directory matching a recursive glob pattern.

        Args:
            pattern (str): Glob pattern, resolved relative to the current
                working directory.

        Returns:
            str: The first match reported by ``glob.glob``.

        Raises:
            IndexError: If nothing matches. The exception type is the one the
                previous ``glob.glob(...)[0]`` expression produced; only the
                message is new.
        '''
        matches = glob.glob(pattern, recursive=True)
        if not matches:
            raise IndexError(
                f'No directory matching "{pattern}" was found below '
                f'"{os.getcwd()}". Make sure the meta-evaluation data is '
                'available under the current working directory.'
            )
        return matches[0]

    @staticmethod
    def _parse_score_table(rows: list[str]) -> tuple[list[str], list[float]]:
        '''Split "score rank system" rows into system names and scores.

        Args:
            rows (list[str]): Rows formatted as ``"<score> <rank> <system>"``.

        Returns:
            tuple[list[str], list[float]]: System names and their scores,
            both in row order.
        '''
        models = []
        scores = []
        for row in rows:
            score, _rank, model = row.split()
            models.append(model)
            scores.append(float(score))
        return models, scores

    @staticmethod
    def _read_conll14(
        data_dir: str, models: list[str]
    ) -> tuple[list[list[str]], list[str], list[list[str]]]:
        '''Read hypotheses, sources and references from the conll14 directory.

        Args:
            data_dir (str): Path to the ``conll14`` data directory.
            models (list[str]): File names under ``official_submissions`` to
                read as hypotheses, in this order.

        Returns:
            tuple: ``(hypotheses, sources, references)`` where ``hypotheses``
            has one list of lines per model, ``sources`` are the lines of the
            ``INPUT`` submission and ``references`` is ``[REF0, REF1]``.
        '''
        submission_dir = os.path.join(data_dir, 'official_submissions')
        hypotheses = [
            read_lines(os.path.join(submission_dir, model)) for model in models
        ]
        sources = read_lines(os.path.join(submission_dir, 'INPUT'))
        references = [
            read_lines(os.path.join(data_dir, 'REF0')),
            read_lines(os.path.join(data_dir, 'REF1')),
        ]
        return hypotheses, sources, references

    def load_system_data(self) -> dict[str, list]:
        '''Load system-level meta-evaluation data.

        Returns:
            dict[str, list]: The meta-evaluation data containing the
            following keys:

            - "sources": Source sentences.
              The shape is (num_sentences, ).
            - "hypotheses": Hypotheses sentences.
              The shape is (num_systems, num_sentences).
            - "references": Reference sentences.
              The shape is (num_references, num_sentences).
            - "models": The model names, in Expected Wins ranking order.
              This index corresponds to the first dimension of "hypotheses".
            - "human_score": Human scores for the systems.
              Each entry has the shape (num_systems, ).

              - "ew" is human Expected Wins scores.
              - "ts" is human TrueSkill scores, reordered to follow "models".

        Raises:
            IndexError: If the ``meta_eval_data/conll14/`` directory cannot
                be found below the current working directory.
        '''
        # Expected Wins scores
        # Table 3 (b) https://aclanthology.org/D15-1052.pdf
        ew_table = [
            '0.628 1 AMU',
            '0.566 2-3 RAC',
            '0.561 2-4 CAMB',
            '0.550 3-5 CUUI',
            '0.539 4-5 POST',
            '0.513 6-8 UFC',
            '0.506 6-8 PKU',
            '0.495 7-9 UMC',
            '0.485 7-10 IITB',
            '0.463 10-11 SJTU',
            '0.456 9-12 INPUT',
            '0.437 11-12 NTHU',
            '0.300 13 IPN',
        ]
        # TrueSkill scores
        # Table 3 (c) https://aclanthology.org/D15-1052.pdf
        ts_table = [
            '0.273 1 AMU',
            '0.182 2 CAMB',
            '0.114 3-4 RAC',
            '0.105 3-5 CUUI',
            '0.080 4-5 POST',
            '-0.001 6-7 PKU',
            '-0.022 6-8 UMC',
            '-0.041 7-10 UFC',
            '-0.055 8-11 IITB',
            '-0.062 8-11 INPUT',
            '-0.074 9-11 SJTU',
            '-0.142 12 NTHU',
            '-0.358 13 IPN',
        ]
        ew_models, ew_scores = self._parse_score_table(ew_table)
        ts_models, ts_scores = self._parse_score_table(ts_table)
        # The two tables rank the systems differently, so align the TrueSkill
        # scores with the Expected Wins system order used for "models".
        ts_by_model = dict(zip(ts_models, ts_scores))
        ts_scores_reorder = [ts_by_model[model] for model in ew_models]

        data_dir = self._find_data_dir('**/meta_eval_data/conll14/')
        hypotheses, sources, references = self._read_conll14(data_dir, ew_models)
        return {
            'hypotheses': hypotheses,
            'references': references,
            'human_score': {
                'ew': ew_scores,
                'ts': ts_scores_reorder,
            },
            'models': ew_models,
            'sources': sources,
        }

    def load_xml(
        self, xml_path: str, target_models: list[str]
    ) -> list[tuple[int, list[list[int]]]]:
        '''Load sentence-level human rankings from a XML file.

        Args:
            xml_path (str): Path to a XML file.
            target_models (list[str]): Model names to be evaluated.
                Systems that are not listed here are ignored.

        Returns:
            list[tuple[int, list[list[int]]]]: ``(src_id, annotations)`` pairs
            sorted by ``src_id``. ``annotations[annotation_id][system_id]`` is
            the *minus* rank of the system, so higher values are higher
            quality, or ``None`` when that annotation does not rank the
            system. ``system_id`` follows the order of ``target_models``.
        '''
        root = ET.parse(xml_path).getroot()
        # Resolve the column of each model once; the first occurrence wins,
        # which matches what ``list.index`` would return.
        model_index = {}
        for idx, name in enumerate(target_models):
            model_index.setdefault(name, idx)

        human_scores = dict()
        for item in root.find('error-correction-ranking-result'):
            src_id = int(item.attrib['src-id'])
            scores = [None] * len(target_models)
            for trans in item:
                rank = int(trans.attrib['rank'])
                # Several systems can share one rank (space-separated names).
                for system in trans.attrib['system'].split():
                    idx = model_index.get(system)
                    if idx is None:
                        continue
                    # Put the minus ranking as the score
                    scores[idx] = -rank
            human_scores.setdefault(src_id, []).append(scores)
        return sorted(human_scores.items(), key=lambda x: x[0])

    def load_sentence_data(self) -> dict[str, list]:
        '''Load sentence-level meta-evaluation data.

        Only the sentences that have at least one human judgment are kept;
        they are selected by using the ``src-id`` values of the XML file as
        indices into the corpus files.

        Returns:
            dict[str, list]: The meta-evaluation data containing the
            following keys:

            - "sources": Source sentences.
              The shape is (num_sentences, ).
            - "hypotheses": Hypotheses sentences.
              The shape is (num_systems, num_sentences).
            - "references": Reference sentences.
              The shape is (num_references, num_sentences).
            - "models": The model names.
              This index corresponds to the first dimension of "hypotheses".
            - "human_score": Human scores for the systems.

              - "sent" is the minus human rankings. The shape is
                (num_sentences, num_annotations, num_systems), where
                num_annotations can differ per sentence and unranked
                systems are ``None``.

        Raises:
            IndexError: If the ``meta_eval_data/conll14/`` or
                ``meta_eval_data/GJG15/`` directory cannot be found below
                the current working directory.
        '''
        data_dir = self._find_data_dir('**/meta_eval_data/conll14/')
        score_dir = self._find_data_dir('**/meta_eval_data/GJG15/')

        judgments = self.load_xml(
            os.path.join(score_dir, 'judgments.xml'), self.MODELS
        )
        src_ids = [src_id for src_id, _ in judgments]
        # The ['sent'] key is a dummy label to adapt the interface to SEEDA.
        human_score = {'sent': [scores for _, scores in judgments]}

        hypotheses, sources, references = self._read_conll14(
            data_dir, self.MODELS
        )
        return {
            'hypotheses': [[sents[i] for i in src_ids] for sents in hypotheses],
            'references': [[ref[i] for i in src_ids] for ref in references],
            'human_score': human_score,
            # Copy so that callers mutating the data cannot alter the
            # class-level constant.
            'models': list(self.MODELS),
            'sources': [sources[i] for i in src_ids],
        }

    def corr_system(
        self,
        metric: MetricBase,
        aggregation='default'
    ) -> "GJGSystemCorrOutput":
        '''Compute system-level correlations.

        Args:
            metric (MetricBase): The metric to be evaluated.
            aggregation: Forwarded unchanged to
                ``MetaEvalBase.corr_system``.

        Returns:
            GJGSystemCorrOutput: The correlations. The first and second
            results of the base class are stored as ``ew`` and ``ts``.
        '''
        corrs = super().corr_system(metric, aggregation=aggregation)
        return self.GJGSystemCorrOutput(
            ew=corrs[0],
            ts=corrs[1]
        )

    def corr_sentence(self, metric: MetricBase) -> "GJGSentenceCorrOutput":
        '''Compute sentence-level correlations.

        Args:
            metric (MetricBase): The metric to be evaluated.

        Returns:
            GJGSentenceCorrOutput: The correlations. The first result of the
            base class is stored as ``corr``.
        '''
        corrs = super().corr_sentence(metric)
        return self.GJGSentenceCorrOutput(
            corr=corrs[0]
        )

    def window_analysis_system(
        self,
        metric: MetricBase,
        window: int = 4,
        aggregation='default'
    ) -> "GJGWindowAnalysisSystemCorrOutput":
        '''Run the system-level window analysis.

        Args:
            metric (MetricBase): The metric to be evaluated.
            window (int): Forwarded unchanged to
                ``MetaEvalBase.window_analysis_system``
                (presumably the number of systems per window; confirm
                against the base class).
            aggregation: Forwarded unchanged to
                ``MetaEvalBase.window_analysis_system``.

        Returns:
            GJGWindowAnalysisSystemCorrOutput: The first and second results
            of the base class, stored as ``ew`` and ``ts``.
        '''
        corrs = super().window_analysis_system(metric, window, aggregation)
        return self.GJGWindowAnalysisSystemCorrOutput(
            ew=corrs[0],
            ts=corrs[1],
        )