Source code for gec_metrics.meta_eval.seeda

import argparse
import glob
import os
from scipy.stats import pearsonr, spearmanr
from dataclasses import dataclass
import itertools
from .base import MetaEvalBase
from gec_metrics.metrics import MetricBase, inputs_handler
import xml.etree.ElementTree as ET
import numpy as np
from .utils import read_lines

[docs] class MetaEvalSEEDA(MetaEvalBase): MODELS = [ 'BART', 'BERT-fuse', 'GECToR-BERT', 'GECToR-ens', 'GPT-3.5', 'INPUT', 'LM-Critic', 'PIE', 'REF-F', 'REF-M', 'Riken-Tohoku', 'T5', 'TemplateGEC', 'TransGEC', 'UEDIN-MS' ] SCORE_ID = ['EW_edit', 'EW_sent', 'TS_edit', 'TS_sent']
[docs] @dataclass class SEEDASystemCorrOutput(MetaEvalBase.Output): '''The dataclass to store system-level correlations. Args: ew_sent (MetaEvalBase.Corr): SEEDA-S correlation based on Expected Wins-based human evaluation. ew_edit (MetaEvalBase.Corr): SEEDA-E correlation based on Expected Wins-based human evaluation. ts_sent (MetaEvalBase.Corr): SEEDA-S correlation based on TrueSkill-based human evaluation. ts_edit (MetaEvalBase.Corr): SEEDA-E correlation based on TrueSkill-based human evaluation. ''' ew_edit: MetaEvalBase.Corr = None ew_sent: MetaEvalBase.Corr = None ts_edit: MetaEvalBase.Corr = None ts_sent: MetaEvalBase.Corr = None
[docs] @dataclass class SEEDAWindowAnalysisSystemCorrOutput(MetaEvalBase.Output): '''The dataclass to store system-level correlations. Args: ew_sent (MetaEvalBase.Corr): SEEDA-S correlation based on Expected Wins-based human evaluation. ew_edit (MetaEvalBase.Corr): SEEDA-E correlation based on Expected Wins-based human evaluation. ts_sent (MetaEvalBase.Corr): SEEDA-S correlation based on TrueSkill-based human evaluation. ts_edit (MetaEvalBase.Corr): SEEDA-E correlation based on TrueSkill-based human evaluation. ''' ew_edit: dict = None ew_sent: dict = None ts_edit: dict = None ts_sent: dict = None
[docs] @dataclass class SEEDASentenceCorrOutput(MetaEvalBase.Output): '''The dataclass to store sentence-level correlations. Args: sent (MetaEvalBase.Corr): SEEDA-S sentence-level correlation. edit (MetaEvalBase.Corr): SEEDA-E sentence-level correlation. ''' sent: MetaEvalBase.Corr = None edit: MetaEvalBase.Corr = None
[docs] @dataclass class Config: system: str = 'base'
def __init__(self, config: MetaEvalBase.Config = None): super().__init__(config) self.system_data = self.load_system_data() self.sentence_data = self.load_sentence_data()
[docs] def load_system_data(self) -> dict[str, list]: '''Load system-level meta-evaluation data. Returns: dict[str, list]: The meta-evaluation data contianing the following keys: - "sources": Source sentences. The shape is (num_sentences, ). - "hypotheses": Hypotheses sentences. The shape is (num_systems, num_sentences). - "references": Reference sentences. The shape is (num_references, num_sentences). - "models": The model names. This index corresponds to the first dimension of "hypotheses". - "human_scores": Dictionary of Human scores. The shape is (num_systems, ). - "EW_edit": Expected Wins scores using edit-based human evaluation. - "EW_sent": Expected Wins scores using sentence-based human evaluation. - "TS_edit": TrueSkill scores using edit-based human evaluation. - "TS_sent": TrueSkill scores using sentence-based human evaluation. ''' subset_dir = glob.glob('**/SEEDA/outputs/subset', recursive=True)[0] del_systems = { 'base': ['INPUT', 'REF-F', 'GPT-3.5'], '+INPUT': ['REF-F', 'GPT-3.5'], '+REF-F_GPT-3.5': ['INPUT'], '+fluency': ['INPUT'], # an alias 'all': [] }[self.config.system] models = [m for m in self.MODELS if m not in del_systems] data = { 'hypotheses': [], 'references': [], 'human_score': dict(), 'models': models, 'del_models': del_systems, 'sources': [] } for model in models: sents = read_lines(os.path.join(subset_dir, model + '.txt')) data['hypotheses'].append(sents) score_dir = glob.glob('**/SEEDA/scores/human', recursive=True)[0] for score_id in self.SCORE_ID: scores = list(map(float, read_lines( os.path.join(score_dir, score_id + '.txt') ))) scores = [s for i, s in enumerate(scores) if self.MODELS[i] not in del_systems] data['human_score'][score_id] = scores data['sources'] = read_lines(os.path.join(subset_dir, 'INPUT.txt')) ref0 = read_lines(os.path.join(subset_dir, 'REF0.txt')) ref1 = read_lines(os.path.join(subset_dir, 'REF1.txt')) data['references'] = [ref0, ref1] return data
[docs] def load_xml(self, xml_path: str, target_models: list[str]) -> dict[str, list[list[int]]]: '''Load a XML file. Args: xml_path (str): Path to a XML file. target_models (list[str]): Model names to be evaluated. Returns: dict[int, list[list[int]]]: Dictionary containing sentence-level human evaluation rankings. The data is stored for each source and annotator. You can refer to the ranking by dict[src_id][annotator_id][system_id] = -rank. Note that each element is *minus* rank, so higher values are higher quality. ''' tree = ET.parse(xml_path) root = tree.getroot() human_scores = dict() for child in root.find('error-correction-ranking-result'): src_id = int(child.attrib['src-id']) human_scores[src_id] = human_scores.get( src_id, [] ) scores = [None] * len(target_models) for trans in child: systems = trans.attrib['system'].split() rank = int(trans.attrib['rank']) for sys in systems: if sys not in target_models: continue # Put the minus ranking as a score scores[target_models.index(sys)] = -rank human_scores[src_id].append(scores) # Sort by source id. human_scores = sorted(human_scores.items(), key=lambda x:x[0]) human_scores = [h[1] for h in human_scores] return human_scores
[docs] def load_sentence_data(self) -> dict[str, int]: '''Load sentence-level meta-evaluation data. Returns: dict[str, list]: The meta-evaluation data contianing the following keys: - "sources": Source sentences. The shape is (num_sentences, ). - "hypotheses": Hypotheses sentences. The shape is (num_systems, num_sentences). - "references": Reference sentences. The shape is (num_references, num_sentences). - "models": The model names. This index corresponds to the first dimension of "hypotheses". - "human_scores": Dictionary of Human scores for the systems. The shape is (num_sentences, num_systems, num_systems). - "EW_edit": Expected Wins scores using edit-based human evaluation. - "EW_sent": Expected Wins scores using sentence-based human evaluation. - "TS_edit": TrueSkill scores using edit-based human evaluation. - "TS_sent": TrueSkill scores using sentence-based human evaluation. ''' subset_dir = glob.glob('**/SEEDA/outputs/subset/', recursive=True)[0] data_dir = glob.glob('**/SEEDA/data/', recursive=True)[0] del_systems = { 'base': ['INPUT', 'REF-F', 'GPT-3.5'], '+INPUT': ['REF-F', 'GPT-3.5'], '+REF-F_GPT-3.5': ['INPUT'], '+fluency': ['INPUT'], # an alias 'all': [] }[self.config.system] del_systems += ['REF0', 'REF1'] models = [m for m in self.MODELS if m not in del_systems] data = { 'hypotheses': [], 'human_score': dict(), 'human_score_paths': dict(), 'models': models, 'del_models': del_systems, 'sources': [] } data['human_score']['edit'] = self.load_xml( data_dir + 'judgments_edit.xml', models ) data['human_score']['sent'] = self.load_xml( data_dir + 'judgments_sent.xml', models ) for model in models: sents = read_lines(os.path.join(subset_dir, model + '.txt')) data['hypotheses'].append(sents) input_sents = read_lines(os.path.join(subset_dir, 'INPUT.txt')) data['sources'] = input_sents ref0 = read_lines(os.path.join(subset_dir, 'REF0.txt')) ref1 = read_lines(os.path.join(subset_dir, 'REF1.txt')) data['references'] = [ref0, ref1] return data
[docs] def corr_system( self, metric: MetricBase, aggregation='default' )-> "SEEDASystemCorrOutput": '''Compute system-level correlations. Args: metric (MetricBase): The metric to be evaluated. Returns: SEEDASystemCorrOutput: The correlations. ''' corrs = super().corr_system(metric, aggregation=aggregation) return self.SEEDASystemCorrOutput( ew_edit=corrs[0], ew_sent=corrs[1], ts_edit=corrs[2], ts_sent=corrs[3] )
[docs] def corr_sentence( self, metric: MetricBase ) -> "SEEDASentenceCorrOutput": '''Compute sentence-level correlations. Args: metric (MetricBase): The metric to be evaluated. Returns: SEEDASentenceCorrOutput: The correlations. ''' corrs = super().corr_sentence(metric) return self.SEEDASentenceCorrOutput( edit=corrs[0], sent=corrs[1] )
[docs] def window_analysis_system( self, metric: MetricBase, window: int = 4, aggregation='default' ) -> "SEEDAWindowAnalysisSystemCorrOutput": corrs = super().window_analysis_system(metric, window, aggregation) return self.SEEDAWindowAnalysisSystemCorrOutput( ew_edit=corrs[0], ew_sent=corrs[1], ts_edit=corrs[2], ts_sent=corrs[3], )