Source code for gec_metrics.meta_eval.gjg

import glob
import os
from gec_metrics.metrics.base import MetricBase
from scipy.stats import pearsonr, spearmanr
from dataclasses import dataclass
from .base import MetaEvalBase
import xml.etree.ElementTree as ET
import itertools
from .utils import read_lines


[docs]
class MetaEvalGJG(MetaEvalBase):
    # System names used for the sentence-level data. load_sentence_data() passes
    # this list to load_xml(), so a system's position here is its column in
    # every sentence-level ranking, and its row in 'hypotheses'.
    MODELS = ['AMU', 'RAC', 'CAMB', 'CUUI', 'POST', 'UFC', 'PKU', 'UMC', 'IITB', 'SJTU', 'INPUT', 'NTHU', 'IPN']
    # Identifiers of the two system-level human scores: 'ew' (Expected Wins)
    # and 'ts' (TrueSkill). Not read anywhere in this class itself;
    # presumably consumed by MetaEvalBase -- TODO confirm.
    SCORE_ID = ['ew', 'ts']

[docs]
    @dataclass
    class GJGSystemCorrOutput(MetaEvalBase.Output):
        '''The dataclass to store the system-level meta-evaluation results.

        Args:
            ew (MetaEvalBase.Corr):
                The correlation using Expected Wins-based human evaluation.
            ts (MetaEvalBase.Corr):
                The correlation using TrueSkill-based human evaluation.
        '''
        ew: MetaEvalBase.Corr = None
        ts: MetaEvalBase.Corr = None



[docs]
    @dataclass
    class GJGSentenceCorrOutput(MetaEvalBase.Output):
        '''The dataclass to store the sentence-level meta-evaluation results.

        Args:
            corr (MetaEvalBase.Corr):
                The correlation using the sentence-level human rankings.
                There is a single field because the sentence-level data
                carries only one set of human scores.
        '''
        corr: MetaEvalBase.Corr = None



[docs]
    @dataclass
    class GJGWindowAnalysisSystemCorrOutput(MetaEvalBase.Output):
        '''The dataclass to store the system-level window-analysis results.

        The layout of each dict is whatever
        ``MetaEvalBase.window_analysis_system`` produces; that method is not
        visible in this file.

        Args:
            ew (dict):
                The window-analysis result using Expected Wins-based human evaluation.
            ts (dict):
                The window-analysis result using TrueSkill-based human evaluation.
        '''
        ew: dict = None
        ts: dict = None


    def __init__(self, config: MetaEvalBase.Config = None):
        '''Initialize the base class, then eagerly load both datasets.

        Args:
            config (MetaEvalBase.Config): Forwarded unchanged to
                ``MetaEvalBase.__init__``. Defaults to None.
        '''
        super().__init__(config)
        # Both loaders find their files with a recursive glob and take the
        # first match, so constructing this object reads from disk and raises
        # IndexError when the meta-evaluation data cannot be found.
        self.system_data = self.load_system_data()
        self.sentence_data = self.load_sentence_data()


[docs]
    def load_system_data(self) -> dict[str, list]:
        '''Load system-level meta-evaluation data.
        
        Returns:
            dict[str, list]: The meta-evaluation data contianing the following keys:
                - "sources": Source sentences. The shape is (num_sentences, ).
                - "hypotheses": Hypotheses sentences. The shape is (num_systems, num_sentences).
                - "references": Reference sentences. The shape is (num_references, num_sentences).
                - "models": The model names. This index corresponds to the first dimension of "hypotheses".
                - "human_scores": Human scores for the systems. The shape is (num_systems, )
                    - "ew" is human Expected Wins scores.
                    - "ts" is human TrueSkill scores.
        '''
        # Expected Wins scores
        # Table 3 (b) https://aclanthology.org/D15-1052.pdf
        ew_table = '''0.628 1 AMU
0.566 2-3 RAC
0.561 2-4 CAMB
0.550 3-5 CUUI
0.539 4-5 POST
0.513 6-8 UFC
0.506 6-8 PKU
0.495 7-9 UMC
0.485 7-10 IITB
0.463 10-11 SJTU
0.456 9-12 INPUT
0.437 11-12 NTHU
0.300 13 IPN'''.split('\n')
        # TrueSkill scores
        # Table 3 (c) https://aclanthology.org/D15-1052.pdf
        ts_table = '''0.273 1 AMU
0.182 2 CAMB
0.114 3-4 RAC
0.105 3-5 CUUI
0.080 4-5 POST
-0.001 6-7 PKU
-0.022 6-8 UMC
-0.041 7-10 UFC
-0.055 8-11 IITB
-0.062 8-11 INPUT
-0.074 9-11 SJTU
-0.142 12 NTHU
-0.358 13 IPN'''.split('\n')
        
        ew_models = [line.split(' ')[2] for line in ew_table]
        ew_scores = [float(line.split(' ')[0]) for line in ew_table]
        ts_models = [line.split(' ')[2] for line in ts_table]
        ts_scores = [float(line.split(' ')[0]) for line in ts_table]
        ts_scores_reorder = [ts_scores[ts_models.index(m)] for m in ew_models]
        data_dir = glob.glob('**/meta_eval_data/conll14/', recursive=True)[0]
        data = {
            'hypotheses': [],
            'references': [],
            'human_score': {
                'ew': ew_scores,
                'ts': ts_scores_reorder
            },
            'models': ew_models,
            'sources': []
        }
        sentences = []
        for model in ew_models:
            sents = read_lines(os.path.join(data_dir, 'official_submissions', model))
            sentences.append(sents)
        data['hypotheses'] = sentences
        input_sents = read_lines(os.path.join(data_dir, 'official_submissions', 'INPUT'))
        data['sources'] = input_sents
        ref0 = read_lines(os.path.join(data_dir, 'REF0'))
        ref1 = read_lines(os.path.join(data_dir, 'REF1'))
        data['references'] = [ref0, ref1]
        return data

    

[docs]
    def load_xml(self, xml_path: str, target_models: list[str]) -> "list[tuple[int, list[list[int | None]]]]":
        '''Load sentence-level human rankings from a XML file.

        Every child of the ``error-correction-ranking-result`` element is one
        human judgment of one source sentence. Each of its children carries a
        ``system`` attribute (whitespace-separated system names that share a
        rank) and a ``rank`` attribute.

        Args:
            xml_path (str): Path to a XML file.
            target_models (list[str]): Model names to be evaluated. A model's
                position in this list is its column in every returned ranking;
                systems not in the list are ignored.

        Returns:
            list[tuple[int, list[list[int | None]]]]:
                ``(src_id, judgments)`` pairs sorted by ``src_id``.
                ``judgments`` holds one list per judgment of that source, in
                file order, so ``judgments[judgment_id][system_id] = -rank``.
                Note that each element is *minus* rank, so higher values are
                higher quality. The element is ``None`` when the model does
                not appear in that judgment.
        '''
        # Column of every target model. setdefault keeps the first occurrence,
        # matching what list.index() would return for a repeated name.
        column_of = {}
        for column, name in enumerate(target_models):
            column_of.setdefault(name, column)

        root = ET.parse(xml_path).getroot()
        judgments_by_src = {}
        for judgment in root.find('error-correction-ranking-result'):
            src_id = int(judgment.attrib['src-id'])
            scores = [None] * len(target_models)
            for entry in judgment:
                systems = entry.attrib['system'].split()
                # Put the minus ranking as the score
                neg_rank = -int(entry.attrib['rank'])
                for system in systems:
                    column = column_of.get(system)
                    if column is not None:
                        scores[column] = neg_rank
            judgments_by_src.setdefault(src_id, []).append(scores)
        # Callers unpack (src_id, judgments) pairs, so return the sorted items
        # rather than the dict itself.
        return sorted(judgments_by_src.items(), key=lambda pair: pair[0])

    

[docs]
    def load_sentence_data(self) -> dict[str, list]:
        '''Build the sentence-level meta-evaluation data.

        Human rankings are read from ``judgments.xml`` in the first directory
        matching ``**/meta_eval_data/GJG15/``. Sentences come from the first
        directory matching ``**/meta_eval_data/conll14/`` and are restricted to
        the judged sources; each source id is used directly as a line index.

        Returns:
            dict[str, list]: The meta-evaluation data with the following keys:
                - "sources": Source sentences, one per judged source id.
                - "hypotheses": One list of sentences per system, in ``MODELS`` order.
                - "references": Two lists of reference sentences (REF0, then REF1).
                - "models": The model names (``MODELS``). This index corresponds
                    to the first dimension of "hypotheses".
                - "human_score": A dict with the single key "sent".
                    - "sent" holds, per judged source (ascending source id),
                        one list per judgment, each with one entry per
                        system: minus rank, or None if the system was not
                        ranked in that judgment.
        '''
        # NOTE(review): both patterns are relative, so the lookups depend on
        # the current working directory; no match raises IndexError.
        conll14_dir = glob.glob('**/meta_eval_data/conll14/', recursive=True)[0]
        gjg15_dir = glob.glob('**/meta_eval_data/GJG15/', recursive=True)[0]

        judgments = self.load_xml(os.path.join(gjg15_dir, 'judgments.xml'), self.MODELS)
        src_ids = [pair[0] for pair in judgments]
        rankings = [pair[1] for pair in judgments]

        def keep_judged(lines):
            # Keep only the sentences that have human judgments, in src_ids order.
            return [lines[idx] for idx in src_ids]

        submissions_dir = os.path.join(conll14_dir, 'official_submissions')
        hypotheses = [
            keep_judged(read_lines(os.path.join(submissions_dir, model)))
            for model in self.MODELS
        ]
        sources = keep_judged(read_lines(os.path.join(submissions_dir, 'INPUT')))
        ref0 = read_lines(os.path.join(conll14_dir, 'REF0'))
        ref1 = read_lines(os.path.join(conll14_dir, 'REF1'))

        return {
            'hypotheses': hypotheses,
            'references': [keep_judged(ref0), keep_judged(ref1)],
            # The single 'sent' key is a dummy label that keeps this layout
            # parallel to the SEEDA meta-evaluation interface (per the
            # original author's note; that interface is not visible here).
            'human_score': {'sent': rankings},
            'models': self.MODELS,
            'sources': sources,
        }

    

[docs]
    def corr_system(
        self,
        metric: MetricBase,
        aggregation='default'
    ) -> "GJGSystemCorrOutput":
        '''Compute system-level correlations.

        Delegates the computation to ``MetaEvalBase.corr_system`` and labels
        its first two results.

        Args:
            metric (MetricBase): The metric to be evaluated.
            aggregation: Forwarded unchanged to ``MetaEvalBase.corr_system``;
                the accepted values are defined there (not visible in this
                file). Defaults to 'default'.

        Returns:
            GJGSystemCorrOutput: The correlations, with the first base-class
                result stored as ``ew`` and the second as ``ts``.
        '''
        corrs = super().corr_system(metric, aggregation=aggregation)
        # NOTE(review): assumes the base class returns results in
        # ['ew', 'ts'] order (cf. SCORE_ID) -- confirm against MetaEvalBase.
        return self.GJGSystemCorrOutput(
            ew=corrs[0],
            ts=corrs[1]
        )

    

[docs]
    def corr_sentence(self, metric: MetricBase) -> "GJGSentenceCorrOutput":
        '''Compute sentence-level correlations.

        Delegates the computation to ``MetaEvalBase.corr_sentence`` and keeps
        its first result.

        Args:
            metric (MetricBase): The metric to be evaluated.

        Returns:
            GJGSentenceCorrOutput: The correlations, stored in ``corr``.
        '''
        corrs = super().corr_sentence(metric)
        # Only the first result is kept; the sentence-level data built by
        # load_sentence_data() has a single human-score entry ('sent').
        return self.GJGSentenceCorrOutput(
            corr=corrs[0]
        )

    

[docs]
    def window_analysis_system(
        self,
        metric: MetricBase,
        window: int = 4,
        aggregation='default'
    ) -> "GJGWindowAnalysisSystemCorrOutput":
        '''Run the system-level window analysis.

        Delegates the computation to ``MetaEvalBase.window_analysis_system``
        and labels its first two results.

        Args:
            metric (MetricBase): The metric to be evaluated.
            window (int): Forwarded unchanged to the base class. Defaults to 4.
                Presumably the number of systems per window -- TODO confirm
                against MetaEvalBase.
            aggregation: Forwarded unchanged to the base class; the accepted
                values are defined there (not visible in this file).
                Defaults to 'default'.

        Returns:
            GJGWindowAnalysisSystemCorrOutput: The analysis results, with the
                first base-class result stored as ``ew`` and the second as
                ``ts``.
        '''
        corrs = super().window_analysis_system(metric, window, aggregation)
        # NOTE(review): assumes the base class returns results in
        # ['ew', 'ts'] order (cf. SCORE_ID) -- confirm against MetaEvalBase.
        return self.GJGWindowAnalysisSystemCorrOutput(
            ew=corrs[0],
            ts=corrs[1],
        )