# Source code for gec_metrics.meta_eval.seeda
import argparse
import glob
import os
from scipy.stats import pearsonr, spearmanr
from dataclasses import dataclass
import itertools
from .base import MetaEvalBase
from gec_metrics.metrics import MetricBase, inputs_handler
import xml.etree.ElementTree as ET
import numpy as np
from .utils import read_lines
[docs]
class MetaEvalSEEDA(MetaEvalBase):
    '''Meta-evaluation of GEC metrics against the SEEDA human judgments.

    Attributes:
        MODELS (list[str]): Names of all systems in the dataset. The order
            is significant: line ``i`` of each human score file is paired
            with ``MODELS[i]`` when system-level data is loaded.
        SCORE_ID (list[str]): Identifiers of the system-level human scores
            (Expected Wins / TrueSkill, edit- / sentence-based). Each one is
            also the stem of the score file that is read for it.
    '''

    # Do not reorder: human score files are aligned with this list by index.
    MODELS = [
        'BART', 'BERT-fuse', 'GECToR-BERT', 'GECToR-ens', 'GPT-3.5',
        'INPUT', 'LM-Critic', 'PIE', 'REF-F', 'REF-M',
        'Riken-Tohoku', 'T5', 'TemplateGEC', 'TransGEC', 'UEDIN-MS',
    ]

    # EW = Expected Wins, TS = TrueSkill; suffix = edit- or sentence-based.
    SCORE_ID = ['EW_edit', 'EW_sent', 'TS_edit', 'TS_sent']
[docs]
@dataclass
class SEEDASystemCorrOutput(MetaEvalBase.Output):
    '''Container for the four system-level correlations.

    Every field defaults to None. The fields are listed below in
    declaration order, which is also the positional constructor order.

    Args:
        ew_edit (MetaEvalBase.Corr):
            SEEDA-E correlation based on Expected Wins-based human evaluation.
        ew_sent (MetaEvalBase.Corr):
            SEEDA-S correlation based on Expected Wins-based human evaluation.
        ts_edit (MetaEvalBase.Corr):
            SEEDA-E correlation based on TrueSkill-based human evaluation.
        ts_sent (MetaEvalBase.Corr):
            SEEDA-S correlation based on TrueSkill-based human evaluation.
    '''

    ew_edit: MetaEvalBase.Corr = None
    ew_sent: MetaEvalBase.Corr = None
    ts_edit: MetaEvalBase.Corr = None
    ts_sent: MetaEvalBase.Corr = None
[docs]
@dataclass
class SEEDAWindowAnalysisSystemCorrOutput(MetaEvalBase.Output):
    '''Container for the system-level window-analysis results.

    Unlike the plain system-level output, every field here is annotated as
    a dict. The layout of each dict is whatever
    ``MetaEvalBase.window_analysis_system`` produces; it is not defined in
    this class. Every field defaults to None.

    Args:
        ew_edit (dict):
            SEEDA-E result based on Expected Wins-based human evaluation.
        ew_sent (dict):
            SEEDA-S result based on Expected Wins-based human evaluation.
        ts_edit (dict):
            SEEDA-E result based on TrueSkill-based human evaluation.
        ts_sent (dict):
            SEEDA-S result based on TrueSkill-based human evaluation.
    '''

    ew_edit: dict = None
    ew_sent: dict = None
    ts_edit: dict = None
    ts_sent: dict = None
[docs]
@dataclass
class SEEDASentenceCorrOutput(MetaEvalBase.Output):
    '''Container for the two sentence-level correlations.

    Both fields default to None. They are listed in declaration order,
    which is also the positional constructor order.

    Args:
        sent (MetaEvalBase.Corr):
            SEEDA-S sentence-level correlation.
        edit (MetaEvalBase.Corr):
            SEEDA-E sentence-level correlation.
    '''

    sent: MetaEvalBase.Corr = None
    edit: MetaEvalBase.Corr = None
def __init__(self, config: MetaEvalBase.Config = None):
    '''Initialise the base class and eagerly load both SEEDA datasets.

    Args:
        config (MetaEvalBase.Config): Passed unchanged to
            ``MetaEvalBase.__init__``. Defaults to None.
    '''
    super().__init__(config)
    # The loaders read self.config.system, so they run only after the base
    # initialiser (presumably the place where self.config is set).
    self.system_data = self.load_system_data()
    self.sentence_data = self.load_sentence_data()
[docs]
def load_system_data(self) -> dict[str, list]:
    '''Load system-level meta-evaluation data.

    The SEEDA files are located with recursive globs relative to the
    current working directory; the first match of each pattern is used.
    ``self.config.system`` selects which systems are excluded.

    Returns:
        dict[str, list]: The meta-evaluation data containing the following keys:
            - "sources": Source sentences (``INPUT.txt``). The shape is (num_sentences, ).
            - "hypotheses": Hypotheses sentences. The shape is (num_systems, num_sentences).
            - "references": Reference sentences (``REF0.txt``, ``REF1.txt``). The shape is (num_references, num_sentences).
            - "models": The kept model names. This index corresponds to the first dimension of "hypotheses".
            - "del_models": The model names excluded by ``self.config.system``.
            - "human_score": Dictionary of human scores, keyed by ``SCORE_ID``. Each value has the shape (num_systems, ).
                - "EW_edit": Expected Wins scores using edit-based human evaluation.
                - "EW_sent": Expected Wins scores using sentence-based human evaluation.
                - "TS_edit": TrueSkill scores using edit-based human evaluation.
                - "TS_sent": TrueSkill scores using sentence-based human evaluation.

    Raises:
        FileNotFoundError: If the SEEDA output or human score directory
            cannot be found below the current working directory.
        KeyError: If ``self.config.system`` is not a known system set.
        ValueError: If a human score file does not hold exactly one score
            per entry of ``MODELS``.
    '''
    subset_pattern = '**/SEEDA/outputs/subset'
    subset_matches = glob.glob(subset_pattern, recursive=True)
    if not subset_matches:
        raise FileNotFoundError(
            f'SEEDA outputs not found: no directory matches "{subset_pattern}" '
            f'below {os.getcwd()}.'
        )
    subset_dir = subset_matches[0]

    del_systems = {
        'base': ['INPUT', 'REF-F', 'GPT-3.5'],
        '+INPUT': ['REF-F', 'GPT-3.5'],
        '+REF-F_GPT-3.5': ['INPUT'],
        '+fluency': ['INPUT'],  # an alias
        'all': []
    }[self.config.system]
    models = [m for m in self.MODELS if m not in del_systems]

    data = {
        'hypotheses': [
            read_lines(os.path.join(subset_dir, model + '.txt'))
            for model in models
        ],
        'references': [],
        'human_score': dict(),
        'models': models,
        'del_models': del_systems,
        'sources': []
    }

    score_pattern = '**/SEEDA/scores/human'
    score_matches = glob.glob(score_pattern, recursive=True)
    if not score_matches:
        raise FileNotFoundError(
            f'SEEDA human scores not found: no directory matches '
            f'"{score_pattern}" below {os.getcwd()}.'
        )
    score_dir = score_matches[0]

    for score_id in self.SCORE_ID:
        score_path = os.path.join(score_dir, score_id + '.txt')
        scores = [float(line) for line in read_lines(score_path)]
        # Line i of a score file belongs to MODELS[i]; a different length
        # would silently attach scores to the wrong systems.
        if len(scores) != len(self.MODELS):
            raise ValueError(
                f'{score_path} has {len(scores)} scores, '
                f'expected {len(self.MODELS)} (one per entry of MODELS).'
            )
        data['human_score'][score_id] = [
            score
            for model, score in zip(self.MODELS, scores)
            if model not in del_systems
        ]

    data['sources'] = read_lines(os.path.join(subset_dir, 'INPUT.txt'))
    data['references'] = [
        read_lines(os.path.join(subset_dir, 'REF0.txt')),
        read_lines(os.path.join(subset_dir, 'REF1.txt')),
    ]
    return data
[docs]
def load_xml(self, xml_path: str, target_models: list[str]) -> list[list[list[int]]]:
    '''Load sentence-level human rankings from a XML file.

    Args:
        xml_path (str): Path to a XML file.
        target_models (list[str]): Model names to be evaluated.

    Returns:
        list[list[list[int]]]:
            Sentence-level human evaluation rankings, ordered by ascending
            source id (the ids themselves are not kept).
            Each entry holds one score list per ranking item found for
            that source, in file order.
            A score list is aligned with ``target_models``:
            ``result[source_pos][item_pos][model_pos] = -rank``.
            Note that each element is *minus* rank, so higher values are
            higher quality. Models that do not appear in a ranking item
            are left as None.

    Raises:
        ValueError: If the XML root has no
            ``error-correction-ranking-result`` child element.
    '''
    root = ET.parse(xml_path).getroot()
    ranking_result = root.find('error-correction-ranking-result')
    if ranking_result is None:
        raise ValueError(
            f'No "error-correction-ranking-result" element in {xml_path}.'
        )

    # Position of each model in the score lists. setdefault keeps the first
    # position of a repeated name, exactly like list.index would.
    model_position = {}
    for position, name in enumerate(target_models):
        model_position.setdefault(name, position)

    rankings_by_source = {}
    for item in ranking_result:
        src_id = int(item.attrib['src-id'])
        scores = [None] * len(target_models)
        for ranked in item:
            # One element may name several space-separated (tied) systems.
            system_names = ranked.attrib['system'].split()
            rank = int(ranked.attrib['rank'])
            for system_name in system_names:
                position = model_position.get(system_name)
                if position is None:
                    continue
                # Put the minus ranking as a score
                scores[position] = -rank
        rankings_by_source.setdefault(src_id, []).append(scores)

    # Sort by source id.
    return [rankings_by_source[src_id] for src_id in sorted(rankings_by_source)]
[docs]
def load_sentence_data(self) -> dict[str, list]:
    '''Load sentence-level meta-evaluation data.

    The SEEDA files are located with recursive globs relative to the
    current working directory; the first match of each pattern is used.
    ``self.config.system`` selects which systems are excluded.

    Returns:
        dict[str, list]: The meta-evaluation data containing the following keys:
            - "sources": Source sentences (``INPUT.txt``). The shape is (num_sentences, ).
            - "hypotheses": Hypotheses sentences. The shape is (num_systems, num_sentences).
            - "references": Reference sentences (``REF0.txt``, ``REF1.txt``). The shape is (num_references, num_sentences).
            - "models": The kept model names. This index corresponds to the first dimension of "hypotheses".
            - "del_models": The excluded model names, always followed by 'REF0' and 'REF1'.
            - "human_score": Dictionary of sentence-level human rankings, each in the format returned by ``load_xml``.
                - "edit": Rankings read from ``judgments_edit.xml``.
                - "sent": Rankings read from ``judgments_sent.xml``.
            - "human_score_paths": An empty dictionary; this method does not fill it.

    Raises:
        FileNotFoundError: If the SEEDA output or data directory cannot be
            found below the current working directory.
        KeyError: If ``self.config.system`` is not a known system set.
    '''
    located = {}
    # Patterns keep their trailing slash so that only directories match.
    for key, pattern in (
        ('subset', '**/SEEDA/outputs/subset/'),
        ('data', '**/SEEDA/data/'),
    ):
        matches = glob.glob(pattern, recursive=True)
        if not matches:
            raise FileNotFoundError(
                f'SEEDA files not found: no directory matches "{pattern}" '
                f'below {os.getcwd()}.'
            )
        located[key] = matches[0]
    subset_dir = located['subset']
    data_dir = located['data']

    del_systems = {
        'base': ['INPUT', 'REF-F', 'GPT-3.5'],
        '+INPUT': ['REF-F', 'GPT-3.5'],
        '+REF-F_GPT-3.5': ['INPUT'],
        '+fluency': ['INPUT'],  # an alias
        'all': []
    }[self.config.system]
    # The list comes from the literal above, so extending it in place
    # cannot leak into other calls.
    del_systems += ['REF0', 'REF1']
    models = [m for m in self.MODELS if m not in del_systems]

    data = {
        'hypotheses': [],
        'human_score': dict(),
        'human_score_paths': dict(),
        'models': models,
        'del_models': del_systems,
        'sources': []
    }
    # os.path.join gives the same path as plain concatenation here, but
    # does not depend on the glob result ending with a separator.
    for kind in ('edit', 'sent'):
        data['human_score'][kind] = self.load_xml(
            os.path.join(data_dir, 'judgments_' + kind + '.xml'),
            models
        )

    data['hypotheses'] = [
        read_lines(os.path.join(subset_dir, model + '.txt'))
        for model in models
    ]
    data['sources'] = read_lines(os.path.join(subset_dir, 'INPUT.txt'))
    data['references'] = [
        read_lines(os.path.join(subset_dir, 'REF0.txt')),
        read_lines(os.path.join(subset_dir, 'REF1.txt')),
    ]
    return data
[docs]
def corr_system(
    self,
    metric: MetricBase,
    aggregation='default'
) -> "SEEDASystemCorrOutput":
    '''Compute system-level correlations.

    Delegates to ``MetaEvalBase.corr_system`` and labels the first four
    entries of its result.

    Args:
        metric (MetricBase): The metric to be evaluated.
        aggregation: Passed through to the base implementation as the
            ``aggregation`` keyword. Defaults to 'default'.

    Returns:
        SEEDASystemCorrOutput: The correlations.
    '''
    base_corrs = super().corr_system(metric, aggregation=aggregation)
    # Position k of the base result fills the k-th field below.
    field_names = ('ew_edit', 'ew_sent', 'ts_edit', 'ts_sent')
    labelled = {
        name: base_corrs[position]
        for position, name in enumerate(field_names)
    }
    return self.SEEDASystemCorrOutput(**labelled)
[docs]
def corr_sentence(
    self, metric: MetricBase
) -> "SEEDASentenceCorrOutput":
    '''Compute sentence-level correlations.

    Delegates to ``MetaEvalBase.corr_sentence`` and labels the first two
    entries of its result.

    Args:
        metric (MetricBase): The metric to be evaluated.

    Returns:
        SEEDASentenceCorrOutput: The correlations.
    '''
    base_corrs = super().corr_sentence(metric)
    # Entry 0 is stored as the edit-based result, entry 1 as the sentence-based one.
    edit_corr = base_corrs[0]
    sent_corr = base_corrs[1]
    return self.SEEDASentenceCorrOutput(edit=edit_corr, sent=sent_corr)
[docs]
def window_analysis_system(
    self,
    metric: MetricBase,
    window: int = 4,
    aggregation='default'
) -> "SEEDAWindowAnalysisSystemCorrOutput":
    '''Run the system-level window analysis.

    Delegates to ``MetaEvalBase.window_analysis_system`` and labels the
    first four entries of its result.

    Args:
        metric (MetricBase): The metric to be evaluated.
        window (int): Passed through to the base implementation.
            Defaults to 4.
        aggregation: Passed through to the base implementation.
            Defaults to 'default'.

    Returns:
        SEEDAWindowAnalysisSystemCorrOutput: The window-analysis results.
    '''
    base_results = super().window_analysis_system(metric, window, aggregation)
    # Position k of the base result fills the k-th field below.
    field_names = ('ew_edit', 'ew_sent', 'ts_edit', 'ts_sent')
    labelled = {
        name: base_results[position]
        for position, name in enumerate(field_names)
    }
    return self.SEEDAWindowAnalysisSystemCorrOutput(**labelled)