from dataclasses import dataclass
from .base import MetricBaseForReferenceBased, MetricBase
import hashlib
import errant
import spacy
[docs]
class ERRANT(MetricBaseForReferenceBased):
[docs]
@dataclass
class Config(MetricBaseForReferenceBased.Config):
'''ERRANT configuration.
- beta (float): The beta for F-beta score.
- language (str): The language for spacy.
'''
beta: float = 0.5
language: str = 'en'
def __init__(self, config: Config = None):
super().__init__(config)
self.errant = errant.load(self.config.language)
self.cache_parse = dict()
self.cache_annotate = dict()
[docs]
def cached_parse(self, sent: str) -> spacy.tokens.doc.Doc:
'''Efficient parse() by caching.
Args:
sent (str): The sentence to be parsed.
Return:
spacy.tokens.doc.Doc: The parse results.
'''
key = hashlib.sha256(sent.encode()).hexdigest()
if self.cache_parse.get(key) is None:
self.cache_parse[key] = self.errant.parse(sent)
return self.cache_parse[key]
[docs]
def filter_edits(
self,
edits: list[errant.edit.Edit]
) -> list[errant.edit.Edit]:
'''Handle edits that will be ignored.'''
return [e for e in edits if e.type not in ['noop', 'UNK']]
[docs]
def aggregate_to_overall(self, scores: dict[str, "Score"]) -> "Score":
'''Convert error type-wise scores into an overall score.
Args:
scores (dict[str, "Score"]): Error type-wise scores.
Returns:
Score: The aggregated score.
'''
overall = self.Score(beta=self.config.beta)
for v in scores.values():
overall += v
return overall
[docs]
def score_corpus(
self,
sources: list[str],
hypotheses: list[str],
references: list[list[str]]
) -> float:
'''Calculate a corpus-level score.
This accumulates edit count for TP, FP, FN
and calculates f-beta score.
Args:
sources (list[str]): Source sentence.
The shape is (num_sentences, )
hypotheses (list[str]): Corrected sentences.
The shape is (num_sentences, )
references (list[list[str]]): Reference sentences.
The shape is (num_references, num_sentences).
Returns:
float: The corpus-level score.
'''
verbose_scores = self.score_corpus_verbose(
sources, hypotheses, references
)
return verbose_scores.f
[docs]
def score_corpus_verbose(
self,
sources: list[str],
hypotheses: list[str],
references: list[list[str]]
) -> "Score":
'''Calculate a corpus level score by aggregating verbose scores.
Args:
sources (list[str]): Source sentence.
hypothesis (list[str]): Corrected sentences.
references (list[list[str]]): Reference sentences.
The shape is (the number of references, the number of sentences).
Returns:
Score: It contains TP, FP, FN, Precision, Recall, and F-beta.
'''
verbose_scores = self.score_base(
sources,
hypotheses,
references
)
score = self.Score(beta=self.config.beta)
for v_scores in verbose_scores: # sentence loop
best_score = None
for v_score_for_ref in v_scores: # reference loop
agg_score = self.aggregate_to_overall(v_score_for_ref)
# The comparison is performed by adding
# the current sentence-level score to the current accumulated score.
# This is not mentioned ERRANT paper but the official implementation is doing so.
if best_score is None or (score + best_score) < (score + agg_score):
best_score = agg_score
score += best_score
return score
[docs]
def score_corpus_etype(
self,
sources: list[str],
hypotheses: list[str],
references: list[list[str]],
cat: int = 2
):
'''Calculate error-type-level scores.
Args:
sources (list[str]): Source sentence.
The shape is (num_sentences, )
hypotheses (list[str]): Corrected sentences.
The shape is (num_sentences, )
references (list[list[str]]): Reference sentences.
The shape is (num_references, num_sentences).
cat (int): Error type category.
By following the original ERRANT,
cat=1: Operation: e.g. M, R, U
cat=2: Main types: e.g. NOUN, VERB
cat=3: All types: e.g. M:NOUN, R:VERB
Returns:
dict[str, Score]: The error-type-level score.
Each key is an error type, and value is the Score instance.
'''
verbose_scores = self.score_base(
sources,
hypotheses,
references
)
score = self.Score(beta=self.config.beta)
etype_scores = dict()
for v_scores in verbose_scores:
best_score = None
best_v_score = None
# Identify the best reference
for v_score_for_ref in v_scores: # reference loop
agg_score = self.aggregate_to_overall(v_score_for_ref)
if best_score is None or (score + best_score) < (score + agg_score):
best_score = agg_score
best_v_score = v_score_for_ref
# Now best_v_score is a dict, {etype: Score}
for etype in best_v_score.keys():
this_etype = etype
if cat == 1:
this_etype = this_etype[0] # E.g., R:NOUN -> R
elif cat == 2:
this_etype = this_etype[2:] # E.g., R:NOUN -> NOUN
etype_scores[this_etype] = etype_scores.get(this_etype, self.Score(beta=self.config.beta))
etype_scores[this_etype] += best_v_score[etype] # Adding between Score instances
return etype_scores
[docs]
def score_sentence(
self,
sources: list[str],
hypotheses: list[str],
references: list[list[str]]
) -> list[float]:
'''Calculate sentence-level scores.
Args:
sources (list[str]): Source sentence.
The shape is (num_sentences, )
hypotheses (list[str]): Corrected sentences.
The shape is (num_sentences, )
references (list[list[str]]): Reference sentences.
The shape is (num_references, num_sentences).
Returns:
list[float]: The sentence-level scores.
'''
verbose_scores = self.score_sentence_verbose(
sources, hypotheses, references
)
return [s.f for s in verbose_scores]
[docs]
def score_sentence_verbose(
self,
sources: list[str],
hypotheses: list[str],
references: list[list[str]]
) -> list["Score"]:
'''Calculate sentence level scores by aggregating verbose scores.
"verbose" means that TP, FP, FN, Precisoin, Recall, and F are available.
Args:
sources (list[str]): Source sentence.
The shape is (num_sentences, )
hypotheses (list[str]): Corrected sentences.
The shape is (num_sentences, )
references (list[list[str]]): Reference sentences.
The shape is (num_references, num_sentences).
Returns:
list[Score]: The sentence-level scores.
'''
verbose_scores = self.score_base(
sources,
hypotheses,
references
)
scores = []
for sent_id, v_scores in enumerate(verbose_scores): # sentence loop
best_score = None
for v_score_for_ref in v_scores: # reference loop to choose the best reference.
agg_score = self.aggregate_to_overall(v_score_for_ref)
if best_score is None or best_score < agg_score:
best_score = agg_score
scores.append(best_score)
return scores
[docs]
def score_base(
self,
sources: list[str],
hypotheses: list[str],
references: list[list[str]]
) -> list[list[dict[str, "Score"]]]:
'''Calculate scores while retaining sentence and reference boundaries.
The results can be aggregated according to the purpose,
e.g., at sentence-level or corpus-level.
Args:
sources (list[str]): Source sentence.
hypothesis (list[str]): Corrected sentences.
references (list[list[str]]): Reference sentences.
The shape is (the number of references, the number of sentences).
Returns:
list[list[dict[str, "Score"]]]: The verbose scores.
- The list shape is (num_sents, num_refs)
- The dict contains error type-wise scores.
'''
num_sents = len(sources)
num_refs = len(references)
scores = [] # shape will be: (num_sents, num_refs, )
for sent_id in range(num_sents):
hyp_edits = self.edit_extraction(
sources[sent_id],
hypotheses[sent_id]
)
ref_edits_list = [self.edit_extraction(
sources[sent_id],
references[ref_id][sent_id]
) for ref_id in range(num_refs)]
sent_scores = [] # shape will be: (num_refs, )
h_edits = [(e.o_start, e.o_end, e.c_str) for e in hyp_edits]
h_types = [e.type for e in hyp_edits]
for ref_edits in ref_edits_list:
r_edits = [(e.o_start, e.o_end, e.c_str) for e in ref_edits]
r_types = [e.type for e in ref_edits]
this_score = dict()
for h_edit, h_type in zip(h_edits, h_types):
this_score[h_type] = this_score.get(
h_type, self.Score(beta=self.config.beta)
)
if h_edit in r_edits:
this_score[h_type].tp += 1
else:
this_score[h_type].fp += 1
for r_edit, r_type in zip(r_edits, r_types):
if r_edit not in h_edits:
this_score[r_type] = this_score.get(
r_type, self.Score(beta=self.config.beta)
)
this_score[r_type].fn += 1
sent_scores.append(this_score)
scores.append(sent_scores)
return scores