from .base import MetricBaseForReferenceBased
from dataclasses import dataclass
from collections import Counter
import math
import hashlib
[docs]
class GREEN(MetricBaseForReferenceBased):
[docs]
@dataclass
class Config(MetricBaseForReferenceBased.Config):
'''GREEN configuration
- n (int): Maxmimun n for n-gram.
- beta (int): The beta for F-beta score.
- unit (str): Word-level or character-level. Can be 'word' or 'char'.
'''
n: int = 4
beta: float = 2.0
unit: str = 'word'
def __init__(self, config: Config = None):
super().__init__(config)
self.cache_ngram = dict()
[docs]
def cached_get_all_ngrams(
self,
sentence: str,
) -> dict[str, int]:
'''Get frequency of n-gram for all n (1 <= n <= config.n)
'''
if sentence == '':
return dict()
if self.config.unit == 'word':
words = sentence.split(' ')
elif self.config.unit == 'char':
words = sentence
key = hashlib.sha256(sentence.encode()).hexdigest()
if self.cache_ngram.get(key) is None:
ngrams = []
for n in range(1, self.config.n + 1):
for i in range(len(words) - n + 1):
ngrams.append(tuple(words[i:i+n]))
self.cache_ngram[key] = Counter(ngrams)
return self.cache_ngram[key]
[docs]
def aggregate_score(self, scores: list["Score"]) -> float:
'''Aggregate n-gram scores to an overall score by the geometric mean.
Args:
scores (list[Score]): The scores keeping n-gram boundary.
The shape is (n, )
Returns:
float: The aggregated score.
'''
ps = [s.precision for s in scores]
rs = [s.recall for s in scores]
if 0 in ps:
prec = 0
else:
# $(\PI x)^(1/N) = exp((1/N) \sum log(x))
prec = math.exp(sum(math.log(p) for p in ps) / len(scores))
if 0 in rs:
rec = 0
else:
rec = math.exp(sum(math.log(r) for r in rs) / len(scores))
beta = self.config.beta
f = float((1+(beta**2))*prec*rec) / (((beta**2)*prec)+rec) if prec+rec else 0.0
return f
[docs]
def score_corpus(
self,
sources: list[str],
hypotheses: list[str],
references: list[list[str]]
) -> float:
'''Calculate a corpus-level score.
This accumulates n-gram count for TP, FP, FN
and calculates f-beta score.
Args:
sources (list[str]): Source sentence.
The shape is (num_sentences, )
hypotheses (list[str]): Corrected sentences.
The shape is (num_sentences, )
references (list[list[str]]): Reference sentences.
The shape is (num_references, num_sentences).
Returns:
float: The corpus-level score.
'''
verbose_scores = self.score_base(
sources,
hypotheses,
references
)
score = [self.Score(beta=self.config.beta) for _ in range(self.config.n)]
for v_scores in verbose_scores: # sentence loop
best_score = None
for v_score_for_ref in v_scores: # reference loop
# Choose the best reference
if best_score is None \
or self.aggregate_score(best_score) < self.aggregate_score(v_score_for_ref):
best_score = v_score_for_ref
# Accumulate scores for each n-gram.
for n in range(self.config.n):
score[n] += best_score[n]
return self.aggregate_score(score)
[docs]
def score_sentence(
self,
sources: list[str],
hypotheses: list[str],
references: list[list[str]]
) -> list[float]:
'''Calculate sentence-level scores.
Args:
sources (list[str]): Source sentence.
The shape is (num_sentences, )
hypotheses (list[str]): Corrected sentences.
The shape is (num_sentences, )
references (list[list[str]]): Reference sentences.
The shape is (num_references, num_sentences).
Returns:
list[float]: The sentence-level scores.
'''
verbose_scores = self.score_base(
sources,
hypotheses,
references
)
scores = []
for v_scores in verbose_scores: # sentence loop
best_score = None
for v_score_for_ref in v_scores: # reference loop
# Choose the best reference
if best_score is None \
or self.aggregate_score(best_score) < self.aggregate_score(v_score_for_ref):
best_score = v_score_for_ref
# Accumulate scores for each n-gram.
scores.append(self.aggregate_score(best_score))
return scores
[docs]
def score_base(
self,
sources: list[str],
hypotheses: list[str],
references: list[list[str]]
) -> list[list[list["Score"]]]:
'''Calculate scores while retaining sentence and reference boundaries.
The results can be aggregated according to the purpose,
e.g., at sentence-level or corpus-level.
Args:
sources (list[str]): Source sentence.
hypothesis (list[str]): Corrected sentences.
references (list[list[str]]): Reference sentences.
The shape is (the number of references, the number of sentences).
Returns:
list[list[list["Score"]]]: The verbose scores.
The shape is (num_iterations, num_sents, max_ngram).
'''
num_sents = len(sources)
scores = [] # The shape will be (num_sents, num_refs, max_ngram)
for sent_id in range(num_sents):
ngram_s = self.cached_get_all_ngrams(sources[sent_id].strip())
ngram_h = self.cached_get_all_ngrams(hypotheses[sent_id].strip())
ngram_rs = [
self.cached_get_all_ngrams(ref[sent_id].strip()) for ref in references
]
sent_score = []
for ngram_r in ngram_rs:
all_ngram = set(list(ngram_s.keys()) + list(ngram_h.keys()) + list(ngram_r.keys()))
this_score = [self.Score(beta=self.config.beta) for _ in range(self.config.n)]
for ngram in all_ngram:
idx = len(ngram) - 1
ms = ngram_s.get(ngram, 0)
mh = ngram_h.get(ngram, 0)
mr = ngram_r.get(ngram, 0)
# TD
this_score[idx].tp += max(
ms - max(mr, mh),
0
)
# TI
this_score[idx].tp += max(
min(mr, mh) - ms,
0
)
# TK
this_score[idx].tp += min(ms, mh, mr)
# OD
this_score[idx].fp += max(
min(ms, mr) - mh,
0
)
# OI
this_score[idx].fp += max(
mh - max(ms, mr),
0
)
# UD
this_score[idx].fn += max(
min(ms, mh) - mr,
0
)
# UI
this_score[idx].fn += max(
mr - max(ms, mh),
0
)
sent_score.append(this_score)
scores.append(sent_score)
return scores