Source code for gec_metrics.metrics.green

from .base import MetricBaseForReferenceBased
from dataclasses import dataclass
from collections import Counter
import math
import hashlib


[docs]
class GREEN(MetricBaseForReferenceBased):

[docs]
    @dataclass
    class Config(MetricBaseForReferenceBased.Config):
        '''GREEN configuration.

        Extends the base metric configuration with the n-gram settings
        read by this metric.

            - n (int): Maximum n for n-grams; every order from 1 to n is counted.
            - beta (float): The beta for F-beta score.
            - unit (str): Word-level or character-level. Can be 'word' or 'char'.
                'word' splits the sentence on single spaces, 'char' treats
                every character as a token.
        '''
        n: int = 4
        beta: float = 2.0
        unit: str = 'word'


    def __init__(self, config: Config = None):
        '''Initialize the metric with an empty per-instance n-gram cache.

        Args:
            config (Config): Metric configuration, forwarded unchanged to
                the base-class initializer.
        '''
        super().__init__(config)
        # Filled lazily by cached_get_all_ngrams().
        self.cache_ngram = {}
    

[docs]
    def cached_get_all_ngrams(
        self,
        sentence: str,
    ) -> dict[tuple[str, ...], int]:
        '''Get frequency of n-gram for all n (1 <= n <= config.n).

        The sentence is tokenized according to ``config.unit``: 'word'
        splits on single spaces, 'char' treats every character as a token.
        Results are memoized in ``self.cache_ngram``.

        Args:
            sentence (str): The sentence to count n-grams in.

        Returns:
            dict[tuple[str, ...], int]: Mapping from an n-gram (a tuple of
                tokens) to its frequency. An empty dict is returned for an
                empty sentence. The returned object is the cached one, so
                callers should treat it as read-only.

        Raises:
            ValueError: If ``config.unit`` is neither 'word' nor 'char'.
        '''
        if sentence == '':
            return dict()
        unit = self.config.unit
        if unit not in ('word', 'char'):
            raise ValueError(
                f"Unsupported unit {unit!r}; expected 'word' or 'char'."
            )
        max_n = self.config.n
        # The counts depend on the tokenization unit and the maximum order,
        # so both belong in the key; otherwise a config change would be
        # answered with stale counts.
        key = (unit, max_n, hashlib.sha256(sentence.encode()).hexdigest())
        counts = self.cache_ngram.get(key)
        if counts is None:
            # Tokenize only on a cache miss.
            tokens = sentence.split(' ') if unit == 'word' else sentence
            counts = Counter(
                tuple(tokens[i:i + n])
                for n in range(1, max_n + 1)
                for i in range(len(tokens) - n + 1)
            )
            self.cache_ngram[key] = counts
        return counts

    

[docs]
    def aggregate_score(self, scores: list["Score"]) -> float:
        '''Aggregate n-gram scores to an overall score by the geometric mean.

        Precision and recall are each combined across n-gram orders with a
        geometric mean, and the two means are then combined into an F-beta
        score using ``config.beta``.

        Args:
            scores (list[Score]): The scores keeping n-gram boundary.
                The shape is (n, )

        Returns:
            float: The aggregated score. 0.0 when ``scores`` is empty or
                when the F-beta denominator is zero.
        '''
        # len() (rather than truthiness) so that a non-sequence such as None
        # still fails loudly instead of being scored as 0.0.
        if len(scores) == 0:
            return 0.0

        def geometric_mean(values):
            # A single zero makes the whole product zero; returning early
            # also keeps math.log() away from 0.
            if 0 in values:
                return 0.0
            # (prod x)^(1/N) = exp((1/N) * sum(log(x)))
            return math.exp(sum(math.log(v) for v in values) / len(values))

        prec = geometric_mean([s.precision for s in scores])
        rec = geometric_mean([s.recall for s in scores])
        beta_sq = self.config.beta ** 2
        denominator = (beta_sq * prec) + rec
        # Test the real denominator: with beta == 0 and rec == 0 it is zero
        # even though prec + rec is not.
        if denominator == 0:
            return 0.0
        return float((1 + beta_sq) * prec * rec) / denominator

    

[docs]
    def score_corpus(
        self,
        sources: list[str],
        hypotheses: list[str],
        references: list[list[str]]
    ) -> float:
        '''Calculate a corpus-level score.
        This accumulates n-gram count for TP, FP, FN
            and calculates f-beta score.

        For every sentence the reference with the highest aggregated score
        is selected (the first one wins on ties), and its per-order scores
        are added to the corpus totals.

        Args:
            sources (list[str]): Source sentence.
                The shape is (num_sentences, )
            hypotheses (list[str]): Corrected sentences.
                The shape is (num_sentences, )
            references (list[list[str]]): Reference sentences.
                The shape is (num_references, num_sentences).

        Returns:
            float: The corpus-level score.

        Raises:
            ValueError: If a sentence has no reference to compare against.
        '''
        verbose_scores = self.score_base(
            sources,
            hypotheses,
            references
        )
        totals = [self.Score(beta=self.config.beta) for _ in range(self.config.n)]
        for sent_id, ref_scores in enumerate(verbose_scores):  # sentence loop
            if not ref_scores:
                raise ValueError(
                    f'No reference is available for sentence {sent_id}.'
                )
            # Choose the best reference. max() returns the first maximal
            # item, so ties go to the earliest reference, and each candidate
            # is aggregated exactly once.
            best_score = max(ref_scores, key=self.aggregate_score)
            # Accumulate scores for each n-gram.
            for n in range(self.config.n):
                totals[n] += best_score[n]
        return self.aggregate_score(totals)



[docs]
    def score_sentence(
        self,
        sources: list[str],
        hypotheses: list[str],
        references: list[list[str]]
    ) -> list[float]:
        '''Calculate sentence-level scores.

        Each sentence is scored against every reference, and the highest
        aggregated score among the references is reported.

        Args:
            sources (list[str]): Source sentence.
                The shape is (num_sentences, )
            hypotheses (list[str]): Corrected sentences.
                The shape is (num_sentences, )
            references (list[list[str]]): Reference sentences.
                The shape is (num_references, num_sentences).

        Returns:
            list[float]: The sentence-level scores.

        Raises:
            ValueError: If a sentence has no reference to compare against.
        '''
        verbose_scores = self.score_base(
            sources,
            hypotheses,
            references
        )
        scores = []
        for sent_id, ref_scores in enumerate(verbose_scores):  # sentence loop
            if not ref_scores:
                raise ValueError(
                    f'No reference is available for sentence {sent_id}.'
                )
            # The sentence score is the aggregated score of its best
            # reference; each reference is aggregated exactly once.
            scores.append(
                max(self.aggregate_score(ref_score) for ref_score in ref_scores)
            )
        return scores

        

[docs]
    def score_base(
        self,
        sources: list[str],
        hypotheses: list[str],
        references: list[list[str]]
    ) -> list[list[list["Score"]]]:
        '''Calculate scores while retaining sentence and reference boundaries.
            The results can be aggregated according to the purpose,
                e.g., at sentence-level or corpus-level.

        For each n-gram seen in the source, hypothesis or reference, its
        three frequencies are compared and the differences are added to the
        tp / fp / fn counters of the Score for that n-gram order.

        Args:
            sources (list[str]): Source sentences.
            hypotheses (list[str]): Corrected sentences.
            references (list[list[str]]): Reference sentences.
                The shape is (the number of references, the number of sentences).

        Returns:
            list[list[list["Score"]]]: The verbose scores.
                The shape is (num_sents, num_refs, max_ngram).
        '''
        verbose = []  # The shape will be (num_sents, num_refs, max_ngram)
        for sent_id, source in enumerate(sources):
            src_counts = self.cached_get_all_ngrams(source.strip())
            hyp_counts = self.cached_get_all_ngrams(hypotheses[sent_id].strip())
            all_ref_counts = [
                self.cached_get_all_ngrams(ref[sent_id].strip()) for ref in references
            ]
            per_reference = []
            for ref_counts in all_ref_counts:
                # One Score per n-gram order; index 0 holds unigrams.
                per_order = [self.Score(beta=self.config.beta) for _ in range(self.config.n)]
                every_ngram = src_counts.keys() | hyp_counts.keys() | ref_counts.keys()
                for ngram in every_ngram:
                    in_src = src_counts.get(ngram, 0)
                    in_hyp = hyp_counts.get(ngram, 0)
                    in_ref = ref_counts.get(ngram, 0)
                    bucket = per_order[len(ngram) - 1]

                    # TD: source occurrences dropped by both reference and hypothesis.
                    td = max(in_src - max(in_ref, in_hyp), 0)
                    # TI: occurrences added by both reference and hypothesis.
                    ti = max(min(in_ref, in_hyp) - in_src, 0)
                    # TK: occurrences present in all three sentences.
                    tk = min(in_src, in_hyp, in_ref)
                    # OD: occurrences in source and reference that the hypothesis dropped.
                    od = max(min(in_src, in_ref) - in_hyp, 0)
                    # OI: occurrences only the hypothesis added.
                    oi = max(in_hyp - max(in_src, in_ref), 0)
                    # UD: occurrences in source and hypothesis that the reference dropped.
                    ud = max(min(in_src, in_hyp) - in_ref, 0)
                    # UI: occurrences only the reference added.
                    ui = max(in_ref - max(in_src, in_hyp), 0)

                    bucket.tp += td + ti + tk
                    bucket.fp += od + oi
                    bucket.fn += ud + ui
                per_reference.append(per_order)
            verbose.append(per_reference)
        return verbose