File size: 3,654 Bytes
251efe4
 
 
 
 
 
 
b6836c3
 
ab3d55c
 
 
 
 
b6836c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251efe4
b6836c3
 
 
 
251efe4
b6836c3
 
 
251efe4
 
5b4cc8a
251efe4
 
 
 
 
 
 
 
5b4cc8a
 
 
 
 
 
 
 
 
 
 
251efe4
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# -*- coding:utf-8 -*-
from __future__ import annotations

import re
import evaluate
import pandas as pd

# Announce on import which file is being loaded (runs every time the module is
# first imported, not only when executed as a script).
print(f"loading: {__file__}")


# Metric objects are created once at import time and shared by
# calc_perf_scores() below.
# NOTE(review): evaluate.load() presumably fetches/caches the metric scripts on
# first use, so importing this module may need network access — confirm.
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bert_score = evaluate.load("bertscore")

# Earlier drafts, kept for reference only (superseded by the patterns below):
# pattern_non_word_char_repetition = re.compile(r"\s{5,}")
# pattern_text_repetitions = re.compile(r"(.{5}.*)\s*((\1)\s*)+", re.M | re.DOTALL)

# final version
# Matches a run of five or more consecutive non-word characters (whitespace,
# punctuation, symbols). Underscore is a word character, so runs of "_" are
# not matched.
pattern_non_word_char_repetition = re.compile(r"[\s\W]{5,}")
# Matches a chunk of at least five characters that is immediately followed by
# one or more copies of itself, with optional non-word characters in between.
pattern_text_repetitions = re.compile(
    r"(?P<repeat>.{5}.*?)(?:[\s\W]*(?P=repeat))+", re.M | re.DOTALL | re.IGNORECASE
)
# Explanation of the regex pattern:
#   (?P<repeat>.{5}.*?): Captures a sequence of at least 5 characters and names this group `repeat`.
#     .{5}: Exactly five characters of any kind (newlines included, because of re.DOTALL).
#     .*?: Zero or more further characters, non-greedily (as few as possible).
#   (?:[\s\W]*(?P=repeat))+: A non-capturing group that matches one or more repetitions of:
#     [\s\W]*: Zero or more whitespace or non-word characters (spaces, punctuation, etc.),
#       so copies that directly touch each other are matched as well.
#     (?P=repeat): A backreference to the named group `repeat`.
#   Flags:
#     re.DOTALL: "." also matches newlines, so a repeated chunk may span lines.
#     re.IGNORECASE: the backreference is compared case-insensitively.
#     re.M: has no effect here because the pattern contains no ^ or $ anchors.


def del_non_word_char_repetition(text, debug=False):
    """Collapse long runs of non-word characters in *text* into a single tab.

    Every match of the module-level ``pattern_non_word_char_repetition``
    is replaced by one tab character.

    Args:
        text: The value to clean. Anything that is not a ``str`` is returned
            unchanged.
        debug: When true, print a banner and, if anything was removed, the
            number of characters removed.

    Returns:
        A ``(text, count)`` tuple: the cleaned text and the number of
        characters by which it became shorter (``0`` for non-string input).
    """
    # Non-string input is handed back untouched with a zero count.
    if not isinstance(text, str):
        return text, 0

    if debug:
        print("----detect non-word characters repetition----")

    cleaned = pattern_non_word_char_repetition.sub("\t", text)
    # Each run is replaced by one tab, so this is the net length reduction.
    removed = len(text) - len(cleaned)

    if debug and removed:
        print(f"removed non-word characters repetition: {removed}")
    return cleaned, removed


# final version for repetition detection
def detect_text_repetitions(text, debug=False):
    """Count the characters of *text* taken up by repeated copies of a chunk.

    Scans *text* with the module-level ``pattern_text_repetitions``. For each
    match, the length of the whole match minus the length of the first
    captured occurrence is added to the total, so separators between the
    copies are counted as well.

    Args:
        text: The value to scan. Anything that is not a ``str`` yields ``0``.
        debug: When true, print a banner, every match object and the span
            and content of each captured group.

    Returns:
        The total number of characters attributed to repetitions.
    """
    if not isinstance(text, str):
        return 0

    if debug:
        print("----detect text repetitions----")

    repeated_chars = 0
    for found in pattern_text_repetitions.finditer(text):
        if debug:
            print(found)
            # Capture groups are numbered from 1; group 0 is the whole match.
            for index in range(1, len(found.groups()) + 1):
                print(
                    "Group {groupNum} found at {start}-{end}: `{group}`".format(
                        groupNum=index,
                        start=found.start(index),
                        end=found.end(index),
                        group=found.group(index),
                    )
                )

        # Everything after the first occurrence counts as repetition.
        whole_length = found.end() - found.start()
        repeated_chars += whole_length - len(found.group(1))

    return repeated_chars


def detect_repetitions(text, debug=False):
    """Measure both kinds of repetition in *text*.

    Runs of non-word characters are collapsed first via
    ``del_non_word_char_repetition``; the cleaned text is then scanned by
    ``detect_text_repetitions``.

    Args:
        text: The value to analyse. Anything that is not a ``str`` yields
            ``(0, 0, 0)``.
        debug: Forwarded to both helpers; when true the resulting tuple is
            printed as well.

    Returns:
        A tuple ``(non_word_count, text_count, total)`` where ``total`` is the
        sum of the first two entries.
    """
    if not isinstance(text, str):
        return 0, 0, 0

    cleaned, non_word_count = del_non_word_char_repetition(text, debug=debug)
    # Text repetitions are measured on the already-cleaned text.
    text_count = detect_text_repetitions(cleaned, debug=debug)

    summary = (non_word_count, text_count, non_word_count + text_count)
    if debug:
        print(summary)
    return summary


def calc_perf_scores(predictions, references, debug=False):
    """Compute BLEU, ROUGE and BERTScore for *predictions* against *references*.

    Uses the module-level ``bleu``, ``rouge`` and ``bert_score`` metric
    objects. BLEU is computed with ``max_order=1``; BERTScore is computed with
    ``lang="en"`` and the ``microsoft/deberta-large-mnli`` model.

    Args:
        predictions: Model outputs, passed unchanged to each metric.
        references: Reference texts, passed unchanged to each metric.
            NOTE(review): presumably lists of strings of equal length —
            confirm against the callers.
        debug: When true, print the inputs and the combined result.

    Returns:
        A dict with the keys ``"bleu_scores"``, ``"rouge_scores"`` and
        ``"bert_scores"``, each holding whatever the corresponding metric's
        ``compute`` call returned.
    """
    if debug:
        print("predictions:", predictions)
        print("references:", references)

    # Every metric receives the same prediction/reference pair.
    shared = {"predictions": predictions, "references": references}

    # Dict entries are evaluated top to bottom: BLEU, then ROUGE, then BERTScore.
    result = {
        "bleu_scores": bleu.compute(max_order=1, **shared),
        "rouge_scores": rouge.compute(**shared),
        "bert_scores": bert_score.compute(
            lang="en",
            model_type="microsoft/deberta-large-mnli",
            **shared,
        ),
    }

    if debug:
        print("result:", result)

    return result