File size: 2,601 Bytes
3e37441
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import json
import os
from collections import Counter

# Directory that holds the numbered input files (0.json, 1.json, ...) read by
# this script; the repetition report is written into the same directory.
# NOTE(review): this is a relative path, so it is resolved against the current
# working directory at run time.
base_dir = "generate_result/zero_shot/bd_math/generation/llama3.1/1"


def has_repetition(text, threshold=3, ngram_size=15):
    """
    Find word n-grams that occur repeatedly in the given text.

    The text is split on whitespace and every window of ``ngram_size``
    consecutive words is counted. Windows overlap, and each window is
    re-joined with single spaces, so differences in whitespace between
    occurrences are ignored.

    :param text: The text to check for repetition
    :param threshold: Minimum number of occurrences for a phrase to be reported
    :param ngram_size: Number of words in each phrase window (default 15)
    :return: A list of the repeated phrases in order of first appearance,
        or an empty list if none is found or the text has fewer than
        ``ngram_size`` words
    :raises ValueError: If ``ngram_size`` is smaller than 1
    """
    if ngram_size < 1:
        raise ValueError("ngram_size must be at least 1")

    words = text.split()
    # Non-positive when the text is shorter than one window, which makes the
    # range below empty and the result an empty list.
    window_count = len(words) - ngram_size + 1
    phrase_counts = Counter(
        " ".join(words[start : start + ngram_size]) for start in range(window_count)
    )
    # Counter keeps first-seen order, so the result follows the text order.
    return [phrase for phrase, count in phrase_counts.items() if count >= threshold]


# Running totals across all input files.
total_items = 0
items_with_repetition = 0
# One entry per item in which at least one repeated phrase was found.
repetition_data = []

# Inputs are looked up as 0.json .. 7.json inside base_dir; each file is read
# line by line with one JSON object per line.
for i in range(8):
    file_path = os.path.join(base_dir, f"{i}.json")

    if not os.path.exists(file_path):
        print(f"File {file_path} does not exist. Skipping.")
        continue

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        for line_number, line in enumerate(file, 1):
            # A blank line (e.g. a trailing newline) carries no record; skip it
            # rather than report a spurious decode error. Line numbers still
            # count blank lines because enumerate() runs over every line.
            if not line.strip():
                continue
            try:
                data = json.loads(line)
                # Only the first element of "total output" is analysed. A
                # missing or empty value raises here and is reported by the
                # generic handler below without being counted.
                model_output = data.get("total output", "")[0]
                total_items += 1

                repetitive_phrases = has_repetition(model_output)
                if repetitive_phrases:
                    items_with_repetition += 1
                    repetition_data.append(
                        {
                            "file": f"{i}.json",
                            "line": line_number,
                            "prompt": data.get("prompt", ""),
                            "repetitive_phrases": repetitive_phrases,
                        }
                    )
            except json.JSONDecodeError:
                print(f"Error decoding JSON in file {i}.json, line {line_number}")
            except Exception as e:
                # Best-effort scan: report the bad record and keep going so one
                # malformed line does not abort the whole analysis.
                print(f"Error processing file {i}.json, line {line_number}: {str(e)}")

# Calculate the ratio (0 when nothing could be read, to avoid dividing by zero)
ratio = items_with_repetition / total_items if total_items > 0 else 0
print(
    f"Ratio of items with repetition: {ratio:.2f} ({items_with_repetition}/{total_items})"
)

# Save repetition data to a file next to the inputs
output_file = "repetition_analysis.json"
output_path = os.path.join(base_dir, output_file)
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(repetition_data, f, indent=2)

# Report the real location: the file is written inside base_dir, not the
# current working directory.
print(f"Repetition analysis completed. Results saved to {output_path}")