from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from datetime import datetime
import gc

class ContentAnalyzer:
    def __init__(self):
        self.model_name = "meta-llama/Llama-3.2-1B"
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = None
        self.model = None
        
    def load_model(self):
        """Load tokenizer and model with memory optimization (idempotent)."""
        if self.model is not None and self.tokenizer is not None:
            return True
        try:
            print("Loading tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)

            print(f"Loading model on {self.device}...")
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                low_cpu_mem_usage=True,
                device_map="auto"
            )
            return True
        except Exception as e:
            print(f"Model loading error: {e}")
            return False

    def cleanup(self):
        """Release model references and free GPU memory.

        Dropping the references is required: torch.cuda.empty_cache() alone
        cannot free memory that self.model still points to.
        """
        self.model = None
        self.tokenizer = None
        gc.collect()
        if self.device == "cuda":
            torch.cuda.empty_cache()

    def analyze_chunk(self, chunk, category_info):
        """Analyze a single chunk of text for a specific trigger category."""
        mapped_name = category_info["mapped_name"]
        description = category_info["description"]

        # Build the prompt without the method's indentation leaking into it
        # (a triple-quoted string inside a method would embed leading spaces).
        prompt = (
            f"Check this text for any indication of {mapped_name} ({description}). "
            "Be sensitive to subtle references or implications, but do not flag "
            "purely metaphorical language. "
            "Respond concisely with: YES, NO, or MAYBE.\n"
            f"Text: {chunk}\n"
            "Answer:"
        )

        try:
            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
            # With device_map="auto", send inputs to the device the model landed on.
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=10,
                    do_sample=True,
                    temperature=0.5,
                    top_p=0.9,
                    pad_token_id=self.tokenizer.eos_token_id
                )

            # Decode only the newly generated tokens; decoding the full sequence
            # would include the echoed prompt, so the first parsed word would be
            # "ANSWER:" rather than the model's verdict.
            generated = outputs[0][inputs["input_ids"].shape[1]:]
            response = self.tokenizer.decode(generated, skip_special_tokens=True).strip().upper()
            first_word = response.split()[0].strip(".,:;") if response else "NO"

            score = 1 if first_word == "YES" else 0.5 if first_word == "MAYBE" else 0
            return score, first_word

        except Exception as e:
            print(f"Chunk analysis error: {e}")
            return 0, "NO"

    def analyze_text(self, text):
        """Main analysis function: chunk the text and score each trigger category."""
        if not self.load_model():
            return {"error": "Model loading failed"}

        chunk_size = 256  # Characters per chunk
        overlap = 15      # Characters shared between consecutive chunks to preserve context
        script_chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size - overlap)]

        trigger_categories = {
            "Violence": {"mapped_name": "Violence", "description": "Any act involving physical force or aggression intended to cause harm, injury, or death."},
            "Death": {"mapped_name": "Death References", "description": "Any mention, implication, or depiction of the loss of life, including direct deaths or abstract references to mortality."},
            "Substance_Use": {"mapped_name": "Substance Use", "description": "References to consumption, misuse, or abuse of drugs, alcohol, or other intoxicating substances."},
            "Gore": {"mapped_name": "Gore", "description": "Graphic depictions of severe physical injuries, mutilation, or extreme bodily harm."},
            "Sexual_Content": {"mapped_name": "Sexual Content", "description": "Depictions or mentions of sexual activity, intimacy, or sexual behavior."},
            "Self_Harm": {"mapped_name": "Self-Harm", "description": "Behaviors where an individual intentionally causes harm to themselves."},
            "Mental_Health": {"mapped_name": "Mental Health Issues", "description": "References to mental health struggles, disorders, or psychological distress."}
        }

        identified_triggers = {}

        for chunk_idx, chunk in enumerate(script_chunks, 1):
            print(f"\n--- Processing Chunk {chunk_idx}/{len(script_chunks)} ---")
            for category, info in trigger_categories.items():
                # analyze_chunk already maps YES -> 1, MAYBE -> 0.5, NO -> 0,
                # so accumulate its score directly instead of re-deriving it.
                score, _ = self.analyze_chunk(chunk, info)
                identified_triggers[category] = identified_triggers.get(category, 0) + score

        # Require more than a single MAYBE (0.5) across all chunks before reporting a category.
        final_triggers = [category for category, count in identified_triggers.items() if count > 0.5]
        self.cleanup()

        return final_triggers if final_triggers else ["None"]

def get_detailed_analysis(script):
    analyzer = ContentAnalyzer()
    print("\n=== Starting Detailed Analysis ===")
    triggers = analyzer.analyze_text(script)

    if isinstance(triggers, dict) and "error" in triggers:
        # Model loading failed; propagate the error instead of reporting a clean result.
        return triggers

    if triggers != ["None"]:
        result = {
            "detected_triggers": triggers,
            "confidence": "High - Content detected",
            "model": "Llama-3.2-1B",
            "analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }
    else:
        result = {
            "detected_triggers": ["None"],
            "confidence": "High - No concerning content detected",
            "model": "Llama-3.2-1B",
            "analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }

    print("\nFinal Result Dictionary:", result)
    return result
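
# Minimal usage sketch. Assumes the meta-llama/Llama-3.2-1B weights are
# available locally or via an authenticated Hugging Face login; the sample
# text below is purely illustrative.
if __name__ == "__main__":
    sample_script = (
        "The soldiers stormed the village at dawn. Gunfire echoed through "
        "the streets as families fled their homes."
    )
    analysis = get_detailed_analysis(sample_script)
    print("Detected triggers:", analysis.get("detected_triggers"))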