Kuberwastaken committed on
Commit 3b069b9 · 1 Parent(s): 9ee7507

First Attempt to shorten the model and make it workable on Spaces

Files changed (1)
  1. model/model.py +147 -190
model/model.py CHANGED
@@ -1,223 +1,180 @@
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 from datetime import datetime

-def analyze_script(script):
-    # Starting the script analysis
-    print("\n=== Starting Analysis ===")
-    print(f"Time: {datetime.now()}") # Outputting the current timestamp
-    print("Loading model and tokenizer...")
-
-    try:
-        # Load the tokenizer and model, selecting the appropriate device (CPU or CUDA)
-        tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B", use_fast=True)
-        device = "cuda" if torch.cuda.is_available() else "cpu" # Use CUDA if available, else use CPU
-        print(f"Using device: {device}")

-        model = AutoModelForCausalLM.from_pretrained(
-            "meta-llama/Llama-3.2-1B",
-            torch_dtype=torch.float16 if device == "cuda" else torch.float32, # Use 16-bit precision for CUDA, 32-bit for CPU
-            device_map="auto" # Automatically map model to available device
-        )
-        print("Model loaded successfully")

-        # Define trigger categories with their descriptions
         trigger_categories = {
             "Violence": {
                 "mapped_name": "Violence",
-                "description": (
-                    "Any act involving physical force or aggression intended to cause harm, injury, or death to a person, animal, or object. "
-                    "Includes direct physical confrontations (e.g., fights, beatings, or assaults), implied violence (e.g., very graphical threats or descriptions of injuries), "
-                    "or large-scale events like wars, riots, or violent protests."
-                )
             },
             "Death": {
                 "mapped_name": "Death References",
-                "description": (
-                    "Any mention, implication, or depiction of the loss of life, including direct deaths of characters, including mentions of deceased individuals, "
-                    "or abstract references to mortality (e.g., 'facing the end' or 'gone forever'). This also covers depictions of funerals, mourning, "
-                    "grieving, or any dialogue that centers around death, do not take metaphors into context that don't actually lead to death."
-                )
             },
-            "Substance Use": {
                 "mapped_name": "Substance Use",
-                "description": (
-                    "Any explicit or implied reference to the consumption, misuse, or abuse of drugs, alcohol, or other intoxicating substances. "
-                    "Includes scenes of drinking, smoking, or drug use, whether recreational or addictive. May also cover references to withdrawal symptoms, "
-                    "rehabilitation, or substance-related paraphernalia (e.g., needles, bottles, pipes)."
-                )
             },
             "Gore": {
                 "mapped_name": "Gore",
-                "description": (
-                    "Extremely detailed and graphic depictions of highly severe physical injuries, mutilation, or extreme bodily harm, often accompanied by descriptions of heavy blood, exposed organs, "
-                    "or dismemberment. This includes war scenes with severe casualties, horror scenarios involving grotesque creatures, or medical procedures depicted with excessive detail."
-                    "only answer yes if you're completely certain."
-                )
             },
-            "Vomit": {
-                "mapped_name": "Vomit",
-                "description": (
-                    "Any explicit reference to vomiting, whether directly described, implied, or depicted. This includes detailed sounds, visual descriptions, mentions of nausea explicitly leading to vomiting, or any aftermath involving vomit."
-                    "Respond 'yes' only if the scene unambiguously and clearly involves vomiting, with no room for doubt."
-                )
-            },
-            "Sexual Content": {
                 "mapped_name": "Sexual Content",
-                "description": (
-                    "Any depiction or mention of sexual activity, intimacy, or sexual behavior, ranging from implied scenes to explicit descriptions. "
-                    "This includes romantic encounters, physical descriptions of characters in a sexual context, sexual dialogue, or references to sexual themes (e.g., harassment, innuendos)."
-                )
             },
-            "Sexual Abuse": {
-                "mapped_name": "Sexual Abuse",
-                "description": (
-                    "Any form of non-consensual sexual act, behavior, or interaction, involving coercion, manipulation, or physical force. "
-                    "This includes incidents of sexual assault, molestation, exploitation, harassment, and any acts where an individual is subjected to sexual acts against their will or without their consent. "
-                    "It also covers discussions or depictions of the aftermath of such abuse, such as trauma, emotional distress, legal proceedings, or therapy. "
-                    "References to inappropriate sexual advances, groping, or any other form of sexual misconduct are also included, as well as the psychological and emotional impact on survivors. "
-                    "Scenes where individuals are placed in sexually compromising situations, even if not directly acted upon, may also fall under this category."
-                    "only answer yes if you're completely certain of it's presence."
-                )
-            },
-            "Self-Harm": {
                 "mapped_name": "Self-Harm",
-                "description": (
-                    "Any mention or depiction of behaviors where an individual intentionally causes harm to themselves. This includes cutting, burning, or other forms of physical injury, "
-                    "as well as suicidal ideation, suicide attempts, or discussions of self-destructive thoughts and actions. References to scars, bruises, or other lasting signs of self-harm are also included."
-                    "only answer yes if you're completely certain."
-                )
             },
-            "Gun Use": {
-                "mapped_name": "Gun Use",
-                "description": (
-                    "Any explicit or implied mention of firearms being handled, fired, or used in a threatening manner. This includes scenes of gun violence, references to shootings, "
-                    "gun-related accidents, or the presence of firearms in a tense or dangerous context (e.g., holstered weapons during an argument)."
-                )
-            },
-            "Animal Cruelty": {
-                "mapped_name": "Animal Cruelty",
-                "description": (
-                    "Any act of harm or abuse toward animals, whether intentional or accidental. This includes physical abuse (e.g., hitting, injuring, or killing animals), "
-                    "mental or emotional mistreatment (e.g., starvation, isolation), and scenes where animals are subjected to pain or suffering for human entertainment or experimentation."
-                    "Respond 'yes' only if the scene unambiguously and clearly involves Animal Cruelty, with no room for doubt"
-                )
-            },
-            "Mental Health Issues": {
                 "mapped_name": "Mental Health Issues",
-                "description": (
-                    "Any reference to mental health struggles, disorders, or psychological distress. This includes mentions of depression, anxiety, PTSD, bipolar disorder, schizophrenia, "
-                    "or other conditions. Scenes depicting destructive coping mechanisms are also included."
-                    "like a character expressing feelings of worthlessness, hopelessness, or detachment from reality."
-                )
             }
         }

-        print("\nProcessing text...") # Output indicating the text is being processed
-        chunk_size = 256 # Set the chunk size for text processing
-        overlap = 15 # Overlap between chunks for context preservation
-        script_chunks = [] # List to store script chunks
-
-        # Split the script into smaller chunks
-        for i in range(0, len(script), chunk_size - overlap):
-            chunk = script[i:i + chunk_size]
-            script_chunks.append(chunk)
-
-        print(f"Split into {len(script_chunks)} chunks with {overlap} token overlap") # Inform about the chunking
-
-        identified_triggers = {} # Dictionary to store the identified triggers
-
-        # Process each chunk of the script
-        for chunk_idx, chunk in enumerate(script_chunks, 1):
-            print(f"\n--- Processing Chunk {chunk_idx}/{len(script_chunks)} ---")
-            print(f"Chunk text (preview): {chunk[:50]}...") # Preview of the current chunk

-            # Check each category for triggers
             for category, info in trigger_categories.items():
-                mapped_name = info["mapped_name"]
-                description = info["description"]
-
-                print(f"\nAnalyzing for {mapped_name}...")
-                prompt = f"""
-                Check this text for any indication of {mapped_name} ({description}).
-                Be sensitive to subtle references or implications, make sure the text is not metaphorical.
-                Respond concisely with: YES, NO, or MAYBE.
-                Text: {chunk}
-                Answer:
-                """
-
-                print(f"Sending prompt to model...") # Indicate that prompt is being sent to the model
-                inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512) # Tokenize the prompt
-                inputs = {k: v.to(device) for k, v in inputs.items()} # Send inputs to the chosen device
-
-                with torch.no_grad(): # Disable gradient calculation for inference
-                    print("Generating response...") # Indicate that the model is generating a response
-                    outputs = model.generate(
-                        **inputs,
-                        max_new_tokens=10, # Limit response length
-                        do_sample=True, # Enable sampling for more diverse output
-                        temperature=0.5, # Control randomness of the output
-                        top_p=0.9, # Use nucleus sampling
-                        pad_token_id=tokenizer.eos_token_id # Pad token ID
-                    )

-                response_text = tokenizer.decode(outputs[0], skip_special_tokens=True).strip().upper() # Decode and format the response
-                first_word = response_text.split("\n")[-1].split()[0] if response_text else "NO" # Get the first word of the response
-                print(f"Model response for {mapped_name}: {first_word}")
-
-                # Update identified triggers based on model response
-                if first_word == "YES":
-                    print(f"Detected {mapped_name} in this chunk!") # Trigger detected
-                    identified_triggers[mapped_name] = identified_triggers.get(mapped_name, 0) + 1
-                elif first_word == "MAYBE":
-                    print(f"Possible {mapped_name} detected, marking for further review.") # Possible trigger detected
-                    identified_triggers[mapped_name] = identified_triggers.get(mapped_name, 0) + 0.5
-                else:
-                    print(f"No {mapped_name} detected in this chunk.") # No trigger detected
-
-        print("\n=== Analysis Complete ===") # Indicate that analysis is complete
-        print("Final Results:")
-        final_triggers = [] # List to store final triggers
-
-        # Filter and output the final trigger results
-        for mapped_name, count in identified_triggers.items():
-            if count > 0.5:
-                final_triggers.append(mapped_name)
-                print(f"- {mapped_name}: found in {count} chunks")
-
-        if not final_triggers:
-            print("No triggers detected") # No triggers detected
-            final_triggers = ["None"]

-        print("\nReturning results...")
-        return final_triggers # Return the list of detected triggers

-    except Exception as e:
-        # Handle errors and provide stack trace
-        print(f"\nERROR OCCURRED: {str(e)}")
-        print("Stack trace:")
-        import traceback
-        traceback.print_exc()
-        return {"error": str(e)}

 def get_detailed_analysis(script):
-    print("\n=== Starting Detailed Analysis ===")
-    triggers = analyze_script(script) # Call the analyze_script function
-
-    if isinstance(triggers, list) and triggers != ["None"]:
-        result = {
-            "detected_triggers": triggers,
-            "confidence": "High - Content detected",
-            "model": "Llama-3.2-1B",
-            "analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        }
-    else:
-        result = {
-            "detected_triggers": ["None"],
-            "confidence": "High - No concerning content detected",
-            "model": "Llama-3.2-1B",
-            "analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        }
-
-    print("\nFinal Result Dictionary:", result) # Output the final result dictionary
-    return result
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 from datetime import datetime
+import gc
+
+class ContentAnalyzer:
+    def __init__(self):
+        self.model_name = "meta-llama/Llama-3.2-1B"
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.tokenizer = None
+        self.model = None
+
+    def load_model(self):
+        """Load model with memory optimization"""
+        try:
+            print("Loading tokenizer...")
+            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)
+
+            print(f"Loading model on {self.device}...")
+            self.model = AutoModelForCausalLM.from_pretrained(
+                self.model_name,
+                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
+                low_cpu_mem_usage=True,
+                device_map="auto"
+            )
+            return True
+        except Exception as e:
+            print(f"Model loading error: {str(e)}")
+            return False
+
+    def cleanup(self):
+        """Clean up GPU memory"""
+        if self.device == "cuda":
+            torch.cuda.empty_cache()
+            gc.collect()
+
+    def analyze_chunk(self, chunk, category_info):
+        """Analyze a single chunk of text for a specific trigger"""
+        mapped_name = category_info["mapped_name"]
+        description = category_info["description"]
+
+        prompt = f"""Check this text for any indication of {mapped_name} ({description}).
+        Be sensitive to subtle references or implications, make sure the text is not metaphorical.
+        Respond concisely with: YES, NO, or MAYBE.
+        Text: {chunk}
+        Answer:"""
+
+        try:
+            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
+            inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    **inputs,
+                    max_new_tokens=10,
+                    do_sample=True,
+                    temperature=0.5,
+                    top_p=0.9,
+                    pad_token_id=self.tokenizer.eos_token_id
+                )

+            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True).strip().upper()
+            first_word = response.split("\n")[-1].split()[0] if response else "NO"
+
+            score = 1 if first_word == "YES" else 0.5 if first_word == "MAYBE" else 0
+            return score, first_word
+
+        except Exception as e:
+            print(f"Chunk analysis error: {str(e)}")
+            return 0, "NO"

+    def analyze_text(self, text):
+        """Main analysis function"""
+        if not self.load_model():
+            return {"error": "Model loading failed"}

+        # Original trigger categories
         trigger_categories = {
             "Violence": {
                 "mapped_name": "Violence",
+                "description": "Any act involving physical force or aggression intended to cause harm, injury, or death."
             },
             "Death": {
                 "mapped_name": "Death References",
+                "description": "Any mention, implication, or depiction of the loss of life, including direct deaths or abstract references to mortality."
             },
+            "Substance_Use": {
                 "mapped_name": "Substance Use",
+                "description": "References to consumption, misuse, or abuse of drugs, alcohol, or other intoxicating substances."
             },
             "Gore": {
                 "mapped_name": "Gore",
+                "description": "Graphic depictions of severe physical injuries, mutilation, or extreme bodily harm."
             },
+            "Sexual_Content": {
                 "mapped_name": "Sexual Content",
+                "description": "Depictions or mentions of sexual activity, intimacy, or sexual behavior."
             },
+            "Self_Harm": {
                 "mapped_name": "Self-Harm",
+                "description": "Behaviors where an individual intentionally causes harm to themselves."
             },
+            "Mental_Health": {
                 "mapped_name": "Mental Health Issues",
+                "description": "References to mental health struggles, disorders, or psychological distress."
             }
         }

+        try:
+            # Optimize chunk processing
+            chunk_size = 200 # Reduced chunk size for better memory management
+            overlap = 10
+            chunks = []
+
+            # Create chunks with overlap
+            for i in range(0, len(text), chunk_size - overlap):
+                chunk = text[i:i + chunk_size]
+                chunks.append(chunk)

+            trigger_scores = {}
+            trigger_occurrences = {}
+
+            # Initialize tracking dictionaries
             for category, info in trigger_categories.items():
+                trigger_scores[info["mapped_name"]] = 0
+                trigger_occurrences[info["mapped_name"]] = []
+
+            # Process all chunks for all categories
+            for chunk_idx, chunk in enumerate(chunks):
+                print(f"\nProcessing chunk {chunk_idx + 1}/{len(chunks)}")
+                chunk_triggers = {}

+                for category, info in trigger_categories.items():
+                    score, response = self.analyze_chunk(chunk, info)
+
+                    if score > 0:
+                        mapped_name = info["mapped_name"]
+                        trigger_scores[mapped_name] += score
+                        trigger_occurrences[mapped_name].append({
+                            'chunk_idx': chunk_idx,
+                            'response': response,
+                            'score': score
+                        })
+                        print(f"Found {mapped_name} in chunk {chunk_idx + 1} (Response: {response})")
+
+                # Cleanup after processing each chunk
+                if self.device == "cuda":
+                    self.cleanup()
+
+            # Collect all triggers that meet the threshold
+            detected_triggers = []
+            for name, score in trigger_scores.items():
+                if score >= 0.5: # Threshold for considering a trigger as detected
+                    occurrences = len(trigger_occurrences[name])
+                    detected_triggers.append(name)
+                    print(f"\nTrigger '{name}' detected in {occurrences} chunks with total score {score}")
+
+            result = {
+                "detected_triggers": detected_triggers if detected_triggers else ["None"],
+                "confidence": "High - Content detected" if detected_triggers else "High - No concerning content detected",
+                "model": self.model_name,
+                "analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                "trigger_details": {
+                    name: {
+                        "total_score": trigger_scores[name],
+                        "occurrences": trigger_occurrences[name]
+                    } for name in detected_triggers if name != "None"
+                }
+            }

+            return result

+        except Exception as e:
+            return {"error": str(e)}
+        finally:
+            self.cleanup()

 def get_detailed_analysis(script):
+    analyzer = ContentAnalyzer()
+    return analyzer.analyze_text(script)
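
For reference, a minimal usage sketch of the refactored module (not part of the commit). It assumes the file is importable as model.model and that access to the gated meta-llama/Llama-3.2-1B checkpoint is already configured; the sample text and variable names below are illustrative only.

# Hypothetical usage sketch, not part of this commit.
# Assumes model/model.py is importable as model.model and that the gated
# meta-llama/Llama-3.2-1B weights are accessible in the environment.
from model.model import ContentAnalyzer, get_detailed_analysis

sample_script = (
    "INT. HOSPITAL ROOM - NIGHT. The heart monitor flatlines as the doctor "
    "quietly pulls the sheet over the patient."
)

# Module-level wrapper: builds a ContentAnalyzer and runs the full pipeline.
result = get_detailed_analysis(sample_script)
if "error" not in result:
    print(result["detected_triggers"])   # e.g. ["Death References"] if the model flags it
    print(result["trigger_details"])     # per-chunk scores and raw YES/MAYBE responses

# Equivalent explicit form using the class directly.
analyzer = ContentAnalyzer()
result = analyzer.analyze_text(sample_script)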