Kuberwastaken committed on
Commit b826a4f · 1 Parent(s): cba901f

v1.0 - Stable

__pycache__/script_search_api.cpython-310.pyc ADDED
Binary file (8.43 kB)
 
model/__pycache__/analyzer.cpython-310.pyc ADDED
Binary file (7.51 kB)
 
model/analyzer.py CHANGED
@@ -24,10 +24,10 @@ class ContentAnalyzer:
         ]
         self.pattern = re.compile(r'\b(' + '|'.join(self.categories) + r')\b', re.IGNORECASE)
         logger.info(f"Initialized analyzer with device: {self.device}")
-        self._load_model() # Load model during initialization
+        self._load_model()

     def _load_model(self) -> None:
-        """Load model and tokenizer synchronously during initialization"""
+        """Load model and tokenizer with CPU optimization"""
         try:
             logger.info("Loading model components...")
             self.tokenizer = AutoTokenizer.from_pretrained(
@@ -45,95 +45,193 @@ class ContentAnalyzer:
             logger.error(f"Model loading failed: {str(e)}")
             raise

-    def _chunk_text(self, text: str, chunk_size: int = 1024) -> List[str]:
-        """Optimized chunking using paragraph boundaries"""
-        paragraphs = text.split('\n\n')
+    def _chunk_text(self, text: str, max_tokens: int = 512) -> List[str]:
+        """Context-aware chunking with token counting"""
+        paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
         chunks = []
-        current_chunk = ""
+        current_chunk = []
+        current_length = 0

         for para in paragraphs:
-            if len(current_chunk) + len(para) < chunk_size:
-                current_chunk += para + "\n\n"
+            para_tokens = self.tokenizer.encode(para, add_special_tokens=False)
+            para_length = len(para_tokens)
+
+            if current_length + para_length > max_tokens and current_chunk:
+                chunk_text = "\n\n".join(current_chunk)
+                chunks.append(chunk_text)
+                current_chunk = [para]
+                current_length = para_length
             else:
-                if current_chunk:
-                    chunks.append(current_chunk.strip())
-                current_chunk = para + "\n\n"
+                current_chunk.append(para)
+                current_length += para_length

         if current_chunk:
-            chunks.append(current_chunk.strip())
+            chunk_text = "\n\n".join(current_chunk)
+            chunks.append(chunk_text)

-        logger.info(f"Split text into {len(chunks)} chunks")
+        logger.info(f"Split text into {len(chunks)} chunks (max_tokens={max_tokens})")
         return chunks

-    async def _analyze_chunk(self, chunk: str) -> List[str]:
-        """Optimized chunk analysis with structured prompt"""
-        prompt = f"""You are a highly specialized content analysis AI, Analyze this text for sensitive content from: {', '.join(self.categories)}.
-        Respond with categories in format: [CATEGORIES]:
+    async def _analyze_chunk(self, chunk: str) -> tuple[List[str], str]:
+        """Deep analysis with step-by-step reasoning"""
+        prompt = f"""As a deep-thinking content analyzer, carefully evaluate this text for sensitive content.
+        Input text: {chunk}

-        Text: {chunk[:2000]}
-        [CATEGORIES]: """
-
-        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(self.device)
-
-        with torch.no_grad():
-            outputs = self.model.generate(
-                **inputs,
-                max_new_tokens=50,
-                do_sample=False,
-                pad_token_id=self.tokenizer.eos_token_id
-            )
-
-        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-        return [m.capitalize() for m in self.pattern.findall(response)]
+        Think through each step:
+        1. What is happening in the text?
+        2. What potentially sensitive themes or elements are present?
+        3. For each category below, is there clear evidence?

-    async def analyze_script(self, script: str, progress: Optional[gr.Progress] = None) -> List[str]:
-        """Main analysis method with progress support"""
+        Categories: {", ".join(self.categories)}
+
+        Detailed analysis:
+        """
+
+        try:
+            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True).to(self.device)
+
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    **inputs,
+                    do_sample=True,
+                    temperature=0.7,
+                    top_p=0.9,
+                    max_length=8192
+                )
+
+            full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+            # Extract categories more reliably using multiple patterns
+            categories_found = set()
+
+            # Look for explicit category mentions
+            category_matches = self.pattern.findall(full_response.lower())
+
+            # Normalize and validate matches
+            for match in category_matches:
+                for category in self.categories:
+                    if match.lower() == category.lower():
+                        categories_found.add(category)
+
+            # Convert to list and sort for consistency
+            matched_categories = sorted(list(categories_found))
+
+            # Clean up reasoning text
+            reasoning = full_response.split("\n\nCategories found:")[0] if "\n\nCategories found:" in full_response else full_response
+            reasoning = reasoning.strip()
+
+            if not matched_categories and any(trigger_word in full_response.lower() for trigger_word in
+                                              ["concerning", "warning", "caution", "trigger", "sensitive"]):
+                logger.warning(f"Potential triggers found but no categories matched in chunk")
+
+            logger.info(f"Chunk analysis complete - Categories found: {matched_categories}")
+            return matched_categories, reasoning
+
+        except Exception as e:
+            logger.error(f"Chunk analysis error: {str(e)}")
+            return [], f"Analysis error: {str(e)}"
+
+    async def analyze_script(self, script: str, progress: Optional[gr.Progress] = None) -> tuple[List[str], List[str]]:
+        """Main analysis workflow with progress updates"""
+        if not script.strip():
+            return ["No content provided"], ["No analysis performed"]
+
         identified_triggers = set()
+        reasoning_outputs = []
         chunks = self._chunk_text(script)

+        if not chunks:
+            return ["Empty text after chunking"], ["No analysis performed"]
+
+        total_chunks = len(chunks)
+
         for idx, chunk in enumerate(chunks):
             if progress:
-                progress((idx/len(chunks), f"Analyzing chunk {idx+1}/{len(chunks)}"))
+                progress((idx/total_chunks, f"Deep analysis of chunk {idx+1}/{total_chunks}"))
+
+            chunk_triggers, chunk_reasoning = await self._analyze_chunk(chunk)
+            identified_triggers.update(chunk_triggers)
+            reasoning_outputs.append(f"Chunk {idx + 1} Analysis:\n{chunk_reasoning}")

-            triggers = await self._analyze_chunk(chunk)
-            identified_triggers.update(triggers)
+            logger.info(f"Processed chunk {idx+1}/{total_chunks}, found triggers: {chunk_triggers}")

         if progress:
             progress((1.0, "Analysis complete"))

-        return sorted(identified_triggers) if identified_triggers else ["None"]
+        final_triggers = sorted(list(identified_triggers)) if identified_triggers else ["None"]
+        logger.info(f"Final triggers identified: {final_triggers}")
+        return final_triggers, reasoning_outputs

 async def analyze_content(
     script: str,
     progress: Optional[gr.Progress] = None
 ) -> Dict[str, Union[List[str], str]]:
-    """Main analysis function for Gradio interface"""
+    """Gradio interface function with enhanced trigger detection"""
     try:
         analyzer = ContentAnalyzer()
-        triggers = await analyzer.analyze_script(script, progress)
+        triggers, reasoning_output = await analyzer.analyze_script(script, progress)

-        return {
-            "detected_triggers": triggers,
-            "confidence": "High - Content detected" if triggers != ["None"] else "High - No concerning content detected",
+        # Extract triggers from detailed analysis
+        detected_triggers = set()
+        full_reasoning = "\n\n".join(reasoning_output)
+
+        # Look for explicit category markers
+        category_markers = [
+            (r'\b(\w+):\s*\+', 1), # Matches "Category: +"
+            (r'\*\*(\w+(?:\s+\w+)?):\*\*[^\n]*?\bMarked with "\+"', 1), # Matches "**Category:** ... Marked with "+"
+            (r'(\w+(?:\s+\w+)?)\s*is clearly present', 1), # Matches "Category is clearly present"
+        ]
+
+        for pattern, group in category_markers:
+            matches = re.finditer(pattern, full_reasoning, re.IGNORECASE)
+            for match in matches:
+                category = match.group(group).strip()
+                # Normalize category names to match predefined categories
+                for predefined_category in analyzer.categories:
+                    if category.lower() in predefined_category.lower():
+                        detected_triggers.add(predefined_category)
+
+        # Add any triggers found through direct pattern matching
+        for category in analyzer.categories:
+            pattern = fr'\b{re.escape(category)}\b.*?(present|evident|indicated|clear|obvious)'
+            if re.search(pattern, full_reasoning, re.IGNORECASE):
+                detected_triggers.add(category)
+
+        # If no triggers were found through detailed analysis, fall back to original triggers
+        final_triggers = sorted(list(detected_triggers)) if detected_triggers else triggers
+
+        result = {
+            "detected_triggers": final_triggers if final_triggers else ["None"],
+            "confidence": "High confidence" if final_triggers and final_triggers != ["None"] else "No triggers found",
             "model": "DeepSeek-R1-Distill-Qwen-1.5B",
-            "analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+            "analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+            "analysis_reasoning": full_reasoning
         }
+
+        logger.info(f"Enhanced analysis complete. Results: {result}")
+        return result
+
     except Exception as e:
         logger.error(f"Analysis error: {str(e)}")
         return {
-            "detected_triggers": ["Error occurred during analysis"],
+            "detected_triggers": ["Analysis error"],
             "confidence": "Error",
             "model": "DeepSeek-R1-Distill-Qwen-1.5B",
             "analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+            "analysis_reasoning": str(e),
             "error": str(e)
         }

 if __name__ == "__main__":
     iface = gr.Interface(
         fn=analyze_content,
-        inputs=gr.Textbox(lines=8, label="Input Text"),
-        outputs=gr.JSON(),
-        title="Content Sensitivity Analysis",
-        description="Analyze text content for sensitive topics using DeepSeek R1"
+        inputs=gr.Textbox(lines=12, label="Paste Script Here", placeholder="Enter text to analyze..."),
+        outputs=[
+            gr.JSON(label="Analysis Results"),
+            gr.Textbox(label="Analysis Reasoning", lines=10)
+        ],
+        title="TREAT - Trigger Analysis for Entertainment Texts",
+        description="Deep analysis of scripts for sensitive content using AI",
+        allow_flagging="never"
     )
-    iface.launch()
+    iface.launch(show_error=True)
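For context, the sketch below (not part of the commit) shows one way the updated analyze_content coroutine could be called directly, without launching the Gradio app. It assumes the file changed above is importable as model.analyzer and that the project's dependencies (transformers, torch, gradio) are installed; the sample text and prints are illustrative only.

# Hedged usage sketch, not part of this commit: call analyze_content() with asyncio
# instead of the Gradio UI and inspect the result dict added in v1.0.
import asyncio

from model.analyzer import analyze_content  # file modified in this commit

sample_script = "INT. KITCHEN - NIGHT\n\nTwo characters argue over dinner; one storms out."

# ContentAnalyzer loads DeepSeek-R1-Distill-Qwen-1.5B in __init__ via _load_model(),
# so the first call is slow; the progress argument is optional and omitted here.
result = asyncio.run(analyze_content(sample_script))

print(result["detected_triggers"])        # matched categories, or ["None"]
print(result["confidence"])
print(result["analysis_reasoning"][:300]) # reasoning field introduced by this commit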