Spaces:
Running
Running
Commit
·
b826a4f
1
Parent(s):
cba901f
v1.0 - Stable
Browse files
__pycache__/script_search_api.cpython-310.pyc
ADDED
Binary file (8.43 kB). View file
|
|
model/__pycache__/analyzer.cpython-310.pyc
ADDED
Binary file (7.51 kB). View file
|
|
model/analyzer.py
CHANGED
@@ -24,10 +24,10 @@ class ContentAnalyzer:
|
|
24 |
]
|
25 |
self.pattern = re.compile(r'\b(' + '|'.join(self.categories) + r')\b', re.IGNORECASE)
|
26 |
logger.info(f"Initialized analyzer with device: {self.device}")
|
27 |
-
self._load_model()
|
28 |
|
29 |
def _load_model(self) -> None:
|
30 |
-
"""Load model and tokenizer
|
31 |
try:
|
32 |
logger.info("Loading model components...")
|
33 |
self.tokenizer = AutoTokenizer.from_pretrained(
|
@@ -45,95 +45,193 @@ class ContentAnalyzer:
|
|
45 |
logger.error(f"Model loading failed: {str(e)}")
|
46 |
raise
|
47 |
|
48 |
-
def _chunk_text(self, text: str,
|
49 |
-
"""
|
50 |
-
paragraphs = text.split('\n\n')
|
51 |
chunks = []
|
52 |
-
current_chunk =
|
|
|
53 |
|
54 |
for para in paragraphs:
|
55 |
-
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
else:
|
58 |
-
|
59 |
-
|
60 |
-
current_chunk = para + "\n\n"
|
61 |
|
62 |
if current_chunk:
|
63 |
-
|
|
|
64 |
|
65 |
-
logger.info(f"Split text into {len(chunks)} chunks")
|
66 |
return chunks
|
67 |
|
68 |
-
async def _analyze_chunk(self, chunk: str) -> List[str]:
|
69 |
-
"""
|
70 |
-
prompt = f"""
|
71 |
-
|
72 |
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
with torch.no_grad():
|
79 |
-
outputs = self.model.generate(
|
80 |
-
**inputs,
|
81 |
-
max_new_tokens=50,
|
82 |
-
do_sample=False,
|
83 |
-
pad_token_id=self.tokenizer.eos_token_id
|
84 |
-
)
|
85 |
-
|
86 |
-
response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
|
87 |
-
return [m.capitalize() for m in self.pattern.findall(response)]
|
88 |
|
89 |
-
|
90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
identified_triggers = set()
|
|
|
92 |
chunks = self._chunk_text(script)
|
93 |
|
|
|
|
|
|
|
|
|
|
|
94 |
for idx, chunk in enumerate(chunks):
|
95 |
if progress:
|
96 |
-
progress((idx/
|
|
|
|
|
|
|
|
|
97 |
|
98 |
-
|
99 |
-
identified_triggers.update(triggers)
|
100 |
|
101 |
if progress:
|
102 |
progress((1.0, "Analysis complete"))
|
103 |
|
104 |
-
|
|
|
|
|
105 |
|
106 |
async def analyze_content(
|
107 |
script: str,
|
108 |
progress: Optional[gr.Progress] = None
|
109 |
) -> Dict[str, Union[List[str], str]]:
|
110 |
-
"""
|
111 |
try:
|
112 |
analyzer = ContentAnalyzer()
|
113 |
-
triggers = await analyzer.analyze_script(script, progress)
|
114 |
|
115 |
-
|
116 |
-
|
117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
"model": "DeepSeek-R1-Distill-Qwen-1.5B",
|
119 |
-
"analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
|
|
120 |
}
|
|
|
|
|
|
|
|
|
121 |
except Exception as e:
|
122 |
logger.error(f"Analysis error: {str(e)}")
|
123 |
return {
|
124 |
-
"detected_triggers": ["
|
125 |
"confidence": "Error",
|
126 |
"model": "DeepSeek-R1-Distill-Qwen-1.5B",
|
127 |
"analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
|
|
128 |
"error": str(e)
|
129 |
}
|
130 |
|
131 |
if __name__ == "__main__":
|
132 |
iface = gr.Interface(
|
133 |
fn=analyze_content,
|
134 |
-
inputs=gr.Textbox(lines=
|
135 |
-
outputs=
|
136 |
-
|
137 |
-
|
|
|
|
|
|
|
|
|
138 |
)
|
139 |
-
iface.launch()
|
|
|
24 |
]
|
25 |
self.pattern = re.compile(r'\b(' + '|'.join(self.categories) + r')\b', re.IGNORECASE)
|
26 |
logger.info(f"Initialized analyzer with device: {self.device}")
|
27 |
+
self._load_model()
|
28 |
|
29 |
def _load_model(self) -> None:
|
30 |
+
"""Load model and tokenizer with CPU optimization"""
|
31 |
try:
|
32 |
logger.info("Loading model components...")
|
33 |
self.tokenizer = AutoTokenizer.from_pretrained(
|
|
|
45 |
logger.error(f"Model loading failed: {str(e)}")
|
46 |
raise
|
47 |
|
48 |
+
def _chunk_text(self, text: str, max_tokens: int = 512) -> List[str]:
    """Split ``text`` into paragraph-aligned chunks of at most ``max_tokens`` tokens.

    Paragraphs (separated by a blank line) are stripped, empty ones are
    dropped, and the rest are packed greedily: a chunk is closed as soon as
    adding the next paragraph would exceed ``max_tokens``.  A single paragraph
    that is longer than ``max_tokens`` on its own is cut into consecutive
    token windows so that no chunk relies on downstream truncation.

    Args:
        text: Raw text to split.
        max_tokens: Upper bound on the tokenizer token count per chunk.

    Returns:
        The chunks in document order; an empty list when ``text`` contains
        no non-blank paragraph.

    Raises:
        ValueError: If ``max_tokens`` is not positive.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")

    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
    chunks = []
    current_chunk = []
    current_length = 0

    for para in paragraphs:
        para_tokens = self.tokenizer.encode(para, add_special_tokens=False)
        para_length = len(para_tokens)

        if para_length > max_tokens:
            # Close the pending chunk first so document order is preserved.
            if current_chunk:
                chunks.append("\n\n".join(current_chunk))
                current_chunk = []
                current_length = 0

            # Cut the oversize paragraph into token windows.  Decoding a
            # token slice may not reproduce the source text byte-for-byte at
            # the cut points, which is acceptable for analysis input.
            for start in range(0, para_length, max_tokens):
                window = para_tokens[start:start + max_tokens]
                piece = self.tokenizer.decode(window, skip_special_tokens=True).strip()
                if piece:
                    chunks.append(piece)
            continue

        if current_length + para_length > max_tokens and current_chunk:
            chunks.append("\n\n".join(current_chunk))
            current_chunk = [para]
            current_length = para_length
        else:
            current_chunk.append(para)
            current_length += para_length

    if current_chunk:
        chunks.append("\n\n".join(current_chunk))

    logger.info(f"Split text into {len(chunks)} chunks (max_tokens={max_tokens})")
    return chunks
|
74 |
|
75 |
+
async def _analyze_chunk(self, chunk: str) -> tuple[List[str], str]:
    """Ask the model to reason about one chunk and extract the categories it names.

    Args:
        chunk: Text of a single chunk to analyse.

    Returns:
        ``(matched_categories, reasoning)`` where ``matched_categories`` is a
        sorted list of entries from ``self.categories`` that appear in the
        model's generated text, and ``reasoning`` is that generated text
        (cut at a ``"Categories found:"`` marker when present).  On any
        failure the result is ``([], "Analysis error: ...")``.
    """
    prompt = (
        "As a deep-thinking content analyzer, carefully evaluate this text for sensitive content.\n"
        f"Input text: {chunk}\n"
        "\n"
        "Think through each step:\n"
        "1. What is happening in the text?\n"
        "2. What potentially sensitive themes or elements are present?\n"
        "3. For each category below, is there clear evidence?\n"
        "\n"
        f"Categories: {', '.join(self.categories)}\n"
        "\n"
        "Detailed analysis:\n"
    )

    try:
        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True).to(self.device)
        prompt_length = inputs["input_ids"].shape[-1]

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                max_length=8192
            )

        # The prompt itself lists every category, so matching against the
        # prompt would report all of them for every chunk.  Decode only the
        # tokens produced after the prompt.
        # NOTE(review): assumes self.model is a decoder-only causal LM whose
        # generate() output begins with the input ids - confirm in _load_model.
        generated_ids = outputs[0][prompt_length:]
        full_response = self.tokenizer.decode(generated_ids, skip_special_tokens=True)

        # Map case-insensitive regex hits back to the canonical category names.
        canonical = {category.lower(): category for category in self.categories}
        categories_found = {
            canonical[match.lower()]
            for match in self.pattern.findall(full_response)
            if match.lower() in canonical
        }
        matched_categories = sorted(categories_found)

        # Keep only the reasoning part when the model appended a summary.
        reasoning = full_response.split("\n\nCategories found:")[0].strip()

        hint_words = ("concerning", "warning", "caution", "trigger", "sensitive")
        lowered_response = full_response.lower()
        if not matched_categories and any(word in lowered_response for word in hint_words):
            logger.warning("Potential triggers found but no categories matched in chunk")

        logger.info(f"Chunk analysis complete - Categories found: {matched_categories}")
        return matched_categories, reasoning

    except Exception as e:
        # Best effort per chunk: one failing chunk must not abort the whole script.
        logger.error(f"Chunk analysis error: {str(e)}")
        return [], f"Analysis error: {str(e)}"
|
133 |
+
|
134 |
+
async def analyze_script(self, script: str, progress: Optional[gr.Progress] = None) -> tuple[List[str], List[str]]:
    """Analyse a whole script chunk by chunk and merge the results.

    Args:
        script: Full text to analyse.
        progress: Optional Gradio progress tracker; when given it is updated
            before each chunk and once at the end.

    Returns:
        ``(triggers, reasoning_outputs)``.  ``triggers`` is the sorted union
        of the categories found in all chunks, or ``["None"]`` when nothing
        was found.  ``reasoning_outputs`` holds one labelled reasoning string
        per chunk.  For blank input, or input that yields no chunks, both
        lists carry a single explanatory placeholder instead.
    """
    if not script.strip():
        return ["No content provided"], ["No analysis performed"]

    identified_triggers = set()
    reasoning_outputs = []
    chunks = self._chunk_text(script)

    if not chunks:
        return ["Empty text after chunking"], ["No analysis performed"]

    total_chunks = len(chunks)

    for idx, chunk in enumerate(chunks):
        if progress:
            # gr.Progress takes the fraction and the description as separate
            # arguments; a single tuple is interpreted as (index, total).
            progress(idx / total_chunks, desc=f"Deep analysis of chunk {idx+1}/{total_chunks}")

        chunk_triggers, chunk_reasoning = await self._analyze_chunk(chunk)
        identified_triggers.update(chunk_triggers)
        reasoning_outputs.append(f"Chunk {idx + 1} Analysis:\n{chunk_reasoning}")

        logger.info(f"Processed chunk {idx+1}/{total_chunks}, found triggers: {chunk_triggers}")

    if progress:
        progress(1.0, desc="Analysis complete")

    final_triggers = sorted(identified_triggers) if identified_triggers else ["None"]
    logger.info(f"Final triggers identified: {final_triggers}")
    return final_triggers, reasoning_outputs
|
164 |
|
165 |
async def analyze_content(
    script: str,
    progress: Optional[gr.Progress] = None
) -> Dict[str, Union[List[str], str]]:
    """Run the analyzer on ``script`` and package the result for the UI.

    The per-chunk reasoning text is scanned for explicit category markers and
    for "<category> ... present/evident/..." phrases; if neither yields
    anything, the triggers reported by ``ContentAnalyzer.analyze_script`` are
    used instead.

    Args:
        script: Text to analyse.
        progress: Optional Gradio progress tracker, forwarded to the analyzer.

    Returns:
        A dict with ``detected_triggers``, ``confidence``, ``model``,
        ``analysis_timestamp`` and ``analysis_reasoning``.  When anything
        raises, the dict additionally carries ``error`` and ``confidence`` is
        ``"Error"``; the exception is logged, not propagated.
    """
    # Placeholder entries analyze_script returns when there is nothing to
    # report; they must not be presented as confidently detected triggers.
    no_trigger_markers = {"None", "No content provided", "Empty text after chunking"}

    try:
        # NOTE(review): a new ContentAnalyzer is built on every call, and its
        # constructor calls _load_model(); consider reusing one instance.
        analyzer = ContentAnalyzer()
        triggers, reasoning_output = await analyzer.analyze_script(script, progress)

        # Extract triggers from detailed analysis
        detected_triggers = set()
        full_reasoning = "\n\n".join(reasoning_output)

        # Look for explicit category markers
        category_markers = [
            (r'\b(\w+):\s*\+', 1),  # Matches "Category: +"
            (r'\*\*(\w+(?:\s+\w+)?):\*\*[^\n]*?\bMarked with "\+"', 1),  # Matches "**Category:** ... Marked with "+"
            (r'(\w+(?:\s+\w+)?)\s*is clearly present', 1),  # Matches "Category is clearly present"
        ]

        for pattern, group in category_markers:
            for match in re.finditer(pattern, full_reasoning, re.IGNORECASE):
                phrase = match.group(group).strip()
                # Accept the phrase only when it appears in a predefined
                # category as whole words; a bare substring test would let
                # fragments such as "use" or "a" select unrelated categories.
                whole_phrase = re.compile(r'\b' + re.escape(phrase) + r'\b', re.IGNORECASE)
                for predefined_category in analyzer.categories:
                    if whole_phrase.search(predefined_category):
                        detected_triggers.add(predefined_category)

        # Add any triggers found through direct pattern matching
        for category in analyzer.categories:
            pattern = fr'\b{re.escape(category)}\b.*?(present|evident|indicated|clear|obvious)'
            if re.search(pattern, full_reasoning, re.IGNORECASE):
                detected_triggers.add(category)

        # If no triggers were found through detailed analysis, fall back to original triggers
        final_triggers = sorted(detected_triggers) if detected_triggers else triggers
        has_triggers = any(trigger not in no_trigger_markers for trigger in final_triggers)

        result = {
            "detected_triggers": final_triggers if final_triggers else ["None"],
            "confidence": "High confidence" if has_triggers else "No triggers found",
            "model": "DeepSeek-R1-Distill-Qwen-1.5B",
            "analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "analysis_reasoning": full_reasoning
        }

        logger.info(f"Enhanced analysis complete. Results: {result}")
        return result

    except Exception as e:
        # Top-level UI boundary: report the failure in the payload instead of raising.
        logger.error(f"Analysis error: {str(e)}")
        return {
            "detected_triggers": ["Analysis error"],
            "confidence": "Error",
            "model": "DeepSeek-R1-Distill-Qwen-1.5B",
            "analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "analysis_reasoning": str(e),
            "error": str(e)
        }
|
224 |
|
225 |
if __name__ == "__main__":
    async def _analyze_for_interface(script: str):
        """Adapt analyze_content's single dict to the two output components.

        The interface below declares a JSON panel and a reasoning textbox, so
        the callback has to return one value per component.
        """
        result = await analyze_content(script)
        return result, result.get("analysis_reasoning", "")

    iface = gr.Interface(
        fn=_analyze_for_interface,
        inputs=gr.Textbox(lines=12, label="Paste Script Here", placeholder="Enter text to analyze..."),
        outputs=[
            gr.JSON(label="Analysis Results"),
            gr.Textbox(label="Analysis Reasoning", lines=10)
        ],
        title="TREAT - Trigger Analysis for Entertainment Texts",
        description="Deep analysis of scripts for sensitive content using AI",
        allow_flagging="never"
    )
    # show_error=True surfaces callback exceptions in the browser UI.
    iface.launch(show_error=True)
|