Update app.py
app.py
CHANGED
@@ -1,6 +1,6 @@
-# app.py —
+# app.py — resilient ASR/OCR with Diagnostics (no NLTK)
 from __future__ import annotations
-import os, re, json, time, glob, uuid, shutil, subprocess, urllib.parse
+import os, re, json, time, glob, uuid, shutil, subprocess, urllib.parse, io
 from typing import List, Dict, Optional
 from datetime import datetime, timezone
 
@@ -9,17 +9,11 @@ import pandas as pd
 import requests
 import gradio as gr
 
-# ----------
-def now_iso()
-
-
-def normalize_ws(s: str) -> str:
-    return re.sub(r"\s+", " ", s or "").strip()
-
-def sent_tokenize_fallback(txt: str) -> List[str]:
-    # NLTK-free sentence splitter
+# ---------- small helpers ----------
+def now_iso(): return datetime.now(timezone.utc).isoformat()
+def normalize_ws(s: str) -> str: return re.sub(r"\s+", " ", s or "").strip()
+def sent_tokenize(txt: str) -> List[str]:
     return [s.strip() for s in re.split(r'(?<=[.!?])\s+|\n+', txt or '') if s.strip()]
-
 def domain_from_url(url: str) -> str:
     try: return urllib.parse.urlparse(url).netloc.lower()
     except Exception: return ""
@@ -36,66 +30,63 @@ DEFAULT_ALLOWLIST = [
 "nature.com","sciencemag.org","thelancet.com","nejm.org",
 ]
 
-
-
-
-    try:
-        return __import__(name)
-    except Exception:
-        return None
-
-duckduckgo_search = _try_import("duckduckgo_search")
-trafilatura = _try_import("trafilatura")
-rank_bm25 = _try_import("rank_bm25")
-sentence_transformers = _try_import("sentence_transformers")
-transformers = _try_import("transformers")
-torch = _try_import("torch")
+# ---------- guarded imports ----------
+def _try(name):
+    try: return __import__(name)
+    except Exception: return None
 
-
-
-
-
-#
-_openai = _try_import("openai")
-_has_openai_key = bool(os.environ.get("OPENAI_API_KEY"))
+duckduckgo_search = _try("duckduckgo_search")
+trafilatura = _try("trafilatura")
+rank_bm25 = _try("rank_bm25")
+sentence_transformers = _try("sentence_transformers")
+transformers = _try("transformers")
 
-# ---- ASR guarded imports
+# ASR backends
 try:
     from faster_whisper import WhisperModel as FWWhisperModel
 except Exception:
     FWWhisperModel = None
-
 try:
     import whisper as OpenAIWhisper
 except Exception:
     OpenAIWhisper = None
 
-#
+# OCR backends
 try:
     import easyocr as _easyocr
 except Exception:
     _easyocr = None
-
 try:
     import pytesseract as _pyt
 except Exception:
     _pyt = None
-
 try:
     import cv2
 except Exception:
     cv2 = None
 
+# ---------- env probes ----------
+def ffmpeg_available() -> bool:
+    return bool(shutil.which("ffmpeg"))
 
+def gpu_available() -> bool:
+    return bool(shutil.which("nvidia-smi"))
 
+def asr_backends():
+    b = []
+    if FWWhisperModel: b.append("faster-whisper")
+    if OpenAIWhisper: b.append("openai-whisper")
+    return b
 
+def ocr_backends():
+    b = []
+    if _easyocr and cv2: b.append("easyocr")
+    if _pyt and shutil.which("tesseract"): b.append("tesseract")
+    return b
+
-def
-
-
+# ---------- text chunking ----------
 def split_into_chunks(text: str, max_chars: int = 700) -> List[str]:
-    sents = [normalize_ws(s) for s in
+    sents = [normalize_ws(s) for s in sent_tokenize(text or "")]
     chunks, cur = [], ""
     for s in sents:
         if len(cur) + 1 + len(s) > max_chars and cur:
@@ -107,7 +98,6 @@ def split_into_chunks(text: str, max_chars: int = 700) -> List[str]:
 
 # ---------- Wikipedia ----------
 WIKI_API = "https://en.wikipedia.org/w/api.php"
-
 def wiki_search(query: str, n: int = 6) -> List[Dict]:
     r = requests.get(WIKI_API, params={"action":"query","list":"search","srsearch":query,"srlimit":n,"format":"json"},
                      headers=HEADERS, timeout=20)
@@ -162,8 +152,7 @@ def build_wiki_corpus(claim: str, max_pages: int = 6, chunk_chars: int = 600) ->
 
 # ---------- Web retrieval ----------
 def ddg_search(query: str, max_results: int = 10, allowlist: Optional[List[str]] = None) -> List[Dict]:
-    if duckduckgo_search is None:
-        return []
+    if duckduckgo_search is None: return []
     DDGS = duckduckgo_search.DDGS
     allowlist = allowlist or DEFAULT_ALLOWLIST
     out = []
@@ -175,9 +164,8 @@ def ddg_search(query: str, max_results: int = 10, allowlist: Optional[List[str]]
     return out
 
 def fetch_clean_text(url: str) -> str:
-    if trafilatura is None:
+    if trafilatura is None:
         try:
-            # last-chance plain GET (messy but better than nothing)
             r = requests.get(url, headers=HEADERS, timeout=12); r.raise_for_status()
             txt = re.sub(r"<[^>]+>", " ", r.text)
             return normalize_ws(txt)[:8000]
@@ -206,11 +194,11 @@ def build_web_corpus(claim: str, allowlist: Optional[List[str]] = None, per_quer
         for j, ch in enumerate(split_into_chunks(text, max_chars=chunk_chars)):
             corpus.append({"id": f"web-{hash(url)}-{j}", "source":"web", "title": h["title"] or domain_from_url(url),
                            "url": url, "published": now_iso(), "text": ch})
-        time.sleep(0.6)
+        time.sleep(0.6)
         if len(corpus) >= per_query_results * 4: break
     return list({d["id"]: d for d in corpus}.values())
 
-# ---------- retrieval
+# ---------- retrieval ----------
 def tokenize_simple(text: str) -> List[str]:
     text = re.sub(r"[^a-z0-9\s]", " ", (text or "").lower())
     return [w for w in text.split() if w and w not in {"the","a","an","and","or","of","to","in","for","on","with"}]
@@ -222,32 +210,25 @@ def rrf_merge(orderings: List[List[str]], k: int = 60) -> List[str]:
         scores[doc_id] = scores.get(doc_id, 0.0) + 1.0/(k + r)
     return [doc for doc,_ in sorted(scores.items(), key=lambda x: -x[1])]
 
-# try to load BM25
 BM25Okapi = getattr(rank_bm25, "BM25Okapi", None) if rank_bm25 else None
 
-
-_emb_model = None
+_emb_model, st_util = None, None
 if sentence_transformers:
     try:
         _emb_model = sentence_transformers.SentenceTransformer("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")
         from sentence_transformers import util as st_util
     except Exception:
-        _emb_model = None
-        st_util = None
-else:
-    st_util = None
+        _emb_model, st_util = None, None
 
 def retrieve_hybrid(claim: str, docs: List[Dict], k: int = 8) -> List[Dict]:
     if not docs: return []
-    # BM25 (
-    bm25_order = []
+    # BM25 (or overlap fallback)
     if BM25Okapi:
         corpus_tokens = [tokenize_simple(d["text"]) for d in docs]
         bm25 = BM25Okapi(corpus_tokens)
         bm25_scores = bm25.get_scores(tokenize_simple(claim))
         bm25_order = [docs[i]["id"] for i in list(np.argsort(-np.array(bm25_scores)))]
     else:
-        # poor-man BM25: sort by overlap count
         q_toks = set(tokenize_simple(claim))
         overlaps = [(i, len(q_toks.intersection(set(tokenize_simple(d["text"]))))) for i, d in enumerate(docs)]
         bm25_order = [docs[i]["id"] for i,_ in sorted(overlaps, key=lambda x: -x[1])]
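For context, the fusion rule that rrf_merge applies above can be exercised on its own. The sketch below is standalone and hedged: only the scoring line and the sorted return are visible in the diff, so the enumeration loop is reconstructed, and the doc ids are made up.

from typing import List

def rrf_merge(orderings: List[List[str]], k: int = 60) -> List[str]:
    # Each ordering contributes 1/(k + rank) to a document's fused score.
    scores = {}
    for ordering in orderings:
        for r, doc_id in enumerate(ordering):
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0/(k + r)
    return [doc for doc,_ in sorted(scores.items(), key=lambda x: -x[1])]

bm25_order = ["web-1", "wiki-2", "web-3"]    # lexical ranking (hypothetical ids)
embed_order = ["wiki-2", "web-3", "web-1"]   # semantic ranking (hypothetical ids)
print(rrf_merge([bm25_order, embed_order]))  # ['wiki-2', 'web-1', 'web-3']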
@@ -270,27 +251,26 @@ def retrieve_hybrid(claim: str, docs: List[Dict], k: int = 8) -> List[Dict]:
     return [{**doc, "score": float(1/(60+i))} for i, doc in enumerate(ranked_docs[:k])]
 
 # ---------- verifier (transformers optional; heuristic fallback) ----------
-
+_nli = None
 if transformers:
     try:
         AutoModelForSequenceClassification = transformers.AutoModelForSequenceClassification
         AutoTokenizer = transformers.AutoTokenizer
-
-
-
-
+        _tok = AutoTokenizer.from_pretrained("roberta-large-mnli")
+        _mdl = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli")
+        _nli = transformers.pipeline("text-classification", model=_mdl, tokenizer=_tok,
+                                     return_all_scores=True, truncation=True, device=-1)
     except Exception:
-
+        _nli = None
 
 def verify_with_nli(claim: str, evidence: List[Dict]) -> Dict:
-
-    if _nli_pipeline:
+    if _nli:
         best_ent_id, best_ent_p = None, 0.0
         best_con_id, best_con_p = None, 0.0
         for e in evidence or []:
             prem = (e.get("text") or "").strip()
             if not prem: continue
-            outputs =
+            outputs = _nli([{"text": prem, "text_pair": claim}])
             probs = {d["label"].upper(): float(d["score"]) for d in outputs[0]}
             ent, con = probs.get("ENTAILMENT", 0.0), probs.get("CONTRADICTION", 0.0)
             if ent > best_ent_p: best_ent_id, best_ent_p = e.get("id"), ent
@@ -303,15 +283,14 @@ def verify_with_nli(claim: str, evidence: List[Dict]) -> Dict:
         elif best_con_p >= 0.60 and (best_con_p - best_ent_p) >= 0.10:
             label, used, conf, rationale = "REFUTE", [best_con_id] if best_con_id else [], best_con_p, "Top evidence contradicts the claim."
         return {"label": label, "used_evidence_ids": used, "confidence": float(conf), "rationale": rationale}
-
-    # Heuristic fallback (no transformers)
+    # heuristic fallback
     text = " ".join((e.get("text") or "")[:400].lower() for e in evidence[:6])
     k = sanitize_claim_for_search(claim).lower()
-    if any(x in text for x in ["false",
-        return {"label":
-    if any(x in text for x in ["confirmed",
-        return {"label":
-    return {"label":
+    if any(x in text for x in ["false","hoax","debunked","misinformation","no evidence","not true"]) and any(y in text for y in k.split()[:4]):
+        return {"label":"REFUTE","used_evidence_ids":[evidence[0]["id"]] if evidence else [],"confidence":0.6,"rationale":"Heuristic: refutation keywords."}
+    if any(x in text for x in ["confirmed","approved","verified","evidence shows","found that"]) and any(y in text for y in k.split()[:4]):
+        return {"label":"SUPPORT","used_evidence_ids":[evidence[0]["id"]] if evidence else [],"confidence":0.55,"rationale":"Heuristic: support keywords."}
+    return {"label":"NEI","used_evidence_ids":[],"confidence":0.4,"rationale":"Insufficient signal without NLI."}
 
 def enforce_json_schema(x: Dict) -> Dict:
     return {"label": str(x.get("label","NEI")).upper(),
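As a sanity check on the verifier wiring above, the premise/hypothesis call pattern can be reproduced standalone. A sketch assuming the transformers library and the public roberta-large-mnli checkpoint; the example sentences are placeholders, not app data.

from transformers import pipeline

nli = pipeline("text-classification", model="roberta-large-mnli",
               return_all_scores=True, truncation=True, device=-1)
# Premise = retrieved evidence text; hypothesis = the claim under test.
out = nli([{"text": "Regulators approved the drug in 2021.",
            "text_pair": "The drug was approved in 2021."}])
probs = {d["label"].upper(): float(d["score"]) for d in out[0]}
print(probs)  # keys: CONTRADICTION, NEUTRAL, ENTAILMENT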
@@ -320,10 +299,8 @@ def enforce_json_schema(x: Dict) -> Dict:
             "rationale": str(x.get("rationale","")).strip()[:300]}
 
 def filter_by_time(docs: List[Dict], t_max_iso: str) -> List[Dict]:
-    try:
-
-    except Exception:
-        tmax = datetime.now(timezone.utc)
+    try: tmax = datetime.fromisoformat(t_max_iso.replace("Z","+00:00"))
+    except Exception: tmax = datetime.now(timezone.utc)
     kept = []
     for d in docs:
         try:
@@ -333,15 +310,6 @@ def filter_by_time(docs: List[Dict], t_max_iso: str) -> List[Dict]:
             kept.append(d)
     return kept
 
-def format_evidence_block(evs: List[Dict]) -> str:
-    lines = []
-    for e in evs:
-        snippet = (e.get("text","") or "")
-        if len(snippet) > 420: snippet = snippet[:420] + "..."
-        title = e.get("title","") or e.get("source","")
-        lines.append(f"[{e['id']}] ({e.get('published','')}) {title} — {e.get('url','')}\n{snippet}")
-    return "\n\n".join(lines)
-
 def verify_claim(claim_text: str, use_web: bool = True, use_wiki: bool = True,
                  allowlist: Optional[List[str]] = None, t_claim_iso: Optional[str] = None, k: int = 8) -> Dict:
     t_claim_iso = t_claim_iso or now_iso()
@@ -368,36 +336,71 @@ def run_on_claims(claims: List[str], use_web: bool, use_wiki: bool, allowlist: L
         outs.append(verify_claim(c, use_web=use_web, use_wiki=use_wiki, allowlist=allowlist, t_claim_iso=now_iso(), k=k))
     return outs
 
-# ---------- ASR
+# ---------- ASR ----------
 def extract_audio_ffmpeg(video_path: str, out_wav: str, sr: int = 16000) -> str:
     cmd = ["ffmpeg","-y","-i",video_path,"-vn","-acodec","pcm_s16le","-ar",str(sr),"-ac","1",out_wav]
     subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
     return out_wav
 
 def run_whisper_asr(audio_path: str, model_size: str = "base", language: Optional[str] = None) -> str:
     # Prefer faster-whisper
     if FWWhisperModel is not None:
-        device = "cuda" if
+        device = "cuda" if gpu_available() else "cpu"
         compute_type = "float16" if device == "cuda" else "int8"
         model = FWWhisperModel(model_size, device=device, compute_type=compute_type)
         segments, info = model.transcribe(audio_path, language=language, vad_filter=True, beam_size=5)
         return " ".join(seg.text for seg in segments).strip()
-
-    # Fallback to OpenAI whisper (PyTorch)
+    # Fallback to OpenAI whisper
    if OpenAIWhisper is not None:
         model = OpenAIWhisper.load_model(model_size)
         result = model.transcribe(audio_path, language=language) if language else model.transcribe(audio_path)
         return (result.get("text") or "").strip()
+    # No backend
+    return ""
 
+# ---------- OCR (EasyOCR → Tesseract) ----------
+def _tess_langs(langs_csv: str) -> str:
+    map_ = {"en":"eng","ar":"ara","fr":"fra","de":"deu","es":"spa","it":"ita","pt":"por","ru":"rus","zh":"chi_sim"}
+    codes = [x.strip().lower() for x in (langs_csv or "en").split(",") if x.strip()]
+    return "+".join(map_.get(c, c) for c in codes) or "eng"
 
+def preprocess_for_ocr(img_path: str):
+    if cv2 is None: return None
+    img = cv2.imread(img_path, cv2.IMREAD_COLOR)
+    if img is None: return None
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    gray = cv2.bilateralFilter(gray, 7, 50, 50)
+    gray = cv2.equalizeHist(gray)
+    th = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                               cv2.THRESH_BINARY, 31, 9)
+    return th
 
-def
-if
+def _ocr_with_easyocr(frames: List[str], langs_csv: str, max_images: Optional[int]) -> List[str]:
+    if not (_easyocr and cv2): return []
+    try:
+        gpu = gpu_available()
+        reader = _easyocr.Reader([c.strip() for c in langs_csv.split(",") if c.strip()], gpu=gpu)
+        texts, count = [], 0
+        for fp in frames:
+            if max_images and count >= max_images: break
+            img = preprocess_for_ocr(fp)
+            if img is None:
+                count += 1;
+                continue
+            for (_bbox, txt, conf) in reader.readtext(img):
+                txt = normalize_ws(txt)
+                if txt and conf >= 0.35: texts.append(txt)
+            count += 1
+        uniq, seen = [], set()
+        for t in texts:
+            k = t.lower()
+            if k not in seen: uniq.append(t); seen.add(k)
+        return uniq
+    except Exception:
         return []
+
+def _ocr_with_tesseract(frames: List[str], langs_csv: str, max_images: Optional[int]) -> List[str]:
+    if not (_pyt and shutil.which("tesseract") and cv2): return []
     lang = _tess_langs(langs_csv)
     texts, count = [], 0
     for fp in frames:
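The _tess_langs helper added above translates the UI's comma-separated ISO codes into Tesseract's plus-joined names. A quick standalone check of that mapping (unknown codes pass through unchanged):

map_ = {"en":"eng","ar":"ara","fr":"fra","de":"deu","es":"spa","it":"ita","pt":"por","ru":"rus","zh":"chi_sim"}

def tess_langs(langs_csv: str) -> str:
    codes = [x.strip().lower() for x in (langs_csv or "en").split(",") if x.strip()]
    return "+".join(map_.get(c, c) for c in codes) or "eng"

print(tess_langs("en,ar"))  # eng+ara
print(tess_langs("zh"))     # chi_sim
print(tess_langs(""))       # eng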
@@ -407,26 +410,31 @@ def _ocr_with_tesseract(frames: list[str], langs_csv: str, max_images: int | Non
             count += 1;
             continue
         try:
             raw = _pyt.image_to_string(img, lang=lang)
         except Exception:
-
+            try:
+                raw = _pyt.image_to_string(img, lang="eng")
+            except Exception:
+                raw = ""
         for line in (raw or "").splitlines():
             line = normalize_ws(line)
-            if len(line) >= 3:
-                texts.append(line)
+            if len(line) >= 3: texts.append(line)
         count += 1
-    # dedupe
     uniq, seen = [], set()
     for t in texts:
         k = t.lower()
-        if k not in seen:
-            uniq.append(t); seen.add(k)
+        if k not in seen: uniq.append(t); seen.add(k)
     return uniq
 
+def run_ocr_on_frames(frames: List[str], languages: str = "en", max_images: Optional[int] = None) -> List[str]:
+    langs_csv = languages or "en"
+    out = _ocr_with_easyocr(frames, langs_csv, max_images)
+    if out: return out
+    out = _ocr_with_tesseract(frames, langs_csv, max_images)
+    return out
 
+# ---------- video processing ----------
 def download_video(url: str, out_dir: str = "videos") -> str:
-    # yt-dlp is installed via requirements; call binary
     os.makedirs(out_dir, exist_ok=True)
     out_tpl = os.path.join(out_dir, "%(title)s.%(ext)s")
     subprocess.run(["yt-dlp","-o",out_tpl,url], check=True)
@@ -435,66 +443,13 @@ def download_video(url: str, out_dir: str = "videos") -> str:
 
 def sample_frames_ffmpeg(video_path: str, out_dir: str = "frames", fps: float = 0.5) -> List[str]:
     os.makedirs(out_dir, exist_ok=True)
-
-
+    try:
+        subprocess.run(["ffmpeg","-y","-i",video_path,"-vf",f"fps={fps}", os.path.join(out_dir, "frame_%06d.jpg")],
+                       stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
+    except Exception:
+        return []
     return sorted(glob.glob(os.path.join(out_dir, "frame_*.jpg")))
 
-def preprocess_for_ocr(img_path: str):
-    if cv2 is None: return None
-    img = cv2.imread(img_path, cv2.IMREAD_COLOR)
-    if img is None: return None
-    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-    gray = cv2.bilateralFilter(gray, 7, 50, 50)
-    gray = cv2.equalizeHist(gray)
-    th = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
-                               cv2.THRESH_BINARY, 31, 9)
-    return th
-
-def run_ocr_on_frames(frames: list[str], languages: list[str] | str = "en", gpu: bool | None = None, max_images: int | None = None) -> list[str]:
-    # Normalize languages input
-    if isinstance(languages, list):
-        langs_csv = ",".join(languages)
-    else:
-        langs_csv = languages or "en"
-
-    # 1) Try EasyOCR
-    if _easyocr is not None and cv2 is not None:
-        try:
-            if gpu is None:
-                gpu = True if (os.environ.get("SPACE_ID") or shutil.which("nvidia-smi")) else False
-            reader = _easyocr.Reader([c.strip() for c in langs_csv.split(",") if c.strip()], gpu=gpu)
-            texts, count = [], 0
-            for fp in frames:
-                if max_images and count >= max_images: break
-                img = preprocess_for_ocr(fp)
-                if img is None:
-                    count += 1;
-                    continue
-                for (_bbox, txt, conf) in reader.readtext(img):
-                    txt = normalize_ws(txt)
-                    if txt and conf >= 0.35:
-                        texts.append(txt)
-                count += 1
-            # dedupe
-            uniq, seen = [], set()
-            for t in texts:
-                k = t.lower()
-                if k not in seen:
-                    uniq.append(t); seen.add(k)
-            if uniq:
-                return uniq
-        except Exception:
-            pass  # fall through to tesseract
-
-    # 2) Fallback: Tesseract
-    t_res = _ocr_with_tesseract(frames, langs_csv, max_images)
-    if t_res:
-        return t_res
-
-    # 3) Nothing available
-    return []
-
 def aggregate_text(asr_text: str, ocr_lines: List[str]) -> str:
     parts = []
     if asr_text: parts.append(asr_text)
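The frame sampler above shells out to ffmpeg's fps filter; the added subprocess call is equivalent to running ffmpeg -y -i input.mp4 -vf fps=0.5 frames/frame_%06d.jpg on the command line (at fps=0.5 that is one JPEG every two seconds; input.mp4 and the frames/ directory are placeholders).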
@@ -506,54 +461,72 @@ def aggregate_text(asr_text: str, ocr_lines: List[str]) -> str:
         if k and k not in seen: uniq_lines.append(line.strip()); seen.add(k)
     return "\n".join(uniq_lines)
 
+def suggest_claims(text: str, top_k: int = 10) -> List[str]:
+    sents = [re.sub(r'^[\'"“”]+|[\'"“”]+$', '', x).strip() for x in re.split(r'[.!?\n]+', text or "") if x.strip()]
+    candidates = [s for s in sents if len(s) >= 12 and re.search(r"\b(is|are|was|were|has|have|had|will|can|does|did|cause|causes|leads|led|prove|proves|confirm|confirms|predict|predicts|announce|announces|claim|claims|say|says|warn|warns|plan|plans|declare|declares|ban|bans|approve|approves)\b", s, re.I)]
+    if not candidates:
+        fallback = [s for s in sents if 8 <= len(s) <= 140]
+        scored = []
+        for s in fallback:
+            score = (1 if re.search(r'\d', s) else 0) + sum(1 for w in s.split()[:6] if w[:1].isupper())
+            scored.append((score, s))
+        candidates = [s for _, s in sorted(scored, key=lambda x: -x[0])[:top_k]]
+    return candidates[:top_k]
+
 def process_video(video_file: Optional[str] = None, video_url: Optional[str] = None,
                   whisper_model: str = "base", asr_language: Optional[str] = None,
                   ocr_langs: str = "en", fps: float = 0.5, max_ocr_images: int = 200) -> Dict:
     workdir = f"session_{uuid.uuid4().hex[:8]}"; os.makedirs(workdir, exist_ok=True)
-
-
-
+    # pick source
+    if video_url and video_url.strip():
+        vp = download_video(video_url.strip(), out_dir=workdir)
+    elif video_file and os.path.exists(video_file):
+        vp = shutil.copy(video_file, os.path.join(workdir, os.path.basename(video_file)))
+    else:
+        raise ValueError("Provide either a local video file path or a URL.")
+
+    # audio
     wav = os.path.join(workdir, "audio_16k.wav")
+    if not ffmpeg_available():
+        raise RuntimeError("ffmpeg binary not found. Ensure apt.txt includes 'ffmpeg'.")
     extract_audio_ffmpeg(vp, wav, sr=16000)
-
+
+    # ASR (never hard-fail)
+    asr_text = ""
+    try:
+        asr_text = run_whisper_asr(wav, model_size=whisper_model, language=asr_language)
+        if not asr_text:
+            asr_text = "[ASR skipped: no backend available]"
+    except Exception as e:
+        asr_text = f"[ASR skipped: {e}]"
     open(os.path.join(workdir, "transcript_asr.txt"), "w").write(asr_text)
+
+    # frames
     frames_dir = os.path.join(workdir, "frames")
     frames = sample_frames_ffmpeg(vp, out_dir=frames_dir, fps=fps)
-
-
-    ocr_lines =
+
+    # OCR (never hard-fail)
+    ocr_lines = []
+    try:
+        if frames:
+            ocr_lines = run_ocr_on_frames(frames, languages=ocr_langs, max_images=int(max_ocr_images))
+        else:
+            ocr_lines = []
+    except Exception as e:
+        ocr_lines = [f"[OCR error: {e}]"]
     if not ocr_lines:
         ocr_lines = ["[OCR skipped: no backend available]"]
-
     open(os.path.join(workdir, "transcript_ocr.txt"), "w").write("\n".join(ocr_lines))
+
+    # aggregate + suggestions
     agg = aggregate_text(asr_text, ocr_lines)
     open(os.path.join(workdir, "transcript_aggregated.txt"), "w").write(agg)
     suggestions = suggest_claims(agg, top_k=10)
+
     return {"workdir": workdir, "video_path": vp, "asr_text": asr_text, "ocr_lines": ocr_lines,
             "aggregated_text": agg, "suggested_claims": suggestions}
 
-# ----------
-CLAIM_MIN_LEN = 12
-VERB_TRIGGERS = r"\b(" + "|".join([
-    "is","are","was","were","has","have","had","will","can","does","did",
-    "cause","causes","leads","led","prove","proves","confirm","confirms",
-    "predict","predicts","announce","announces","claim","claims","say","says",
-    "warn","warns","plan","plans","declare","declares","ban","bans","approve","approves"
-]) + r")\b"
-
-def suggest_claims(text: str, top_k: int = 10) -> List[str]:
-    sents = [re.sub(r'^[\'"“”]+|[\'"“”]+$', '', x).strip() for x in re.split(r'[.!?\n]+', text or "") if x.strip()]
-    candidates = [s for s in sents if len(s) >= CLAIM_MIN_LEN and re.search(VERB_TRIGGERS, s, re.I)]
-    if not candidates:
-        fallback = [s for s in sents if 8 <= len(s) <= 140]
-        scored = []
-        for s in fallback:
-            score = (1 if re.search(r'\d', s) else 0) + sum(1 for w in s.split()[:6] if w[:1].isupper())
-            scored.append((score, s))
-        candidates = [s for _, s in sorted(scored, key=lambda x: -x[0])[:top_k]]
-    return candidates[:top_k]
-
-# ---------- Gradio theme & UI ----------
+# ---------- Gradio UI ----------
 THEME_CSS = """
 <style>
 body, .gradio-container {
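A hypothetical driver for the ingest path above, assuming app.py's functions are in scope; the URL is a placeholder, and ffmpeg plus at least one ASR/OCR backend must be installed for real output:

result = process_video(video_url="https://example.com/clip.mp4",
                       whisper_model="base", ocr_langs="en", fps=0.5,
                       max_ocr_images=50)
print(result["workdir"])               # session_<hex> directory with transcript files
print(result["suggested_claims"][:3])  # top candidate claims for the Manual Claims tab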
@@ -562,34 +535,20 @@ THEME_CSS = """
   linear-gradient(180deg, #0f1020, #0a0a12) !important;
   color: #fff;
 }
-.glass {
-
-
-
-
-
-}
-.neon-btn {
-  background: linear-gradient(90deg, rgba(122,60,255,0.9), rgba(0,179,255,0.9));
-  border-radius: 12px;
-  color: white;
-  box-shadow: 0 0 24px rgba(122,60,255,0.35);
-}
-.neon-title {
-  background: linear-gradient(90deg, #b28cff, #7a3cff, #00b3ff);
-  -webkit-background-clip: text; -webkit-text-fill-color: transparent;
-  font-weight: 900;
-}
+.glass { background: rgba(255,255,255,0.06); backdrop-filter: blur(8px);
+         border: 1px solid rgba(255,255,255,0.08); border-radius: 18px !important; }
+.neon-btn { background: linear-gradient(90deg, rgba(122,60,255,0.9), rgba(0,179,255,0.9));
+            border-radius: 12px; color: white; box-shadow: 0 0 24px rgba(122,60,255,0.35); }
+.neon-title { background: linear-gradient(90deg, #b28cff, #7a3cff, #00b3ff);
+              -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-weight: 900; }
 </style>
 """
 
 def ui_run_factcheck(claims_text: str, use_web: bool, use_wiki: bool, allowlist_str: str):
     claims = [c.strip() for c in (claims_text or "").splitlines() if c.strip()]
-    if not claims:
-        return "Please enter one claim per line.", None
+    if not claims: return "Please enter one claim per line.", None
     allow = [d.strip() for d in (allowlist_str or ", ".join(DEFAULT_ALLOWLIST)).split(",") if d.strip()]
     res = run_on_claims(claims, use_web=use_web, use_wiki=use_wiki, allowlist=allow, k=8)
-
     rows, cards = [], []
     for v in res:
         lines = ["─"*74, f"CLAIM: {v['claim']}", f"t_claim: {v['t_claim']}",
@@ -609,32 +568,64 @@ def ui_run_factcheck(claims_text: str, use_web: bool, use_wiki: bool, allowlist_
         rows.append({"claim": v["claim"], "verdict_at_t": v["label_at_t"], "verdict_now": v["label_now"],
                      "confidence": round(float(v["confidence"]), 3),
                      "used_ids": "|".join(v.get("used_evidence_ids_now", []))})
-
     df = pd.DataFrame(rows)
     return "\n\n".join(cards), df
 
 def ui_ingest_and_suggest(video_file, video_url, whisper_model, asr_language, ocr_langs, fps, max_ocr_images):
     try: vp = video_file.name if video_file else None
     except Exception: vp = None
-
-
-
-        ocr_langs=ocr_langs, fps=float(fps), max_ocr_images=int(max_ocr_images))
-    except Exception as e:
-        return f"Error during ingest: {e}", "", "", "", ""
+    out = process_video(video_file=vp, video_url=video_url,
+                        whisper_model=whisper_model, asr_language=asr_language or None,
+                        ocr_langs=ocr_langs, fps=float(fps), max_ocr_images=int(max_ocr_images))
     asr_preview = (out["asr_text"][:1200] + "...") if len(out["asr_text"]) > 1200 else out["asr_text"]
     ocr_preview = "\n".join(out["ocr_lines"][:50])
     agg_preview = (out["aggregated_text"][:2000] + "...") if len(out["aggregated_text"]) > 2000 else out["aggregated_text"]
     sugg = "\n".join(out["suggested_claims"])
     return asr_preview, ocr_preview, agg_preview, sugg, sugg
 
+def run_diagnostics():
+    lines = []
+    lines.append(f"FFmpeg: {'found' if ffmpeg_available() else 'NOT found'}")
+    lines.append(f"GPU: {'available' if gpu_available() else 'CPU only'}")
+    lines.append(f"ASR backends: {', '.join(asr_backends()) or 'none'}")
+    lines.append(f"OCR backends: {', '.join(ocr_backends()) or 'none'}")
+    # ffmpeg version
+    try:
+        v = subprocess.run(['ffmpeg','-version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=5)
+        lines.append(v.stdout.splitlines()[0])
+    except Exception as e:
+        lines.append(f"ffmpeg version: {e}")
+    # tesseract version
+    try:
+        if shutil.which("tesseract"):
+            tv = subprocess.run(['tesseract','-v'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=5)
+            lines.append("Tesseract: " + tv.stdout.splitlines()[0])
+        else:
+            lines.append("Tesseract: NOT found on PATH")
+    except Exception as e:
+        lines.append(f"Tesseract: {e}")
+    # EasyOCR smoke (import only)
+    lines.append(f"EasyOCR import: {'ok' if _easyocr else 'fail'}; OpenCV: {'ok' if cv2 is not None else 'fail'}")
+    # Create a quick OCR synthetic test with Tesseract if available
+    try:
+        from PIL import Image, ImageDraw
+        img = Image.new("RGB", (480, 120), (255,255,255))
+        d = ImageDraw.Draw(img); d.text((10,40), "AEGIS TEST 123", fill=(0,0,0))
+        tmp = f"diag_{uuid.uuid4().hex[:6]}.png"; img.save(tmp)
+        o = run_ocr_on_frames([tmp], languages="en", max_images=1)
+        os.remove(tmp)
+        lines.append("OCR synthetic test: " + ("OK: " + " | ".join(o) if o else "no text read"))
+    except Exception as e:
+        lines.append(f"OCR synthetic test error: {e}")
+    return "\n".join(lines)
+
 with gr.Blocks(css=THEME_CSS, fill_height=True) as demo:
-    gr.HTML("<h1 class='neon-title' style='font-size:42px;margin:8px 0;'>
+    gr.HTML("<h1 class='neon-title' style='font-size:42px;margin:8px 0;'>AEGIS FactCheck</h1><p style='opacity:.75;margin:-6px 0 18px;'>Receipts or it didn’t happen — evidence-backed verification for video claims.</p>")
 
     with gr.Tab("Manual Claims"):
         with gr.Row():
             with gr.Column(scale=1):
-                claims_box = gr.Textbox(label="Claims (one per line)", lines=8, placeholder="e.g.
+                claims_box = gr.Textbox(label="Claims (one per line)", lines=8, placeholder="e.g. WHO approved mRNA vaccines in 2020", elem_classes=["glass"])
                 with gr.Row():
                     use_web = gr.Checkbox(value=True, label="Use Web retrieval")
                     use_wiki = gr.Checkbox(value=True, label="Use Wikipedia")
@@ -646,7 +637,7 @@ with gr.Blocks(css=THEME_CSS, fill_height=True) as demo:
         run_btn.click(ui_run_factcheck, inputs=[claims_box, use_web, use_wiki, allowlist_box], outputs=[out_text, out_df])
 
     with gr.Tab("Video Ingest (ASR + OCR)"):
-        gr.Markdown("Upload a video **OR** provide a URL.
+        gr.Markdown("Upload a video **OR** provide a URL. Whisper + EasyOCR/Tesseract run; text is aggregated and claims suggested.")
         with gr.Row():
             with gr.Column(scale=1):
                 video_upload = gr.File(label="Upload video (mp4/mov/mkv...)", file_types=["video"])
@@ -669,4 +660,9 @@ with gr.Blocks(css=THEME_CSS, fill_height=True) as demo:
                       inputs=[video_upload, video_url, whisper_model, asr_language, ocr_langs, fps, max_ocr_images],
                       outputs=[asr_out, ocr_out, agg_out, sugg_out, to_manual])
 
+    with gr.Tab("Diagnostics"):
+        diag_btn = gr.Button("Run Environment Checks", elem_classes=["neon-btn"])
+        diag_out = gr.Textbox(label="Diagnostics", lines=24, elem_classes=["glass"])
+        diag_btn.click(run_diagnostics, inputs=[], outputs=[diag_out])
+
 demo.launch()