AhmadXGaballah committed on
Commit 441b2ed · verified · 1 Parent(s): dd277ca

Update app.py

Files changed (1):
  app.py +227 -231
app.py CHANGED
@@ -1,6 +1,6 @@
-# app.py — AEGIS FactCheck (SLIM, guarded imports, no NLTK)
+# app.py — resilient ASR/OCR with Diagnostics (no NLTK)
 from __future__ import annotations
-import os, re, json, time, glob, uuid, shutil, subprocess, urllib.parse
+import os, re, json, time, glob, uuid, shutil, subprocess, urllib.parse, io
 from typing import List, Dict, Optional
 from datetime import datetime, timezone
 
@@ -9,17 +9,11 @@ import pandas as pd
 import requests
 import gradio as gr
 
-# ---------- lightweight helpers ----------
-def now_iso() -> str:
-    return datetime.now(timezone.utc).isoformat()
-
-def normalize_ws(s: str) -> str:
-    return re.sub(r"\s+", " ", s or "").strip()
-
-def sent_tokenize_fallback(txt: str) -> List[str]:
-    # NLTK-free sentence splitter
+# ---------- small helpers ----------
+def now_iso(): return datetime.now(timezone.utc).isoformat()
+def normalize_ws(s: str) -> str: return re.sub(r"\s+", " ", s or "").strip()
+def sent_tokenize(txt: str) -> List[str]:
     return [s.strip() for s in re.split(r'(?<=[.!?])\s+|\n+', txt or '') if s.strip()]
-
 def domain_from_url(url: str) -> str:
     try: return urllib.parse.urlparse(url).netloc.lower()
     except Exception: return ""
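A note on the splitter kept as context above: it breaks on end punctuation followed by whitespace, or on bare newlines, which suits ragged OCR output. A quick REPL check (the input string is made up):

    import re
    def sent_tokenize(txt):
        return [s.strip() for s in re.split(r'(?<=[.!?])\s+|\n+', txt or '') if s.strip()]
    sent_tokenize("Claim one. Claim two!\nOCR line")
    # ['Claim one.', 'Claim two!', 'OCR line']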
@@ -36,66 +30,63 @@ DEFAULT_ALLOWLIST = [
     "nature.com","sciencemag.org","thelancet.com","nejm.org",
 ]
 
-FORCE_BASELINE = True  # leave True; OpenAI LLM path is optional and guarded
-
-# ---------- optional imports (guarded) ----------
-def _try_import(name: str):
-    try:
-        return __import__(name)
-    except Exception:
-        return None
-
-duckduckgo_search = _try_import("duckduckgo_search")
-trafilatura = _try_import("trafilatura")
-rank_bm25 = _try_import("rank_bm25")
-sentence_transformers = _try_import("sentence_transformers")
-transformers = _try_import("transformers")
-torch = _try_import("torch")
+# ---------- guarded imports ----------
+def _try(name):
+    try: return __import__(name)
+    except Exception: return None
+
+duckduckgo_search = _try("duckduckgo_search")
+trafilatura = _try("trafilatura")
+rank_bm25 = _try("rank_bm25")
+sentence_transformers = _try("sentence_transformers")
+transformers = _try("transformers")
 
-# Heavy CV/ASR guarded
-cv2 = _try_import("cv2")
-easyocr = _try_import("easyocr")
-whisper = _try_import("whisper")
-
-# OpenAI is optional
-_openai = _try_import("openai")
-_has_openai_key = bool(os.environ.get("OPENAI_API_KEY"))
-
-# ---- ASR guarded imports
+# ASR backends
 try:
     from faster_whisper import WhisperModel as FWWhisperModel
 except Exception:
     FWWhisperModel = None
-
 try:
     import whisper as OpenAIWhisper
 except Exception:
     OpenAIWhisper = None
 
-# ---- OCR guarded imports
+# OCR backends
 try:
     import easyocr as _easyocr
 except Exception:
     _easyocr = None
-
 try:
     import pytesseract as _pyt
 except Exception:
     _pyt = None
-
 try:
     import cv2
 except Exception:
     cv2 = None
 
-def has_llm() -> bool:
-    return (not FORCE_BASELINE) and _openai is not None and _has_openai_key
+# ---------- env probes ----------
+def ffmpeg_available() -> bool:
+    return bool(shutil.which("ffmpeg"))
+
+def gpu_available() -> bool:
+    return bool(shutil.which("nvidia-smi"))
+
+def asr_backends():
+    b = []
+    if FWWhisperModel: b.append("faster-whisper")
+    if OpenAIWhisper: b.append("openai-whisper")
+    return b
+
+def ocr_backends():
+    b = []
+    if _easyocr and cv2: b.append("easyocr")
+    if _pyt and shutil.which("tesseract"): b.append("tesseract")
+    return b
 
-# ---------- text splitting ----------
+# ---------- text chunking ----------
 def split_into_chunks(text: str, max_chars: int = 700) -> List[str]:
-    sents = [normalize_ws(s) for s in sent_tokenize_fallback(text or "")]
+    sents = [normalize_ws(s) for s in sent_tokenize(text or "")]
     chunks, cur = [], ""
     for s in sents:
         if len(cur) + 1 + len(s) > max_chars and cur:
@@ -107,7 +98,6 @@ def split_into_chunks(text: str, max_chars: int = 700) -> List[str]:
 
 # ---------- Wikipedia ----------
 WIKI_API = "https://en.wikipedia.org/w/api.php"
-
 def wiki_search(query: str, n: int = 6) -> List[Dict]:
     r = requests.get(WIKI_API, params={"action":"query","list":"search","srsearch":query,"srlimit":n,"format":"json"},
                      headers=HEADERS, timeout=20)
@@ -162,8 +152,7 @@ def build_wiki_corpus(claim: str, max_pages: int = 6, chunk_chars: int = 600) ->
 
 # ---------- Web retrieval ----------
 def ddg_search(query: str, max_results: int = 10, allowlist: Optional[List[str]] = None) -> List[Dict]:
-    if duckduckgo_search is None:
-        return []
+    if duckduckgo_search is None: return []
     DDGS = duckduckgo_search.DDGS
     allowlist = allowlist or DEFAULT_ALLOWLIST
     out = []
@@ -175,9 +164,8 @@ def ddg_search(query: str, max_results: int = 10, allowlist: Optional[List[str]]
     return out
 
 def fetch_clean_text(url: str) -> str:
-    if trafilatura is None:  # degrade
+    if trafilatura is None:
         try:
-            # last-chance plain GET (messy but better than nothing)
             r = requests.get(url, headers=HEADERS, timeout=12); r.raise_for_status()
             txt = re.sub(r"<[^>]+>", " ", r.text)
             return normalize_ws(txt)[:8000]
@@ -206,11 +194,11 @@ def build_web_corpus(claim: str, allowlist: Optional[List[str]] = None, per_quer
         for j, ch in enumerate(split_into_chunks(text, max_chars=chunk_chars)):
             corpus.append({"id": f"web-{hash(url)}-{j}", "source":"web", "title": h["title"] or domain_from_url(url),
                            "url": url, "published": now_iso(), "text": ch})
-        time.sleep(0.6)  # polite
+        time.sleep(0.6)
         if len(corpus) >= per_query_results * 4: break
     return list({d["id"]: d for d in corpus}.values())
 
-# ---------- retrieval scoring (BM25 + optional dense) ----------
+# ---------- retrieval ----------
 def tokenize_simple(text: str) -> List[str]:
     text = re.sub(r"[^a-z0-9\s]", " ", (text or "").lower())
     return [w for w in text.split() if w and w not in {"the","a","an","and","or","of","to","in","for","on","with"}]
@@ -222,32 +210,25 @@ def rrf_merge(orderings: List[List[str]], k: int = 60) -> List[str]:
             scores[doc_id] = scores.get(doc_id, 0.0) + 1.0/(k + r)
     return [doc for doc,_ in sorted(scores.items(), key=lambda x: -x[1])]
 
-# try to load BM25
 BM25Okapi = getattr(rank_bm25, "BM25Okapi", None) if rank_bm25 else None
 
-# try to prepare sentence-transformers
-_emb_model = None
+_emb_model, st_util = None, None
 if sentence_transformers:
     try:
        _emb_model = sentence_transformers.SentenceTransformer("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")
        from sentence_transformers import util as st_util
    except Exception:
-        _emb_model = None
-        st_util = None
-else:
-    st_util = None
+        _emb_model, st_util = None, None
 
 def retrieve_hybrid(claim: str, docs: List[Dict], k: int = 8) -> List[Dict]:
     if not docs: return []
-    # BM25 (always available? If not, fall back to keyword cosine)
-    bm25_order = []
+    # BM25 (or overlap fallback)
     if BM25Okapi:
         corpus_tokens = [tokenize_simple(d["text"]) for d in docs]
         bm25 = BM25Okapi(corpus_tokens)
         bm25_scores = bm25.get_scores(tokenize_simple(claim))
         bm25_order = [docs[i]["id"] for i in list(np.argsort(-np.array(bm25_scores)))]
     else:
-        # poor-man BM25: sort by overlap count
         q_toks = set(tokenize_simple(claim))
         overlaps = [(i, len(q_toks.intersection(set(tokenize_simple(d["text"]))))) for i, d in enumerate(docs)]
         bm25_order = [docs[i]["id"] for i,_ in sorted(overlaps, key=lambda x: -x[1])]
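For context on the fusion step: rrf_merge (its signature is in the hunk header above, its body in the context lines) gives each document 1/(k + rank) per ordering and sorts by the summed score, so agreement across the BM25 and dense rankings beats a single high placement. A small worked sketch with hypothetical doc ids:

    def rrf_merge(orderings, k=60):
        scores = {}
        for order in orderings:
            for r, doc_id in enumerate(order):
                scores[doc_id] = scores.get(doc_id, 0.0) + 1.0/(k + r)
        return [doc for doc, _ in sorted(scores.items(), key=lambda x: -x[1])]

    rrf_merge([["a", "b", "c"], ["b", "a", "c"]])
    # ['a', 'b', 'c']: 'a' and 'b' tie at 1/60 + 1/61; the stable sort keeps insertion order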
@@ -270,27 +251,26 @@ def retrieve_hybrid(claim: str, docs: List[Dict], k: int = 8) -> List[Dict]:
     return [{**doc, "score": float(1/(60+i))} for i, doc in enumerate(ranked_docs[:k])]
 
 # ---------- verifier (transformers optional; heuristic fallback) ----------
-_nli_pipeline = None
+_nli = None
 if transformers:
     try:
         AutoModelForSequenceClassification = transformers.AutoModelForSequenceClassification
         AutoTokenizer = transformers.AutoTokenizer
-        _nli_tok = AutoTokenizer.from_pretrained("roberta-large-mnli")
-        _nli_model = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli")
-        _nli_pipeline = transformers.pipeline("text-classification", model=_nli_model, tokenizer=_nli_tok,
-                                              return_all_scores=True, truncation=True, device=-1)
+        _tok = AutoTokenizer.from_pretrained("roberta-large-mnli")
+        _mdl = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli")
+        _nli = transformers.pipeline("text-classification", model=_mdl, tokenizer=_tok,
+                                     return_all_scores=True, truncation=True, device=-1)
     except Exception:
-        _nli_pipeline = None
+        _nli = None
 
 def verify_with_nli(claim: str, evidence: List[Dict]) -> Dict:
-    # If NLI pipeline available
-    if _nli_pipeline:
+    if _nli:
         best_ent_id, best_ent_p = None, 0.0
         best_con_id, best_con_p = None, 0.0
         for e in evidence or []:
             prem = (e.get("text") or "").strip()
             if not prem: continue
-            outputs = _nli_pipeline([{"text": prem, "text_pair": claim}])
+            outputs = _nli([{"text": prem, "text_pair": claim}])
             probs = {d["label"].upper(): float(d["score"]) for d in outputs[0]}
             ent, con = probs.get("ENTAILMENT", 0.0), probs.get("CONTRADICTION", 0.0)
             if ent > best_ent_p: best_ent_id, best_ent_p = e.get("id"), ent
@@ -303,15 +283,14 @@ def verify_with_nli(claim: str, evidence: List[Dict]) -> Dict:
         elif best_con_p >= 0.60 and (best_con_p - best_ent_p) >= 0.10:
             label, used, conf, rationale = "REFUTE", [best_con_id] if best_con_id else [], best_con_p, "Top evidence contradicts the claim."
         return {"label": label, "used_evidence_ids": used, "confidence": float(conf), "rationale": rationale}
-
-    # Heuristic fallback (no transformers)
+    # heuristic fallback
     text = " ".join((e.get("text") or "")[:400].lower() for e in evidence[:6])
     k = sanitize_claim_for_search(claim).lower()
-    if any(x in text for x in ["false", "hoax", "debunked", "misinformation", "no evidence", "not true"]) and any(y in text for y in k.split()[:4]):
-        return {"label": "REFUTE", "used_evidence_ids": [evidence[0]["id"]] if evidence else [], "confidence": 0.6, "rationale": "Heuristic: evidence indicates refutation keywords."}
-    if any(x in text for x in ["confirmed", "approved", "verified", "evidence shows", "found that"]) and any(y in text for y in k.split()[:4]):
-        return {"label": "SUPPORT", "used_evidence_ids": [evidence[0]["id"]] if evidence else [], "confidence": 0.55, "rationale": "Heuristic: evidence indicates support keywords."}
-    return {"label": "NEI", "used_evidence_ids": [], "confidence": 0.4, "rationale": "Insufficient signal without NLI."}
+    if any(x in text for x in ["false","hoax","debunked","misinformation","no evidence","not true"]) and any(y in text for y in k.split()[:4]):
+        return {"label":"REFUTE","used_evidence_ids":[evidence[0]["id"]] if evidence else [],"confidence":0.6,"rationale":"Heuristic: refutation keywords."}
+    if any(x in text for x in ["confirmed","approved","verified","evidence shows","found that"]) and any(y in text for y in k.split()[:4]):
+        return {"label":"SUPPORT","used_evidence_ids":[evidence[0]["id"]] if evidence else [],"confidence":0.55,"rationale":"Heuristic: support keywords."}
+    return {"label":"NEI","used_evidence_ids":[],"confidence":0.4,"rationale":"Insufficient signal without NLI."}
 
 def enforce_json_schema(x: Dict) -> Dict:
     return {"label": str(x.get("label","NEI")).upper(),
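The verdict logic above defaults to NEI and only commits when the strongest NLI probability clears 0.60 with at least a 0.10 margin over the opposing signal. The REFUTE branch is visible in the hunk; the SUPPORT branch is inferred as symmetric. A condensed sketch (decide is a hypothetical helper, with made-up probabilities):

    def decide(best_ent_p, best_con_p):
        label = "NEI"
        if best_ent_p >= 0.60 and (best_ent_p - best_con_p) >= 0.10:
            label = "SUPPORT"
        elif best_con_p >= 0.60 and (best_con_p - best_ent_p) >= 0.10:
            label = "REFUTE"
        return label

    decide(0.85, 0.05)  # 'SUPPORT'
    decide(0.62, 0.58)  # 'NEI' (margin under 0.10)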
@@ -320,10 +299,8 @@ def enforce_json_schema(x: Dict) -> Dict:
             "rationale": str(x.get("rationale","")).strip()[:300]}
 
 def filter_by_time(docs: List[Dict], t_max_iso: str) -> List[Dict]:
-    try:
-        tmax = datetime.fromisoformat(t_max_iso.replace("Z","+00:00"))
-    except Exception:
-        tmax = datetime.now(timezone.utc)
+    try: tmax = datetime.fromisoformat(t_max_iso.replace("Z","+00:00"))
+    except Exception: tmax = datetime.now(timezone.utc)
     kept = []
     for d in docs:
         try:
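The replace("Z","+00:00") above exists because datetime.fromisoformat rejects a trailing "Z" before Python 3.11. A minimal check:

    from datetime import datetime
    t = datetime.fromisoformat("2024-05-01T12:00:00Z".replace("Z", "+00:00"))
    t.tzinfo  # datetime.timezone.utc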
@@ -333,15 +310,6 @@ def filter_by_time(docs: List[Dict], t_max_iso: str) -> List[Dict]:
             kept.append(d)
     return kept
 
-def format_evidence_block(evs: List[Dict]) -> str:
-    lines = []
-    for e in evs:
-        snippet = (e.get("text","") or "")
-        if len(snippet) > 420: snippet = snippet[:420] + "..."
-        title = e.get("title","") or e.get("source","")
-        lines.append(f"[{e['id']}] ({e.get('published','')}) {title} — {e.get('url','')}\n{snippet}")
-    return "\n\n".join(lines)
-
 def verify_claim(claim_text: str, use_web: bool = True, use_wiki: bool = True,
                  allowlist: Optional[List[str]] = None, t_claim_iso: Optional[str] = None, k: int = 8) -> Dict:
     t_claim_iso = t_claim_iso or now_iso()
@@ -368,36 +336,71 @@ def run_on_claims(claims: List[str], use_web: bool, use_wiki: bool, allowlist: L
         outs.append(verify_claim(c, use_web=use_web, use_wiki=use_wiki, allowlist=allowlist, t_claim_iso=now_iso(), k=k))
     return outs
 
-# ---------- ASR + OCR (guarded) ----------
+# ---------- ASR ----------
 def extract_audio_ffmpeg(video_path: str, out_wav: str, sr: int = 16000) -> str:
     cmd = ["ffmpeg","-y","-i",video_path,"-vn","-acodec","pcm_s16le","-ar",str(sr),"-ac","1",out_wav]
     subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
     return out_wav
 
 def run_whisper_asr(audio_path: str, model_size: str = "base", language: Optional[str] = None) -> str:
-    # Prefer faster-whisper (lighter, no PyTorch needed)
+    # Prefer faster-whisper
     if FWWhisperModel is not None:
-        device = "cuda" if shutil.which("nvidia-smi") else "cpu"
-        compute_type = "float16" if device == "cuda" else "int8"  # int8 is great on CPU
+        device = "cuda" if gpu_available() else "cpu"
+        compute_type = "float16" if device == "cuda" else "int8"
         model = FWWhisperModel(model_size, device=device, compute_type=compute_type)
         segments, info = model.transcribe(audio_path, language=language, vad_filter=True, beam_size=5)
         return " ".join(seg.text for seg in segments).strip()
-
-    # Fallback to OpenAI whisper (PyTorch)
+    # Fallback to OpenAI whisper
     if OpenAIWhisper is not None:
         model = OpenAIWhisper.load_model(model_size)
         result = model.transcribe(audio_path, language=language) if language else model.transcribe(audio_path)
         return (result.get("text") or "").strip()
-
-    # Last resort: don't crash; let OCR + manual flow continue
-    return ""  # <- return empty string instead of raising
-
-    # Nothing available
-    raise RuntimeError("No ASR backend available (install faster-whisper or openai-whisper).")
+    # No backend
+    return ""
 
-def _ocr_with_tesseract(frames: list[str], langs_csv: str, max_images: int | None) -> list[str]:
-    if _pyt is None or cv2 is None:
+# ---------- OCR (EasyOCR → Tesseract) ----------
+def _tess_langs(langs_csv: str) -> str:
+    map_ = {"en":"eng","ar":"ara","fr":"fra","de":"deu","es":"spa","it":"ita","pt":"por","ru":"rus","zh":"chi_sim"}
+    codes = [x.strip().lower() for x in (langs_csv or "en").split(",") if x.strip()]
+    return "+".join(map_.get(c, c) for c in codes) or "eng"
+
+def preprocess_for_ocr(img_path: str):
+    if cv2 is None: return None
+    img = cv2.imread(img_path, cv2.IMREAD_COLOR)
+    if img is None: return None
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    gray = cv2.bilateralFilter(gray, 7, 50, 50)
+    gray = cv2.equalizeHist(gray)
+    th = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                               cv2.THRESH_BINARY, 31, 9)
+    return th
+
+def _ocr_with_easyocr(frames: List[str], langs_csv: str, max_images: Optional[int]) -> List[str]:
+    if not (_easyocr and cv2): return []
+    try:
+        gpu = gpu_available()
+        reader = _easyocr.Reader([c.strip() for c in langs_csv.split(",") if c.strip()], gpu=gpu)
+        texts, count = [], 0
+        for fp in frames:
+            if max_images and count >= max_images: break
+            img = preprocess_for_ocr(fp)
+            if img is None:
+                count += 1
+                continue
+            for (_bbox, txt, conf) in reader.readtext(img):
+                txt = normalize_ws(txt)
+                if txt and conf >= 0.35: texts.append(txt)
+            count += 1
+        uniq, seen = [], set()
+        for t in texts:
+            k = t.lower()
+            if k not in seen: uniq.append(t); seen.add(k)
+        return uniq
+    except Exception:
         return []
+
+def _ocr_with_tesseract(frames: List[str], langs_csv: str, max_images: Optional[int]) -> List[str]:
+    if not (_pyt and shutil.which("tesseract") and cv2): return []
     lang = _tess_langs(langs_csv)
     texts, count = [], 0
     for fp in frames:
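_tess_langs, added above, converts the UI's comma-separated codes into Tesseract's "+"-joined language names, passing unknown codes through unchanged:

    _tess_langs("en,ar")   # 'eng+ara'
    _tess_langs("en, xx")  # 'eng+xx' (unknown code passed through)
    _tess_langs("")        # 'eng'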
@@ -407,26 +410,31 @@ def _ocr_with_tesseract(frames: list[str], langs_csv: str, max_images: int | Non
             count += 1
             continue
         try:
-            raw = _pyt.image_to_string(img, lang=lang)  # returns a blob of text
+            raw = _pyt.image_to_string(img, lang=lang)
         except Exception:
-            raw = ""
+            try:
+                raw = _pyt.image_to_string(img, lang="eng")
+            except Exception:
+                raw = ""
         for line in (raw or "").splitlines():
             line = normalize_ws(line)
-            if len(line) >= 3:
-                texts.append(line)
+            if len(line) >= 3: texts.append(line)
         count += 1
-    # dedupe
     uniq, seen = [], set()
     for t in texts:
         k = t.lower()
-        if k not in seen:
-            uniq.append(t); seen.add(k)
+        if k not in seen: uniq.append(t); seen.add(k)
     return uniq
 
+def run_ocr_on_frames(frames: List[str], languages: str = "en", max_images: Optional[int] = None) -> List[str]:
+    langs_csv = languages or "en"
+    out = _ocr_with_easyocr(frames, langs_csv, max_images)
+    if out: return out
+    out = _ocr_with_tesseract(frames, langs_csv, max_images)
+    return out
 
+# ---------- video processing ----------
 def download_video(url: str, out_dir: str = "videos") -> str:
-    # yt-dlp is installed via requirements; call binary
     os.makedirs(out_dir, exist_ok=True)
     out_tpl = os.path.join(out_dir, "%(title)s.%(ext)s")
     subprocess.run(["yt-dlp","-o",out_tpl,url], check=True)
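The rewritten run_ocr_on_frames is a plain two-step cascade: EasyOCR first, Tesseract only when EasyOCR yields nothing, and an empty list when neither backend is usable. A usage sketch (the clip path is hypothetical):

    frames = sample_frames_ffmpeg("clip.mp4", out_dir="frames", fps=0.5)
    lines = run_ocr_on_frames(frames, languages="en,ar", max_images=50)
    if not lines:
        print("no OCR backend available or no text detected")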
@@ -435,66 +443,13 @@ def download_video(url: str, out_dir: str = "videos") -> str:
 
 def sample_frames_ffmpeg(video_path: str, out_dir: str = "frames", fps: float = 0.5) -> List[str]:
     os.makedirs(out_dir, exist_ok=True)
-    subprocess.run(["ffmpeg","-y","-i",video_path,"-vf",f"fps={fps}", os.path.join(out_dir, "frame_%06d.jpg")],
-                   stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
+    try:
+        subprocess.run(["ffmpeg","-y","-i",video_path,"-vf",f"fps={fps}", os.path.join(out_dir, "frame_%06d.jpg")],
+                       stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
+    except Exception:
+        return []
     return sorted(glob.glob(os.path.join(out_dir, "frame_*.jpg")))
 
-def preprocess_for_ocr(img_path: str):
-    if cv2 is None: return None
-    img = cv2.imread(img_path, cv2.IMREAD_COLOR)
-    if img is None: return None
-    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-    gray = cv2.bilateralFilter(gray, 7, 50, 50)
-    gray = cv2.equalizeHist(gray)
-    th = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
-                               cv2.THRESH_BINARY, 31, 9)
-    return th
-
-def run_ocr_on_frames(frames: list[str], languages: list[str] | str = "en", gpu: bool | None = None, max_images: int | None = None) -> list[str]:
-    # Normalize languages input
-    if isinstance(languages, list):
-        langs_csv = ",".join(languages)
-    else:
-        langs_csv = languages or "en"
-
-    # 1) Try EasyOCR
-    if _easyocr is not None and cv2 is not None:
-        try:
-            if gpu is None:
-                gpu = True if (os.environ.get("SPACE_ID") or shutil.which("nvidia-smi")) else False
-            reader = _easyocr.Reader([c.strip() for c in langs_csv.split(",") if c.strip()], gpu=gpu)
-            texts, count = [], 0
-            for fp in frames:
-                if max_images and count >= max_images: break
-                img = preprocess_for_ocr(fp)
-                if img is None:
-                    count += 1
-                    continue
-                for (_bbox, txt, conf) in reader.readtext(img):
-                    txt = normalize_ws(txt)
-                    if txt and conf >= 0.35:
-                        texts.append(txt)
-                count += 1
-            # dedupe
-            uniq, seen = [], set()
-            for t in texts:
-                k = t.lower()
-                if k not in seen:
-                    uniq.append(t); seen.add(k)
-            if uniq:
-                return uniq
-        except Exception:
-            pass  # fall through to tesseract
-
-    # 2) Fallback: Tesseract
-    t_res = _ocr_with_tesseract(frames, langs_csv, max_images)
-    if t_res:
-        return t_res
-
-    # 3) Nothing available
-    return []
-
 def aggregate_text(asr_text: str, ocr_lines: List[str]) -> str:
     parts = []
     if asr_text: parts.append(asr_text)
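Worth noting on this hunk: sample_frames_ffmpeg now returns an empty list instead of raising when ffmpeg fails, keeping the ingest flow alive. At fps=0.5 it samples one frame every two seconds, so a 10-minute clip yields roughly 300 JPEGs, with max_ocr_images later capping how many OCR reads. A sketch (the clip path is hypothetical):

    frames = sample_frames_ffmpeg("clip.mp4", fps=0.5)  # ~1 frame per 2 s; [] if ffmpeg fails
    print(f"{len(frames)} frames sampled")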
@@ -506,54 +461,72 @@ def aggregate_text(asr_text: str, ocr_lines: List[str]) -> str:
         if k and k not in seen: uniq_lines.append(line.strip()); seen.add(k)
     return "\n".join(uniq_lines)
 
+def suggest_claims(text: str, top_k: int = 10) -> List[str]:
+    sents = [re.sub(r'^[\'"“”]+|[\'"“”]+$', '', x).strip() for x in re.split(r'[.!?\n]+', text or "") if x.strip()]
+    candidates = [s for s in sents if len(s) >= 12 and re.search(r"\b(is|are|was|were|has|have|had|will|can|does|did|cause|causes|leads|led|prove|proves|confirm|confirms|predict|predicts|announce|announces|claim|claims|say|says|warn|warns|plan|plans|declare|declares|ban|bans|approve|approves)\b", s, re.I)]
+    if not candidates:
+        fallback = [s for s in sents if 8 <= len(s) <= 140]
+        scored = []
+        for s in fallback:
+            score = (1 if re.search(r'\d', s) else 0) + sum(1 for w in s.split()[:6] if w[:1].isupper())
+            scored.append((score, s))
+        candidates = [s for _, s in sorted(scored, key=lambda x: -x[0])[:top_k]]
+    return candidates[:top_k]
+
 def process_video(video_file: Optional[str] = None, video_url: Optional[str] = None,
                   whisper_model: str = "base", asr_language: Optional[str] = None,
                   ocr_langs: str = "en", fps: float = 0.5, max_ocr_images: int = 200) -> Dict:
     workdir = f"session_{uuid.uuid4().hex[:8]}"; os.makedirs(workdir, exist_ok=True)
-    if video_url and video_url.strip(): vp = download_video(video_url.strip(), out_dir=workdir)
-    elif video_file and os.path.exists(video_file): vp = shutil.copy(video_file, os.path.join(workdir, os.path.basename(video_file)))
-    else: raise ValueError("Provide either a local video file path or a URL.")
+    # pick source
+    if video_url and video_url.strip():
+        vp = download_video(video_url.strip(), out_dir=workdir)
+    elif video_file and os.path.exists(video_file):
+        vp = shutil.copy(video_file, os.path.join(workdir, os.path.basename(video_file)))
+    else:
+        raise ValueError("Provide either a local video file path or a URL.")
+
+    # audio
     wav = os.path.join(workdir, "audio_16k.wav")
+    if not ffmpeg_available():
+        raise RuntimeError("ffmpeg binary not found. Ensure apt.txt includes 'ffmpeg'.")
     extract_audio_ffmpeg(vp, wav, sr=16000)
-    asr_text = run_whisper_asr(wav, model_size=whisper_model, language=asr_language)
+
+    # ASR (never hard-fail)
+    asr_text = ""
+    try:
+        asr_text = run_whisper_asr(wav, model_size=whisper_model, language=asr_language)
+        if not asr_text:
+            asr_text = "[ASR skipped: no backend available]"
+    except Exception as e:
+        asr_text = f"[ASR skipped: {e}]"
     open(os.path.join(workdir, "transcript_asr.txt"), "w").write(asr_text)
+
+    # frames
     frames_dir = os.path.join(workdir, "frames")
     frames = sample_frames_ffmpeg(vp, out_dir=frames_dir, fps=fps)
-    langs = [x.strip() for x in ocr_langs.split(",") if x.strip()]
-    ocr_langs_csv = ",".join(langs)
-    ocr_lines = run_ocr_on_frames(frames, languages=ocr_langs_csv, gpu=None, max_images=int(max_ocr_images))
+
+    # OCR (never hard-fail)
+    ocr_lines = []
+    try:
+        if frames:
+            ocr_lines = run_ocr_on_frames(frames, languages=ocr_langs, max_images=int(max_ocr_images))
+        else:
+            ocr_lines = []
+    except Exception as e:
+        ocr_lines = [f"[OCR error: {e}]"]
     if not ocr_lines:
         ocr_lines = ["[OCR skipped: no backend available]"]
-
     open(os.path.join(workdir, "transcript_ocr.txt"), "w").write("\n".join(ocr_lines))
+
+    # aggregate + suggestions
     agg = aggregate_text(asr_text, ocr_lines)
    open(os.path.join(workdir, "transcript_aggregated.txt"), "w").write(agg)
     suggestions = suggest_claims(agg, top_k=10)
+
     return {"workdir": workdir, "video_path": vp, "asr_text": asr_text, "ocr_lines": ocr_lines,
             "aggregated_text": agg, "suggested_claims": suggestions}
 
-# ---------- claim suggestions ----------
-CLAIM_MIN_LEN = 12
-VERB_TRIGGERS = r"\b(" + "|".join([
-    "is","are","was","were","has","have","had","will","can","does","did",
-    "cause","causes","leads","led","prove","proves","confirm","confirms",
-    "predict","predicts","announce","announces","claim","claims","say","says",
-    "warn","warns","plan","plans","declare","declares","ban","bans","approve","approves"
-]) + r")\b"
-
-def suggest_claims(text: str, top_k: int = 10) -> List[str]:
-    sents = [re.sub(r'^[\'"“”]+|[\'"“”]+$', '', x).strip() for x in re.split(r'[.!?\n]+', text or "") if x.strip()]
-    candidates = [s for s in sents if len(s) >= CLAIM_MIN_LEN and re.search(VERB_TRIGGERS, s, re.I)]
-    if not candidates:
-        fallback = [s for s in sents if 8 <= len(s) <= 140]
-        scored = []
-        for s in fallback:
-            score = (1 if re.search(r'\d', s) else 0) + sum(1 for w in s.split()[:6] if w[:1].isupper())
-            scored.append((score, s))
-        candidates = [s for _, s in sorted(scored, key=lambda x: -x[0])[:top_k]]
-    return candidates[:top_k]
-
-# ---------- Gradio theme & UI ----------
+# ---------- Gradio UI ----------
 THEME_CSS = """
 <style>
 body, .gradio-container {
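process_video now degrades instead of raising once a source is resolved: ASR and OCR failures are recorded inline as "[ASR skipped: ...]" or "[OCR error: ...]" markers, so aggregation and claim suggestion still run. A usage sketch (the file name is hypothetical):

    out = process_video(video_file="clip.mp4", whisper_model="base", ocr_langs="en", fps=0.5)
    print(out["workdir"], out["asr_text"][:80])
    for c in out["suggested_claims"]:
        print("-", c)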
@@ -562,34 +535,20 @@ THEME_CSS = """
     linear-gradient(180deg, #0f1020, #0a0a12) !important;
   color: #fff;
 }
-.glass {
-  background: rgba(255,255,255,0.06);
-  backdrop-filter: blur(8px);
-  border: 1px solid rgba(255,255,255,0.08);
-  box-shadow: 0 0 40px rgba(122,60,255,0.15), 0 0 60px rgba(0,179,255,0.10);
-  border-radius: 18px !important;
-}
-.neon-btn {
-  background: linear-gradient(90deg, rgba(122,60,255,0.9), rgba(0,179,255,0.9));
-  border-radius: 12px;
-  color: white;
-  box-shadow: 0 0 24px rgba(122,60,255,0.35);
-}
-.neon-title {
-  background: linear-gradient(90deg, #b28cff, #7a3cff, #00b3ff);
-  -webkit-background-clip: text; -webkit-text-fill-color: transparent;
-  font-weight: 900;
-}
+.glass { background: rgba(255,255,255,0.06); backdrop-filter: blur(8px);
+         border: 1px solid rgba(255,255,255,0.08); border-radius: 18px !important; }
+.neon-btn { background: linear-gradient(90deg, rgba(122,60,255,0.9), rgba(0,179,255,0.9));
+            border-radius: 12px; color: white; box-shadow: 0 0 24px rgba(122,60,255,0.35); }
+.neon-title { background: linear-gradient(90deg, #b28cff, #7a3cff, #00b3ff);
+              -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-weight: 900; }
 </style>
 """
 
 def ui_run_factcheck(claims_text: str, use_web: bool, use_wiki: bool, allowlist_str: str):
     claims = [c.strip() for c in (claims_text or "").splitlines() if c.strip()]
-    if not claims:
-        return "Please enter one claim per line.", None
+    if not claims: return "Please enter one claim per line.", None
     allow = [d.strip() for d in (allowlist_str or ", ".join(DEFAULT_ALLOWLIST)).split(",") if d.strip()]
     res = run_on_claims(claims, use_web=use_web, use_wiki=use_wiki, allowlist=allow, k=8)
-
     rows, cards = [], []
     for v in res:
         lines = ["─"*74, f"CLAIM: {v['claim']}", f"t_claim: {v['t_claim']}",
@@ -609,32 +568,64 @@ def ui_run_factcheck(claims_text: str, use_web: bool, use_wiki: bool, allowlist_
         rows.append({"claim": v["claim"], "verdict_at_t": v["label_at_t"], "verdict_now": v["label_now"],
                      "confidence": round(float(v["confidence"]), 3),
                      "used_ids": "|".join(v.get("used_evidence_ids_now", []))})
-
     df = pd.DataFrame(rows)
     return "\n\n".join(cards), df
 
 def ui_ingest_and_suggest(video_file, video_url, whisper_model, asr_language, ocr_langs, fps, max_ocr_images):
     try: vp = video_file.name if video_file else None
     except Exception: vp = None
-    try:
-        out = process_video(video_file=vp, video_url=video_url,
-                            whisper_model=whisper_model, asr_language=asr_language or None,
-                            ocr_langs=ocr_langs, fps=float(fps), max_ocr_images=int(max_ocr_images))
-    except Exception as e:
-        return f"Error during ingest: {e}", "", "", "", ""
+    out = process_video(video_file=vp, video_url=video_url,
+                        whisper_model=whisper_model, asr_language=asr_language or None,
+                        ocr_langs=ocr_langs, fps=float(fps), max_ocr_images=int(max_ocr_images))
     asr_preview = (out["asr_text"][:1200] + "...") if len(out["asr_text"]) > 1200 else out["asr_text"]
     ocr_preview = "\n".join(out["ocr_lines"][:50])
     agg_preview = (out["aggregated_text"][:2000] + "...") if len(out["aggregated_text"]) > 2000 else out["aggregated_text"]
     sugg = "\n".join(out["suggested_claims"])
     return asr_preview, ocr_preview, agg_preview, sugg, sugg
 
+def run_diagnostics():
+    lines = []
+    lines.append(f"FFmpeg: {'found' if ffmpeg_available() else 'NOT found'}")
+    lines.append(f"GPU: {'available' if gpu_available() else 'CPU only'}")
+    lines.append(f"ASR backends: {', '.join(asr_backends()) or 'none'}")
+    lines.append(f"OCR backends: {', '.join(ocr_backends()) or 'none'}")
+    # ffmpeg version
+    try:
+        v = subprocess.run(['ffmpeg','-version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=5)
+        lines.append(v.stdout.splitlines()[0])
+    except Exception as e:
+        lines.append(f"ffmpeg version: {e}")
+    # tesseract version
+    try:
+        if shutil.which("tesseract"):
+            tv = subprocess.run(['tesseract','-v'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=5)
+            lines.append("Tesseract: " + tv.stdout.splitlines()[0])
+        else:
+            lines.append("Tesseract: NOT found on PATH")
+    except Exception as e:
+        lines.append(f"Tesseract: {e}")
+    # EasyOCR smoke (import only)
+    lines.append(f"EasyOCR import: {'ok' if _easyocr else 'fail'}; OpenCV: {'ok' if cv2 is not None else 'fail'}")
+    # Create a quick OCR synthetic test with Tesseract if available
+    try:
+        from PIL import Image, ImageDraw
+        img = Image.new("RGB", (480, 120), (255,255,255))
+        d = ImageDraw.Draw(img); d.text((10,40), "AEGIS TEST 123", fill=(0,0,0))
+        tmp = f"diag_{uuid.uuid4().hex[:6]}.png"; img.save(tmp)
+        o = run_ocr_on_frames([tmp], languages="en", max_images=1)
+        os.remove(tmp)
+        lines.append("OCR synthetic test: " + ("OK: " + " | ".join(o) if o else "no text read"))
+    except Exception as e:
+        lines.append(f"OCR synthetic test error: {e}")
+    return "\n".join(lines)
+
 with gr.Blocks(css=THEME_CSS, fill_height=True) as demo:
-    gr.HTML("<h1 class='neon-title' style='font-size:42px;margin:8px 0;'>FactChecker</h1><p style='opacity:.75;margin:-6px 0 18px;'>Make every claim earn its proof.</p>")
+    gr.HTML("<h1 class='neon-title' style='font-size:42px;margin:8px 0;'>AEGIS FactCheck</h1><p style='opacity:.75;margin:-6px 0 18px;'>Receipts or it didn’t happen — evidence-backed verification for video claims.</p>")
 
     with gr.Tab("Manual Claims"):
         with gr.Row():
             with gr.Column(scale=1):
-                claims_box = gr.Textbox(label="Claims (one per line)", lines=8, placeholder="e.g. 5G towers caused COVID-19", elem_classes=["glass"])
+                claims_box = gr.Textbox(label="Claims (one per line)", lines=8, placeholder="e.g. WHO approved mRNA vaccines in 2020", elem_classes=["glass"])
                 with gr.Row():
                     use_web = gr.Checkbox(value=True, label="Use Web retrieval")
                     use_wiki = gr.Checkbox(value=True, label="Use Wikipedia")
@@ -646,7 +637,7 @@ with gr.Blocks(css=THEME_CSS, fill_height=True) as demo:
         run_btn.click(ui_run_factcheck, inputs=[claims_box, use_web, use_wiki, allowlist_box], outputs=[out_text, out_df])
 
     with gr.Tab("Video Ingest (ASR + OCR)"):
-        gr.Markdown("Upload a video **OR** provide a URL. Runs Whisper + EasyOCR, aggregates text, then suggests claims.")
+        gr.Markdown("Upload a video **OR** provide a URL. Whisper + EasyOCR/Tesseract run; text is aggregated and claims suggested.")
         with gr.Row():
             with gr.Column(scale=1):
                 video_upload = gr.File(label="Upload video (mp4/mov/mkv...)", file_types=["video"])
@@ -669,4 +660,9 @@ with gr.Blocks(css=THEME_CSS, fill_height=True) as demo:
                      inputs=[video_upload, video_url, whisper_model, asr_language, ocr_langs, fps, max_ocr_images],
                      outputs=[asr_out, ocr_out, agg_out, sugg_out, to_manual])
 
+    with gr.Tab("Diagnostics"):
+        diag_btn = gr.Button("Run Environment Checks", elem_classes=["neon-btn"])
+        diag_out = gr.Textbox(label="Diagnostics", lines=24, elem_classes=["glass"])
+        diag_btn.click(run_diagnostics, inputs=[], outputs=[diag_out])
+
 demo.launch()
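On a CPU-only Space with both OCR backends installed, the new Diagnostics tab would report along these lines (illustrative only; version strings depend on the environment, and the synthetic test may read "no text read" when the tiny default PIL font defeats OCR):

    FFmpeg: found
    GPU: CPU only
    ASR backends: faster-whisper
    OCR backends: easyocr, tesseract
    ffmpeg version 4.4.2 ...
    Tesseract: tesseract 4.1.1
    EasyOCR import: ok; OpenCV: ok
    OCR synthetic test: OK: AEGIS TEST 123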
 