AhmadXGaballah commited on
Commit
b9325a4
·
verified ·
1 Parent(s): 5953834

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -22
app.py CHANGED
@@ -61,7 +61,7 @@ whisper = _try_import("whisper")
61
  _openai = _try_import("openai")
62
  _has_openai_key = bool(os.environ.get("OPENAI_API_KEY"))
63
 
64
- # ---- ASR guarded imports (add these)
65
  try:
66
  from faster_whisper import WhisperModel as FWWhisperModel
67
  except Exception:
@@ -72,6 +72,23 @@ try:
72
  except Exception:
73
  OpenAIWhisper = None
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
  def has_llm() -> bool:
77
  return (not FORCE_BASELINE) and _openai is not None and _has_openai_key
@@ -378,6 +395,35 @@ def run_whisper_asr(audio_path: str, model_size: str = "base", language: Optiona
378
  # Nothing available
379
  raise RuntimeError("No ASR backend available (install faster-whisper or openai-whisper).")
380
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
 
382
  def download_video(url: str, out_dir: str = "videos") -> str:
383
  # yt-dlp is installed via requirements; call binary
@@ -404,26 +450,50 @@ def preprocess_for_ocr(img_path: str):
404
  cv2.THRESH_BINARY, 31, 9)
405
  return th
406
 
407
def run_easyocr_on_frames(frames: List[str], languages: Optional[List[str]] = None, gpu: Optional[bool] = None, max_images: Optional[int] = None) -> List[str]:
    """Run EasyOCR over preprocessed frames and return deduplicated text lines.

    Args:
        frames: Paths of frame images sampled from the video.
        languages: EasyOCR language codes; defaults to ["en"].
            (Fixed: was a mutable default argument ``["en"]``.)
        gpu: Force GPU on/off; if None, auto-detect (HF Space env or nvidia-smi present).
        max_images: Cap on the number of frames actually OCR'd (None/0 = no limit).
            NOTE: frames whose preprocessing fails do not count toward the cap.

    Returns:
        Text lines (confidence >= 0.35), case-insensitively deduplicated,
        first occurrence preserved.

    Raises:
        RuntimeError: if the easyocr package is not importable.
    """
    if easyocr is None:
        raise RuntimeError("EasyOCR not available. Ensure easyocr + opencv-python-headless are installed.")
    if languages is None:
        languages = ["en"]
    if gpu is None:
        # Assume GPU on HF Spaces or when the NVIDIA driver tooling is visible.
        gpu = bool(os.environ.get("SPACE_ID") or shutil.which("nvidia-smi"))
    reader = easyocr.Reader(languages, gpu=gpu)
    texts: List[str] = []
    count = 0
    for fp in frames:
        if max_images and count >= max_images:
            break
        img = preprocess_for_ocr(fp)
        if img is None:
            # Unreadable frame: skip without consuming the max_images budget.
            continue
        for (_bbox, txt, conf) in reader.readtext(img):
            txt = normalize_ws(txt)
            if txt and conf >= 0.35:
                texts.append(txt)
        count += 1
    # Case-insensitive dedupe, keeping the first occurrence's casing.
    uniq: List[str] = []
    seen: set = set()
    for t in texts:
        k = t.lower()
        if k not in seen:
            uniq.append(t)
            seen.add(k)
    return uniq
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
427
 
428
  def aggregate_text(asr_text: str, ocr_lines: List[str]) -> str:
429
  parts = []
@@ -450,7 +520,11 @@ def process_video(video_file: Optional[str] = None, video_url: Optional[str] = N
450
  frames_dir = os.path.join(workdir, "frames")
451
  frames = sample_frames_ffmpeg(vp, out_dir=frames_dir, fps=fps)
452
  langs = [x.strip() for x in ocr_langs.split(",") if x.strip()]
453
- ocr_lines = run_easyocr_on_frames(frames, languages=langs, gpu=None, max_images=int(max_ocr_images))
 
 
 
 
454
  open(os.path.join(workdir, "transcript_ocr.txt"), "w").write("\n".join(ocr_lines))
455
  agg = aggregate_text(asr_text, ocr_lines)
456
  open(os.path.join(workdir, "transcript_aggregated.txt"), "w").write(agg)
 
61
  _openai = _try_import("openai")
62
  _has_openai_key = bool(os.environ.get("OPENAI_API_KEY"))
63
 
64
+ # ---- ASR guarded imports
65
  try:
66
  from faster_whisper import WhisperModel as FWWhisperModel
67
  except Exception:
 
72
  except Exception:
73
  OpenAIWhisper = None
74
 
75
+ # ---- OCR guarded imports
76
+ try:
77
+ import easyocr as _easyocr
78
+ except Exception:
79
+ _easyocr = None
80
+
81
+ try:
82
+ import pytesseract as _pyt
83
+ except Exception:
84
+ _pyt = None
85
+
86
+ try:
87
+ import cv2
88
+ except Exception:
89
+ cv2 = None
90
+
91
+
92
 
93
def has_llm() -> bool:
    """Report whether an LLM path is usable.

    Requires baseline mode to be off, the openai package to be importable,
    and OPENAI_API_KEY to have been set at startup.
    """
    if FORCE_BASELINE:
        return False
    return _openai is not None and _has_openai_key
 
395
  # Nothing available
396
  raise RuntimeError("No ASR backend available (install faster-whisper or openai-whisper).")
397
 
398
def _ocr_with_tesseract(frames: list[str], langs_csv: str, max_images: int | None) -> list[str]:
    """OCR frames with Tesseract as a fallback backend.

    Args:
        frames: Paths of frame images to read.
        langs_csv: Comma-separated language codes, mapped via _tess_langs().
        max_images: Cap on frames consumed (None/0 = all frames).

    Returns:
        Normalized text lines of length >= 3, case-insensitively deduplicated
        (first occurrence kept). Empty list when pytesseract or cv2 is missing.
    """
    if _pyt is None or cv2 is None:
        return []
    lang = _tess_langs(langs_csv)
    # Every frame consumed counts toward the cap, readable or not, so a
    # plain slice reproduces the budget exactly.
    batch = frames[:max_images] if max_images else frames
    collected: list[str] = []
    for frame_path in batch:
        img = preprocess_for_ocr(frame_path)
        if img is None:
            continue
        try:
            blob = _pyt.image_to_string(img, lang=lang)  # one text blob per frame
        except Exception:
            blob = ""
        for candidate in (blob or "").splitlines():
            candidate = normalize_ws(candidate)
            if len(candidate) >= 3:
                collected.append(candidate)
    # Dedupe, ignoring case, preserving first-seen casing and order.
    seen: set[str] = set()
    unique: list[str] = []
    for text in collected:
        key = text.lower()
        if key not in seen:
            seen.add(key)
            unique.append(text)
    return unique
425
+
426
+
427
 
428
  def download_video(url: str, out_dir: str = "videos") -> str:
429
  # yt-dlp is installed via requirements; call binary
 
450
  cv2.THRESH_BINARY, 31, 9)
451
  return th
452
 
453
def run_ocr_on_frames(frames: list[str], languages: list[str] | str = "en", gpu: bool | None = None, max_images: int | None = None) -> list[str]:
    """OCR sampled frames, preferring EasyOCR with a Tesseract fallback.

    Args:
        frames: Paths of frame images.
        languages: Language codes as a list or a comma-separated string.
        gpu: Force GPU on/off for EasyOCR; None = auto-detect.
        max_images: Cap on frames consumed per backend (None/0 = all).

    Returns:
        Deduplicated text lines from the first backend that yields anything;
        empty list when no backend is available or nothing is recognized.
    """
    # Normalize the language spec to a CSV string.
    if isinstance(languages, list):
        langs_csv = ",".join(languages)
    else:
        langs_csv = languages or "en"

    # Backend 1: EasyOCR (if importable alongside cv2).
    if _easyocr is not None and cv2 is not None:
        try:
            use_gpu = gpu
            if use_gpu is None:
                # Assume GPU on HF Spaces or where nvidia-smi is on PATH.
                use_gpu = bool(os.environ.get("SPACE_ID") or shutil.which("nvidia-smi"))
            codes = [code.strip() for code in langs_csv.split(",") if code.strip()]
            reader = _easyocr.Reader(codes, gpu=use_gpu)
            # Both the unreadable-frame and the processed-frame paths consume
            # the budget, so slicing matches the original counter exactly.
            batch = frames[:max_images] if max_images else frames
            lines: list[str] = []
            for frame_path in batch:
                img = preprocess_for_ocr(frame_path)
                if img is None:
                    continue
                for _bbox, raw_text, confidence in reader.readtext(img):
                    cleaned = normalize_ws(raw_text)
                    if cleaned and confidence >= 0.35:
                        lines.append(cleaned)
            # Case-insensitive dedupe, first occurrence wins.
            seen: set[str] = set()
            deduped: list[str] = []
            for line in lines:
                key = line.lower()
                if key not in seen:
                    seen.add(key)
                    deduped.append(line)
            if deduped:
                return deduped
        except Exception:
            pass  # EasyOCR failed at runtime; fall through to Tesseract.

    # Backend 2: Tesseract fallback.
    fallback = _ocr_with_tesseract(frames, langs_csv, max_images)
    if fallback:
        return fallback

    # No backend produced anything.
    return []
496
+
497
 
498
  def aggregate_text(asr_text: str, ocr_lines: List[str]) -> str:
499
  parts = []
 
520
  frames_dir = os.path.join(workdir, "frames")
521
  frames = sample_frames_ffmpeg(vp, out_dir=frames_dir, fps=fps)
522
  langs = [x.strip() for x in ocr_langs.split(",") if x.strip()]
523
+ ocr_langs_csv = ",".join(langs)
524
+ ocr_lines = run_ocr_on_frames(frames, languages=ocr_langs_csv, gpu=None, max_images=int(max_ocr_images))
525
+ if not ocr_lines:
526
+ ocr_lines = ["[OCR skipped: no backend available]"]
527
+
528
  open(os.path.join(workdir, "transcript_ocr.txt"), "w").write("\n".join(ocr_lines))
529
  agg = aggregate_text(asr_text, ocr_lines)
530
  open(os.path.join(workdir, "transcript_aggregated.txt"), "w").write(agg)