Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -61,7 +61,7 @@ whisper = _try_import("whisper")
|
|
61 |
_openai = _try_import("openai")
|
62 |
_has_openai_key = bool(os.environ.get("OPENAI_API_KEY"))
|
63 |
|
64 |
-
# ---- ASR guarded imports
|
65 |
try:
|
66 |
from faster_whisper import WhisperModel as FWWhisperModel
|
67 |
except Exception:
|
@@ -72,6 +72,23 @@ try:
|
|
72 |
except Exception:
|
73 |
OpenAIWhisper = None
|
74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
|
76 |
def has_llm() -> bool:
|
77 |
return (not FORCE_BASELINE) and _openai is not None and _has_openai_key
|
@@ -378,6 +395,35 @@ def run_whisper_asr(audio_path: str, model_size: str = "base", language: Optiona
|
|
378 |
# Nothing available
|
379 |
raise RuntimeError("No ASR backend available (install faster-whisper or openai-whisper).")
|
380 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
381 |
|
382 |
def download_video(url: str, out_dir: str = "videos") -> str:
|
383 |
# yt-dlp is installed via requirements; call binary
|
@@ -404,26 +450,50 @@ def preprocess_for_ocr(img_path: str):
|
|
404 |
cv2.THRESH_BINARY, 31, 9)
|
405 |
return th
|
406 |
|
407 |
-
def
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
|
424 |
-
|
425 |
-
|
426 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
427 |
|
428 |
def aggregate_text(asr_text: str, ocr_lines: List[str]) -> str:
|
429 |
parts = []
|
@@ -450,7 +520,11 @@ def process_video(video_file: Optional[str] = None, video_url: Optional[str] = N
|
|
450 |
frames_dir = os.path.join(workdir, "frames")
|
451 |
frames = sample_frames_ffmpeg(vp, out_dir=frames_dir, fps=fps)
|
452 |
langs = [x.strip() for x in ocr_langs.split(",") if x.strip()]
|
453 |
-
|
|
|
|
|
|
|
|
|
454 |
open(os.path.join(workdir, "transcript_ocr.txt"), "w").write("\n".join(ocr_lines))
|
455 |
agg = aggregate_text(asr_text, ocr_lines)
|
456 |
open(os.path.join(workdir, "transcript_aggregated.txt"), "w").write(agg)
|
|
|
61 |
_openai = _try_import("openai")
|
62 |
_has_openai_key = bool(os.environ.get("OPENAI_API_KEY"))
|
63 |
|
64 |
+
# ---- ASR guarded imports
|
65 |
try:
|
66 |
from faster_whisper import WhisperModel as FWWhisperModel
|
67 |
except Exception:
|
|
|
72 |
except Exception:
|
73 |
OpenAIWhisper = None
|
74 |
|
75 |
+
# ---- OCR guarded imports
|
76 |
+
try:
|
77 |
+
import easyocr as _easyocr
|
78 |
+
except Exception:
|
79 |
+
_easyocr = None
|
80 |
+
|
81 |
+
try:
|
82 |
+
import pytesseract as _pyt
|
83 |
+
except Exception:
|
84 |
+
_pyt = None
|
85 |
+
|
86 |
+
try:
|
87 |
+
import cv2
|
88 |
+
except Exception:
|
89 |
+
cv2 = None
|
90 |
+
|
91 |
+
|
92 |
|
93 |
def has_llm() -> bool:
|
94 |
return (not FORCE_BASELINE) and _openai is not None and _has_openai_key
|
|
|
395 |
# Nothing available
|
396 |
raise RuntimeError("No ASR backend available (install faster-whisper or openai-whisper).")
|
397 |
|
398 |
+
def _ocr_with_tesseract(frames: list[str], langs_csv: str, max_images: int | None) -> list[str]:
    """Run Tesseract over frame images and return the distinct text lines found.

    Returns an empty list when pytesseract or OpenCV could not be imported.
    Frames are visited in order, and every visited frame counts toward
    ``max_images`` -- including frames whose preprocessing yields ``None``.
    A falsy ``max_images`` (``None`` or ``0``) disables the cap.

    Lines shorter than three characters after ``normalize_ws`` are dropped.
    Duplicates are removed case-insensitively, keeping the first spelling
    seen and preserving first-seen order.
    """
    if _pyt is None or cv2 is None:
        return []

    tess_lang = _tess_langs(langs_csv)

    # Lower-cased line -> first spelling encountered. Dict insertion order
    # gives the same first-seen ordering as a list plus a "seen" set.
    first_seen = {}

    for index, frame_path in enumerate(frames):
        # The frame index equals the number of frames already visited.
        if max_images and index >= max_images:
            break

        image = preprocess_for_ocr(frame_path)
        if image is None:
            continue

        try:
            blob = _pyt.image_to_string(image, lang=tess_lang)
        except Exception:
            # Best effort: a frame Tesseract cannot read contributes no text.
            blob = ""

        for raw_line in (blob or "").splitlines():
            cleaned = normalize_ws(raw_line)
            if len(cleaned) >= 3:
                first_seen.setdefault(cleaned.lower(), cleaned)

    return list(first_seen.values())
|
425 |
+
|
426 |
+
|
427 |
|
428 |
def download_video(url: str, out_dir: str = "videos") -> str:
|
429 |
# yt-dlp is installed via requirements; call binary
|
|
|
450 |
cv2.THRESH_BINARY, 31, 9)
|
451 |
return th
|
452 |
|
453 |
+
def run_ocr_on_frames(frames: list[str], languages: list[str] | str = "en", gpu: bool | None = None, max_images: int | None = None) -> list[str]:
    """Extract distinct text lines from frame images, preferring EasyOCR over Tesseract.

    Args:
        frames: Frame image paths, processed in order.
        languages: Language codes, either as a list/tuple or as a
            comma-separated string. An empty value falls back to ``"en"``.
        gpu: Forwarded to ``easyocr.Reader``. When ``None`` it is guessed from
            the environment (``SPACE_ID`` set, or ``nvidia-smi`` on PATH).
        max_images: Cap on the number of frames visited; ``None`` or ``0``
            means no cap. Frames whose preprocessing yields ``None`` still
            count toward the cap.

    Returns:
        Recognized text with case-insensitive duplicates removed (first
        spelling kept, first-seen order). EasyOCR results are returned when it
        produces any text; otherwise the Tesseract fallback result is
        returned, which is ``[]`` when nothing is available or recognized.
    """
    import logging  # local import: only needed on the EasyOCR failure path

    # Normalize the languages input to one comma-separated string. An empty
    # list/tuple must default to English just like an empty string does.
    if isinstance(languages, (list, tuple)):
        langs_csv = ",".join(languages) or "en"
    else:
        langs_csv = languages or "en"

    # 1) Try EasyOCR
    if _easyocr is not None and cv2 is not None:
        try:
            if gpu is None:
                # NOTE(review): SPACE_ID presumably marks a Hugging Face Space,
                # which may be CPU-only -- confirm this heuristic is intended.
                gpu = bool(os.environ.get("SPACE_ID") or shutil.which("nvidia-smi"))

            lang_codes = [code.strip() for code in langs_csv.split(",") if code.strip()]
            reader = _easyocr.Reader(lang_codes, gpu=gpu)

            # Lower-cased text -> first spelling seen; dict order keeps
            # first-seen ordering for the de-duplicated result.
            first_seen = {}
            for index, frame_path in enumerate(frames):
                # The frame index equals the number of frames already visited.
                if max_images and index >= max_images:
                    break
                image = preprocess_for_ocr(frame_path)
                if image is None:
                    continue
                for _bbox, text, confidence in reader.readtext(image):
                    text = normalize_ws(text)
                    # Discard empty and low-confidence detections.
                    if text and confidence >= 0.35:
                        first_seen.setdefault(text.lower(), text)

            if first_seen:
                return list(first_seen.values())
        except Exception:
            # Best effort: record why EasyOCR failed, then fall through to
            # Tesseract instead of hiding the failure entirely.
            logging.getLogger(__name__).warning(
                "EasyOCR failed; falling back to Tesseract", exc_info=True
            )

    # 2) Fallback: Tesseract (yields [] when it is unavailable or finds nothing)
    return _ocr_with_tesseract(frames, langs_csv, max_images) or []
|
496 |
+
|
497 |
|
498 |
def aggregate_text(asr_text: str, ocr_lines: List[str]) -> str:
|
499 |
parts = []
|
|
|
520 |
frames_dir = os.path.join(workdir, "frames")
|
521 |
frames = sample_frames_ffmpeg(vp, out_dir=frames_dir, fps=fps)
|
522 |
langs = [x.strip() for x in ocr_langs.split(",") if x.strip()]
|
523 |
+
ocr_langs_csv = ",".join(langs)
|
524 |
+
ocr_lines = run_ocr_on_frames(frames, languages=ocr_langs_csv, gpu=None, max_images=int(max_ocr_images))
|
525 |
+
if not ocr_lines:
|
526 |
+
ocr_lines = ["[OCR skipped: no backend available]"]
|
527 |
+
|
528 |
open(os.path.join(workdir, "transcript_ocr.txt"), "w").write("\n".join(ocr_lines))
|
529 |
agg = aggregate_text(asr_text, ocr_lines)
|
530 |
open(os.path.join(workdir, "transcript_aggregated.txt"), "w").write(agg)
|