parakeet-tdt_ctc-1.1b_new

Runtime error

App Files Files Community

parakeet-tdt_ctc-1.1b_new / hf_utils.py

msekoyan

enhanced the app

ee0e042 11 months ago

raw

history blame contribute delete

3.14 kB

	import re
	import iso639
	from tqdm import tqdm
	from functools import lru_cache
	from huggingface_hub import list_models, hf_hub_download, snapshot_download
	from collections import defaultdict

	# Architecture-name substrings: get_models_list skips every model whose id
	# contains none of these.
	ARCH_TO_INCLUDE = ['parakeet', 'conformer', 'fastconformer']

	def get_models_list(sort_alphabetically=True):
	    """List NVIDIA speech-recognition models from the Hub, grouped by language.

	    Calls ``list_models`` for author ``nvidia`` and task
	    ``automatic-speech-recognition`` (sorted by downloads, with card data)
	    and keeps only the models whose id contains one of the
	    ``ARCH_TO_INCLUDE`` substrings.

	    Args:
	        sort_alphabetically: When true, each model's language names and the
	            language keys of the first returned mapping are sorted.

	    Returns:
	        A ``(lang_models, models_langs)`` tuple. ``lang_models`` maps a
	        resolved language name to the ids (``nvidia/`` removed) of the models
	        tagged with it. ``models_langs`` maps every kept model id to the list
	        of language names that could be resolved for it (possibly empty).
	    """
	    lang_models = defaultdict(list)
	    models_langs = {}
	    models = list_models(
	        author='nvidia',
	        task="automatic-speech-recognition",
	        sort='downloads',
	        cardData=True,
	    )

	    for model in models:
	        model_id = model.modelId.replace('nvidia/', '')

	        if not any(arch in model_id for arch in ARCH_TO_INCLUDE):
	            continue

	        # A model may have no card data at all, or a null/empty ``language``
	        # field; treat both the same as an unknown language instead of crashing.
	        card_data = model.cardData if model.cardData is not None else {}
	        language_tags = card_data.get('language') or ['Unknown']
	        if isinstance(language_tags, str):
	            language_tags = [language_tags]

	        lang_names = []
	        for language in language_tags:
	            try:
	                lang_name = iso639.Language.match(language).name
	            except Exception:
	                # Best effort: any tag that cannot be resolved is filed under
	                # 'Unknown', which is dropped from lang_models below.
	                lang_name = 'Unknown'
	            else:
	                lang_names.append(lang_name)
	            lang_models[lang_name].append(model_id)

	        if sort_alphabetically:
	            lang_names = sorted(lang_names)

	        models_langs[model_id] = lang_names

	    lang_models.pop('Unknown', None)

	    if sort_alphabetically:
	        lang_models = dict(sorted(lang_models.items()))
	    return lang_models, models_langs


	def extract_section_from_readme(content):
	    """Extract the introductory description from a model card README.

	    The description is looked up in two ways, in order:

	    1. the text that follows the "Model architecture" shields.io badge line(s)
	       and the blank line after them, up to the next ``##``;
	    2. otherwise, the text between the first ``# `` heading (plus a blank
	       line) and the next ``## `` heading.

	    The extracted text is then cleaned: everything from "See" to the end of
	    its line is removed, every line mentioning "Riva" is removed, and
	    bracketed numeric references such as ``[1]`` are stripped.

	    Args:
	        content: Full README text. ``None`` or an empty string yields ``None``.

	    Returns:
	        The cleaned description, or ``None`` when neither pattern matches.
	    """
	    if not content:
	        return None

	    # Badge line: [![Model architecture](https://img.shields.io...)]... followed
	    # by a blank line. The parentheses must be escaped literals.
	    start_marker = r"\[\!\[Model architecture\]\(https://img\.shields\.io.*?\)\].*?\n\n"
	    end_marker = r"##"

	    # Use regex to capture content between start_marker and end_marker
	    match = re.search(f"{start_marker}(.*?){end_marker}", content, re.DOTALL)

	    if not match:
	        # No badge block: fall back to the text under the top-level title.
	        match = re.search(r"# .+?\n\n(.*?)(?=\n## )", content, re.DOTALL)

	    if not match:
	        return None

	    section = match.group(1).strip()

	    # Drop "See ..." through end of line, and whole lines mentioning Riva.
	    section = re.sub(r"(See.*$|.*\bRiva\b.*$)", "", section, flags=re.MULTILINE).strip()

	    # Remove numbers in square brackets (e.g., [1], [2])
	    section = re.sub(r"\[\d+\]", "", section).strip()

	    return section

	@lru_cache(maxsize=3)
	def get_model_description(model_name):
	    """Fetch a model's README from the Hub and return its short description.

	    Results are memoized for the three most recently requested names.

	    Args:
	        model_name: Model id, with or without the ``nvidia/`` prefix.

	    Returns:
	        A ``(extracted_section, more_info)`` tuple: the description produced
	        by ``extract_section_from_readme`` (``None`` when nothing could be
	        extracted) and a Markdown sentence linking to the model page.
	    """
	    # Qualify bare ids; a prefix test avoids being fooled by 'nvidia/'
	    # appearing elsewhere in the name.
	    if not model_name.startswith('nvidia/'):
	        model_name = 'nvidia/' + model_name

	    readme_path = hf_hub_download(repo_id=model_name, filename="README.md")
	    with open(readme_path, "r", encoding="utf-8") as file:
	        readme_content = file.read()
	    extracted_section = extract_section_from_readme(readme_content)

	    more_info = f"See more on the selected model on [{model_name}](https://huggingface.co/{model_name})."

	    return extracted_section, more_info


	def predownload_models(models, top=None):
	    """Download repository snapshots for the given models ahead of time.

	    Args:
	        models: Sliceable sequence of model ids, with or without the
	            ``nvidia/`` prefix.
	        top: When truthy, only the first ``top`` entries of ``models`` are
	            downloaded; ``None`` or ``0`` downloads all of them.
	    """
	    if top:
	        models = models[:top]
	    for model_name in tqdm(models):
	        # Qualify bare ids only, mirroring get_model_description, so that an
	        # already-qualified id does not become 'nvidia/nvidia/...'.
	        if not model_name.startswith('nvidia/'):
	            model_name = 'nvidia/' + model_name
	        snapshot_download(model_name)