# Captured page header: author msekoyan, commit ee0e042 ("enhanced the app").
import re
import iso639
from tqdm import tqdm
from functools import lru_cache
from huggingface_hub import list_models, hf_hub_download, snapshot_download
from collections import defaultdict
# Substrings looked for in a model id; get_models_list() skips any model whose
# id contains none of them. Matching is by substring, so 'conformer' already
# covers ids that contain 'fastconformer'.
ARCH_TO_INCLUDE = ['parakeet', 'conformer', 'fastconformer']
def get_models_list(sort_alphabetically=True):
    """Group NVIDIA speech-recognition models from the Hub by language.

    Lists models with author ``nvidia`` and task
    ``automatic-speech-recognition`` (requested sorted by downloads), keeps
    those whose id contains one of the ``ARCH_TO_INCLUDE`` substrings, and
    resolves each model's ``language`` card tags to language names through
    ``iso639.Language.match``.

    Args:
        sort_alphabetically: When true, each model's language list and the
            language -> models mapping are sorted alphabetically. When false,
            the order in which the Hub returned the models is kept.

    Returns:
        A ``(lang_models, models_langs)`` tuple.
        ``lang_models`` maps a language name to the list of model ids (without
        the ``nvidia/`` prefix) tagged with it; the ``'Unknown'`` bucket is
        dropped. It is a plain ``dict`` when sorted, otherwise the
        ``defaultdict`` used to build it.
        ``models_langs`` maps each kept model id to its resolved language
        names (possibly an empty list).
    """
    lang_models = defaultdict(list)
    models_langs = {}
    models = list_models(author='nvidia',
                         task="automatic-speech-recognition",
                         sort='downloads',
                         cardData=True)
    for model in models:
        model_id = model.modelId.replace('nvidia/', '')
        if not any(arch in model_id for arch in ARCH_TO_INCLUDE):
            continue

        # cardData can be None when the Hub returns no card metadata for a
        # repo, and the 'language' key can be present but empty/None. Treat
        # all of these as "language unknown" instead of crashing.
        card_data = model.cardData
        language_tags = card_data.get('language') if card_data is not None else None
        if not language_tags:
            language_tags = ['Unknown']
        elif isinstance(language_tags, str):
            language_tags = [language_tags]

        lang_names = []
        for language in language_tags:
            try:
                lang_name = iso639.Language.match(language).name
            except Exception:
                # Best effort: any tag that cannot be resolved is bucketed as
                # 'Unknown'. Exception (not a bare except) lets
                # KeyboardInterrupt / SystemExit propagate.
                lang_name = 'Unknown'
            else:
                lang_names.append(lang_name)
            lang_models[lang_name].append(model_id)

        if sort_alphabetically:
            lang_names = sorted(lang_names)
        models_langs[model_id] = lang_names

    # Unresolvable tags were only collected to keep the loop uniform.
    lang_models.pop('Unknown', None)
    if sort_alphabetically:
        lang_models = dict(sorted(lang_models.items()))
    return lang_models, models_langs
def extract_section_from_readme(content):
    """Pull the introductory description out of a model-card README.

    Two patterns are tried in order:

    1. Text following the "Model architecture" shields.io badge (and the
       first blank line after it), up to the next ``##``.
    2. Otherwise, text following the first ``# `` heading block, up to the
       next ``## `` heading.

    The captured text is then cleaned: on every line, everything from a
    literal ``See`` to the end of the line is removed, lines containing the
    word ``Riva`` (case-sensitive) are blanked, and bracketed numeric
    references such as ``[1]`` are deleted.

    Args:
        content: Full README text.

    Returns:
        The cleaned section as a string, or ``None`` when neither pattern
        matches.
    """
    badge_pattern = r"\[\!\[Model architecture\]\(https://img\.shields\.io.*?\)\].*?\n\n"
    heading_pattern = r"##"

    found = re.search(badge_pattern + r"(.*?)" + heading_pattern, content, re.DOTALL)
    if found is None:
        # No badge block: fall back to the text under the top-level heading.
        found = re.search(r"# .+?\n\n(.*?)(?=\n## )", content, re.DOTALL)
    if found is None:
        return None

    text = found.group(1).strip()
    cleanup_steps = (
        # "See ..." tails and whole lines that mention Riva.
        (r"(See.*$|.*\bRiva\b.*$)", re.MULTILINE),
        # Numeric citation markers, e.g. [1], [2].
        (r"\[\d+\]", 0),
    )
    for pattern, flags in cleanup_steps:
        text = re.sub(pattern, "", text, flags=flags).strip()
    return text
@lru_cache(maxsize=3)
def get_model_description(model_name):
    """Fetch a model's README from the Hub and return its short description.

    Results are memoised for the three most recently used ``model_name``
    values.

    Args:
        model_name: Model id; ``nvidia/`` is prepended when the string does
            not already contain it.

    Returns:
        A ``(description, more_info)`` tuple. ``description`` is whatever
        ``extract_section_from_readme`` returns for the README (may be
        ``None``); ``more_info`` is a Markdown sentence linking to the model
        page on huggingface.co.
    """
    repo_id = model_name if 'nvidia/' in model_name else 'nvidia/' + model_name

    local_readme = hf_hub_download(repo_id=repo_id, filename="README.md")
    with open(local_readme, "r", encoding="utf-8") as handle:
        readme_text = handle.read()

    description = extract_section_from_readme(readme_text)
    link_sentence = f"See more on the selected model on [{repo_id}](https://huggingface.co/{repo_id})."
    return description, link_sentence
def predownload_models(models, top=None):
    """Download repository snapshots for the given NVIDIA models.

    Args:
        models: Sequence of model names without the ``nvidia/`` prefix.
        top: When truthy, only the first ``top`` entries of ``models`` are
            downloaded; a falsy value (``None`` or ``0``) downloads all.
    """
    selected = models[:top] if top else models
    for short_name in tqdm(selected):
        snapshot_download('nvidia/' + short_name)