yonikremer committed on
Commit
99ced83
·
1 Parent(s): a0650fb

added docs

Browse files
Files changed (1) hide show
  1. supported_models.py +17 -10
supported_models.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import Generator, Set, Union, List
2
 
3
  import requests
4
  from bs4 import BeautifulSoup, Tag, NavigableString, PageElement
@@ -19,36 +19,41 @@ DEFAULT_MIN_NUMBER_OF_LIKES = 20
19
 
20
 
21
  def get_model_name(model_card: Tag) -> str:
 
22
  h4_class = "text-md truncate font-mono text-black dark:group-hover:text-yellow-500 group-hover:text-indigo-600"
23
  h4_tag = model_card.find("h4", class_=h4_class)
24
  return h4_tag.text
25
 
26
 
27
- def is_a_number(elem: PageElement) -> bool:
28
- text = elem.text
 
 
 
29
  lowered_text = text.strip().lower()
30
  no_characters_text = lowered_text.replace("k", "").replace("m", "").replace("b", "")
31
- elem = no_characters_text.replace(",", "").replace(".", "")
32
  try:
33
- float(elem)
34
  except ValueError:
35
  return False
36
  return True
37
 
38
 
39
  def get_numeric_contents(model_card: Tag) -> List[PageElement]:
 
40
  div: Union[Tag | NavigableString] = model_card.find(
41
  "div",
42
  class_="mr-1 flex items-center overflow-hidden whitespace-nowrap text-sm leading-tight text-gray-400",
43
  recursive=True
44
  )
45
  contents: List[PageElement] = div.contents
46
- contents_without_tags: List[PageElement] = [content for content in contents if not isinstance(content, Tag)]
47
- number_contents: List[PageElement] = [content for content in contents_without_tags if is_a_number(content)]
48
  return number_contents
49
 
50
 
51
  def convert_to_int(element: PageElement) -> int:
 
52
  element_str = element.text.strip().lower()
53
  if element_str.endswith("k"):
54
  return int(float(element_str[:-1]) * 1_000)
@@ -56,11 +61,11 @@ def convert_to_int(element: PageElement) -> int:
56
  return int(float(element_str[:-1]) * 1_000_000)
57
  elif element_str.endswith("b"):
58
  return int(float(element_str[:-1]) * 1_000_000_000)
59
- else:
60
- return int(element_str)
61
 
62
 
63
- def get_page(page_index: int):
 
64
  curr_page_url = f"{SUPPORTED_MODEL_NAME_PAGES_FORMAT}&p={page_index}"
65
  response = requests.get(curr_page_url)
66
  if response.status_code == 200:
@@ -75,6 +80,7 @@ def card_filter(
75
  min_number_of_downloads: int,
76
  min_number_of_likes: int,
77
  ) -> bool:
 
78
  if model_name in BLACKLISTED_MODEL_NAMES:
79
  return False
80
  numeric_contents = get_numeric_contents(model_card)
@@ -96,6 +102,7 @@ def get_model_names(
96
  min_number_of_downloads: int,
97
  min_number_of_likes: int,
98
  ) -> Generator[str, None, None]:
 
99
  model_cards: List[Tag] = soup.find_all("article", class_="overview-card-wrapper group", recursive=True)
100
  for model_card in model_cards:
101
  model_name = get_model_name(model_card)
 
1
+ from typing import Generator, Set, Union, List, Optional
2
 
3
  import requests
4
  from bs4 import BeautifulSoup, Tag, NavigableString, PageElement
 
19
 
20
 
21
  def get_model_name(model_card: Tag) -> str:
22
+ """returns the model name from the model card tag"""
23
  h4_class = "text-md truncate font-mono text-black dark:group-hover:text-yellow-500 group-hover:text-indigo-600"
24
  h4_tag = model_card.find("h4", class_=h4_class)
25
  return h4_tag.text
26
 
27
 
28
+ def is_a_number(element: Union[PageElement, Tag]) -> bool:
29
+ """returns True if the element is a number, False otherwise"""
30
+ if isinstance(element, Tag):
31
+ return False
32
+ text = element.text
33
  lowered_text = text.strip().lower()
34
  no_characters_text = lowered_text.replace("k", "").replace("m", "").replace("b", "")
35
+ element = no_characters_text.replace(",", "").replace(".", "")
36
  try:
37
+ float(element)
38
  except ValueError:
39
  return False
40
  return True
41
 
42
 
43
  def get_numeric_contents(model_card: Tag) -> List[PageElement]:
44
+ """returns the number of likes and downloads from the model card tag if they exist in the model card"""
45
  div: Union[Tag | NavigableString] = model_card.find(
46
  "div",
47
  class_="mr-1 flex items-center overflow-hidden whitespace-nowrap text-sm leading-tight text-gray-400",
48
  recursive=True
49
  )
50
  contents: List[PageElement] = div.contents
51
+ number_contents: List[PageElement] = [content for content in contents if is_a_number(content)]
 
52
  return number_contents
53
 
54
 
55
  def convert_to_int(element: PageElement) -> int:
56
+ """converts the element to an int"""
57
  element_str = element.text.strip().lower()
58
  if element_str.endswith("k"):
59
  return int(float(element_str[:-1]) * 1_000)
 
61
  return int(float(element_str[:-1]) * 1_000_000)
62
  elif element_str.endswith("b"):
63
  return int(float(element_str[:-1]) * 1_000_000_000)
64
+ return int(element_str)
 
65
 
66
 
67
+ def get_page(page_index: int) -> Optional[BeautifulSoup]:
68
+ """returns the page with the given index if it exists, None otherwise"""
69
  curr_page_url = f"{SUPPORTED_MODEL_NAME_PAGES_FORMAT}&p={page_index}"
70
  response = requests.get(curr_page_url)
71
  if response.status_code == 200:
 
80
  min_number_of_downloads: int,
81
  min_number_of_likes: int,
82
  ) -> bool:
83
+ """returns True if the model card is valid, False otherwise"""
84
  if model_name in BLACKLISTED_MODEL_NAMES:
85
  return False
86
  numeric_contents = get_numeric_contents(model_card)
 
102
  min_number_of_downloads: int,
103
  min_number_of_likes: int,
104
  ) -> Generator[str, None, None]:
105
+ """Scrapes the model names from the given soup"""
106
  model_cards: List[Tag] = soup.find_all("article", class_="overview-card-wrapper group", recursive=True)
107
  for model_card in model_cards:
108
  model_name = get_model_name(model_card)