molokhovdmitry committed on
Commit
35581ce
1 Parent(s): af5df29

Add batch prediction, max comment size

Browse files
Files changed (2) hide show
  1. main.py +14 -3
  2. yt_api.py +15 -11
main.py CHANGED
@@ -8,6 +8,8 @@ from models import init_emotions_model
8
 
9
  class Settings(BaseSettings):
10
  YT_API_KEY: str
 
 
11
  model_config = SettingsConfigDict(env_file='.env')
12
 
13
 
@@ -25,12 +27,21 @@ def home():
25
  @app.get('/predict')
26
  def predict(video_id):
27
  # Get comments
28
- comments = get_comments(video_id, settings.YT_API_KEY)
 
 
 
 
29
  comments_df = pd.DataFrame(comments)
30
 
31
- # Predict emotions
32
  text_list = comments_df['text_display'].to_list()
33
- preds = emotions_clf(text_list)
 
 
 
 
 
34
 
35
  # Add predictions to DataFrame
36
  preds_df = []
 
8
 
9
  class Settings(BaseSettings):
10
  YT_API_KEY: str
11
+ PRED_BATCH_SIZE: int
12
+ MAX_COMMENT_SIZE: int
13
  model_config = SettingsConfigDict(env_file='.env')
14
 
15
 
 
27
  @app.get('/predict')
28
  def predict(video_id):
29
  # Get comments
30
+ comments = get_comments(
31
+ video_id,
32
+ settings.MAX_COMMENT_SIZE,
33
+ settings.YT_API_KEY
34
+ )
35
  comments_df = pd.DataFrame(comments)
36
 
37
+ # Predict emotions in batches
38
  text_list = comments_df['text_display'].to_list()
39
+ batch_size = settings.PRED_BATCH_SIZE
40
+ text_batches = [text_list[i:i + batch_size]
41
+ for i in range(0, len(text_list), batch_size)]
42
+ preds = []
43
+ for batch in text_batches:
44
+ preds.extend(emotions_clf(batch))
45
 
46
  # Add predictions to DataFrame
47
  preds_df = []
yt_api.py CHANGED
@@ -1,19 +1,19 @@
1
  import requests
2
- # from pprint import pprint
3
 
4
 
5
- def get_comments(video_id, api_key):
6
  """Yields all `commentThreads` from a YouTube video in batches."""
7
 
8
- # Get comments from the first page.
9
  response = get_response(video_id, api_key, max_results=100)
10
- comment_list = response_to_comments(response)
11
 
12
- # Get comments from the other pages.
13
  while 'nextPageToken' in response.keys():
14
  response = get_response(
15
  video_id, api_key, page_token=response['nextPageToken'])
16
- comment_list += (response_to_comments(response))
17
 
18
  return comment_list
19
 
@@ -32,16 +32,20 @@ def get_response(video_id, api_key, page_token=None, max_results=100):
32
  return response.json()
33
 
34
 
35
- def response_to_comments(response):
36
  """Converts JSON response to `comment_list` dict."""
37
  comment_list = []
38
- for comment in response['items']:
39
- comment = comment['snippet']
40
  can_reply = comment['canReply']
41
  total_reply_count = comment['totalReplyCount']
42
  comment = comment['topLevelComment']
43
  comment_id = comment['id']
44
  comment = comment['snippet']
 
 
 
 
45
  try:
46
  comment_list.append({
47
  'comment_id': comment_id,
@@ -57,7 +61,7 @@ def response_to_comments(response):
57
  'total_reply_count': total_reply_count,
58
  })
59
  except Exception as e:
60
- print(f"Error: {e}\nComment: {comment}")
61
- continue
62
 
63
  return comment_list
 
1
  import requests
2
+ from pprint import pprint
3
 
4
 
5
def get_comments(video_id, max_comment_size, api_key):
    """Return all top-level `commentThreads` of a YouTube video.

    Pages through the API until no `nextPageToken` remains. Comments whose
    `textDisplay` exceeds `max_comment_size` characters are dropped by
    `response_to_comments`.

    Args:
        video_id: YouTube video ID.
        max_comment_size: Maximum allowed comment length in characters.
        api_key: YouTube Data API key.

    Returns:
        list[dict]: One dict per kept comment.
    """
    # First page of results (API maximum of 100 per page).
    response = get_response(video_id, api_key, max_results=100)
    comment_list = response_to_comments(response, max_comment_size)

    # Follow pagination until the API stops returning a nextPageToken.
    while 'nextPageToken' in response:
        response = get_response(
            video_id, api_key, page_token=response['nextPageToken'])
        comment_list += response_to_comments(response, max_comment_size)

    return comment_list
19
 
 
32
  return response.json()
33
 
34
 
35
+ def response_to_comments(response, max_comment_size):
36
  """Converts JSON response to `comment_list` dict."""
37
  comment_list = []
38
+ for full_comment in response['items']:
39
+ comment = full_comment['snippet']
40
  can_reply = comment['canReply']
41
  total_reply_count = comment['totalReplyCount']
42
  comment = comment['topLevelComment']
43
  comment_id = comment['id']
44
  comment = comment['snippet']
45
+
46
+ # Skip if comment is too long
47
+ if len(comment['textDisplay']) > max_comment_size:
48
+ continue
49
  try:
50
  comment_list.append({
51
  'comment_id': comment_id,
 
61
  'total_reply_count': total_reply_count,
62
  })
63
  except Exception as e:
64
+ print(f"Error: {e}\nComment:")
65
+ pprint(full_comment)
66
 
67
  return comment_list