nesticot committed on
Commit
e8f7ce1
·
verified ·
1 Parent(s): dd3b346

Upload api_scraper.py

Browse files
Files changed (1) hide show
  1. api_scraper.py +61 -40
api_scraper.py CHANGED
@@ -100,20 +100,25 @@ class MLB_Scrape:
100
  # Make API call to retrieve game schedule
101
  game_call = requests.get(url=f'https://statsapi.mlb.com/api/v1/schedule/?sportId={sport_id_str}&gameTypes={game_type_str}&season={year_input_str}&hydrate=lineup,players').json()
102
  try:
103
- # Extract relevant data from the API response
104
- game_list = [item for sublist in [[y['gamePk'] for y in x['games']] for x in game_call['dates']] for item in sublist]
105
- time_list = [item for sublist in [[y['gameDate'] for y in x['games']] for x in game_call['dates']] for item in sublist]
106
- date_list = [item for sublist in [[y['officialDate'] for y in x['games']] for x in game_call['dates']] for item in sublist]
107
- away_team_list = [item for sublist in [[y['teams']['away']['team']['name'] for y in x['games']] for x in game_call['dates']] for item in sublist]
108
- away_team_id_list = [item for sublist in [[y['teams']['away']['team']['id'] for y in x['games']] for x in game_call['dates']] for item in sublist]
109
- home_team_list = [item for sublist in [[y['teams']['home']['team']['name'] for y in x['games']] for x in game_call['dates']] for item in sublist]
110
- home_team_id_list = [item for sublist in [[y['teams']['home']['team']['id'] for y in x['games']] for x in game_call['dates']] for item in sublist]
111
- state_list = [item for sublist in [[y['status']['codedGameState'] for y in x['games']] for x in game_call['dates']] for item in sublist]
112
- venue_id = [item for sublist in [[y['venue']['id'] for y in x['games']] for x in game_call['dates']] for item in sublist]
113
- venue_name = [item for sublist in [[y['venue']['name'] for y in x['games']] for x in game_call['dates']] for item in sublist]
114
- gameday_type = [item for sublist in [[y['gamedayType'] for y in x['games']] for x in game_call['dates']] for item in sublist]
115
- # Create a Polars DataFrame with the extracted data
116
-
 
 
 
 
 
117
 
118
  # Create a Polars DataFrame with the extracted data
119
  game_df = pl.DataFrame(data={'game_id': game_list,
@@ -154,40 +159,51 @@ class MLB_Scrape:
154
  return game_df
155
 
156
 
157
- # def get_data(self, game_list_input: list):
158
- # """
159
- # Retrieves live game data for a list of game IDs in parallel.
160
-
161
- # Parameters:
162
- # - game_list_input (list): A list of game IDs for which to retrieve live data.
163
-
164
- # Returns:
165
- # - data_total (list): A list of JSON responses containing live game data for each game ID.
166
- # """
167
- # data_total = []
168
- # print('This May Take a While. Progress Bar shows Completion of Data Retrieval.')
169
-
170
- # def fetch_data(game_id):
171
- # r = requests.get(f'https://statsapi.mlb.com/api/v1.1/game/{game_id}/feed/live')
172
- # return r.json()
173
 
174
- # with ThreadPoolExecutor() as executor:
175
- # futures = {executor.submit(fetch_data, game_id): game_id for game_id in game_list_input}
176
- # for future in tqdm(as_completed(futures), total=len(futures), desc="Processing", unit="iteration"):
177
- # data_total.append(future.result())
178
 
179
- # return data_total
180
-
181
-
182
def get_data(self, game_list_input=None, timeout: float = 30):
    """
    Retrieves live game data for a list of game IDs, one request at a time.

    Parameters:
    - game_list_input (list): Game IDs for which to retrieve live data.
      When omitted, the single game ID 748540 is used.
    - timeout (float): Seconds passed to requests.get as its timeout, so a
      stalled connection fails instead of blocking indefinitely.

    Returns:
    - data_total (list): The decoded JSON response for each game ID, in input order.
    """
    # A None sentinel avoids sharing one mutable default list across calls
    if game_list_input is None:
        game_list_input = [748540]

    data_total = []
    print('This May Take a While. Progress Bar shows Completion of Data Retrieval.')

    # Iterate the IDs directly instead of indexing through range(len(...))
    for game_id in tqdm(game_list_input, desc="Processing", unit="iteration"):
        r = requests.get(f'https://statsapi.mlb.com/api/v1.1/game/{game_id}/feed/live', timeout=timeout)
        data_total.append(r.json())

    return data_total
190
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
  def get_data_df(self, data_list):
193
  """
@@ -232,6 +248,7 @@ class MLB_Scrape:
232
  strikes_after = []
233
  balls_after = []
234
  outs_after = []
 
235
 
236
  start_speed = []
237
  end_speed = []
@@ -339,7 +356,8 @@ class MLB_Scrape:
339
  else:
340
  is_swing.append(None)
341
  is_whiff.append(None)
342
-
 
343
  is_ball.append(ab_list['playEvents'][n]['details']['isOut'] if 'isOut' in ab_list['playEvents'][n]['details'] else None)
344
  is_review.append(ab_list['playEvents'][n]['details']['hasReview'] if 'hasReview' in ab_list['playEvents'][n]['details'] else None)
345
  pitch_type.append(ab_list['playEvents'][n]['details']['type']['code'] if 'type' in ab_list['playEvents'][n]['details'] else None)
@@ -431,6 +449,7 @@ class MLB_Scrape:
431
  vb.append(None)
432
  ivb.append(None)
433
  hb.append(None)
 
434
 
435
  if 'hitData' in ab_list['playEvents'][n]:
436
  launch_speed.append(ab_list['playEvents'][n]['hitData']['launchSpeed'] if 'launchSpeed' in ab_list['playEvents'][n]['hitData'] else None)
@@ -514,6 +533,7 @@ class MLB_Scrape:
514
  is_review.append(None)
515
  pitch_type.append(None)
516
  pitch_description.append(None)
 
517
  strikes.append(ab_list['playEvents'][n]['count']['balls'] if 'balls' in ab_list['playEvents'][n]['count'] else None)
518
  balls.append(ab_list['playEvents'][n]['count']['strikes'] if 'strikes' in ab_list['playEvents'][n]['count'] else None)
519
  outs.append(ab_list['playEvents'][n]['count']['outs'] if 'outs' in ab_list['playEvents'][n]['count'] else None)
@@ -592,6 +612,7 @@ class MLB_Scrape:
592
  'pitcher_team':pitcher_team,
593
  'pitcher_team_id':pitcher_team_id,
594
  'ab_number':ab_number,
 
595
  'play_description':play_description,
596
  'play_code':play_code,
597
  'in_play':in_play,
 
100
  # Make API call to retrieve game schedule
101
  game_call = requests.get(url=f'https://statsapi.mlb.com/api/v1/schedule/?sportId={sport_id_str}&gameTypes={game_type_str}&season={year_input_str}&hydrate=lineup,players').json()
102
  try:
103
def safe_get(d, keys, default=np.nan):
    """
    Safely retrieve a value from nested dictionaries.

    Parameters:
    - d (dict): The outermost dictionary to search.
    - keys (list): The sequence of keys to follow, outermost first.
    - default: Value returned when the path cannot be followed to its end.

    Returns:
    - The value found after following every key in `keys`, or `default` if
      any key is missing or an intermediate value is not a dictionary.
    """
    current = d
    for key in keys:
        # Give up as soon as the path breaks; never hand back a partially
        # resolved intermediate value as if it were the requested leaf
        if not isinstance(current, dict) or key not in current:
            return default
        current = current[key]
    return current
110
+
111
+ game_list = [item for sublist in [[y.get('gamePk', np.nan) for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
112
+ time_list = [item for sublist in [[y.get('gameDate', np.nan) for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
113
+ date_list = [item for sublist in [[y.get('officialDate', np.nan) for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
114
+ away_team_list = [item for sublist in [[safe_get(y, ['teams', 'away', 'team', 'name'], "") for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
115
+ away_team_id_list = [item for sublist in [[safe_get(y, ['teams', 'away', 'team', 'id'], np.nan) for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
116
+ home_team_list = [item for sublist in [[safe_get(y, ['teams', 'home', 'team', 'name'], "") for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
117
+ home_team_id_list = [item for sublist in [[safe_get(y, ['teams', 'home', 'team', 'id'], np.nan) for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
118
+ state_list = [item for sublist in [[safe_get(y, ['status', 'codedGameState'], "") for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
119
+ venue_id = [item for sublist in [[safe_get(y, ['venue', 'id'], np.nan) for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
120
+ venue_name = [item for sublist in [[safe_get(y, ['venue', 'name'], "") for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
121
+ gameday_type = [item for sublist in [[safe_get(y, ['gamedayType'], "") for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
122
 
123
  # Create a Polars DataFrame with the extracted data
124
  game_df = pl.DataFrame(data={'game_id': game_list,
 
159
  return game_df
160
 
161
 
162
def get_data(self, game_list_input: list, timeout: float = 30):
    """
    Retrieves live game data for a list of game IDs, one request at a time.

    Parameters:
    - game_list_input (list): A list of game IDs for which to retrieve live data.
    - timeout (float): Seconds passed to requests.get as its timeout, so a
      stalled connection fails instead of blocking indefinitely.

    Returns:
    - data_total (list): A list of JSON responses containing live game data
      for each game ID, in the same order as game_list_input.
    """
    data_total = []
    print('This May Take a While. Progress Bar shows Completion of Data Retrieval.')

    # Iterate the game IDs directly (with a progress bar) instead of
    # indexing through range(len(...))
    for game_id in tqdm(game_list_input, desc="Processing", unit="iteration"):
        # Make a GET request to the MLB API for this game ID
        r = requests.get(f'https://statsapi.mlb.com/api/v1.1/game/{game_id}/feed/live', timeout=timeout)
        # Append the JSON response to the data_total list
        data_total.append(r.json())

    return data_total
183
 
184
def get_data_new(self, game_list_input: list, timeout: float = 30):
    """
    Retrieves live game data for a list of game IDs in parallel.

    Parameters:
    - game_list_input (list): A list of game IDs for which to retrieve live data.
    - timeout (float): Seconds passed to requests.get as its timeout, so a
      stalled connection fails instead of blocking a worker indefinitely.

    Returns:
    - data_total (list): A list of JSON responses containing live game data
      for each game ID, in the same order as game_list_input.
    """
    print('This May Take a While. Progress Bar shows Completion of Data Retrieval.')

    def fetch_data(game_id):
        r = requests.get(f'https://statsapi.mlb.com/api/v1.1/game/{game_id}/feed/live', timeout=timeout)
        return r.json()

    game_ids = list(game_list_input)
    # Pre-size the output so each response is stored at its game ID's input
    # position; appending in completion order would make the result order
    # vary from run to run
    data_total = [None] * len(game_ids)

    with ThreadPoolExecutor() as executor:
        # Map each future back to the position of the game ID it fetches
        futures = {executor.submit(fetch_data, game_id): position
                   for position, game_id in enumerate(game_ids)}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing", unit="iteration"):
            data_total[futures[future]] = future.result()

    return data_total
207
 
208
  def get_data_df(self, data_list):
209
  """
 
248
  strikes_after = []
249
  balls_after = []
250
  outs_after = []
251
+ inning = []
252
 
253
  start_speed = []
254
  end_speed = []
 
356
  else:
357
  is_swing.append(None)
358
  is_whiff.append(None)
359
+
360
+ inning.append(ab_list['about']['inning'] if 'inning' in ab_list['about'] else None)
361
  is_ball.append(ab_list['playEvents'][n]['details']['isOut'] if 'isOut' in ab_list['playEvents'][n]['details'] else None)
362
  is_review.append(ab_list['playEvents'][n]['details']['hasReview'] if 'hasReview' in ab_list['playEvents'][n]['details'] else None)
363
  pitch_type.append(ab_list['playEvents'][n]['details']['type']['code'] if 'type' in ab_list['playEvents'][n]['details'] else None)
 
449
  vb.append(None)
450
  ivb.append(None)
451
  hb.append(None)
452
+
453
 
454
  if 'hitData' in ab_list['playEvents'][n]:
455
  launch_speed.append(ab_list['playEvents'][n]['hitData']['launchSpeed'] if 'launchSpeed' in ab_list['playEvents'][n]['hitData'] else None)
 
533
  is_review.append(None)
534
  pitch_type.append(None)
535
  pitch_description.append(None)
536
+ inning.append(None)
537
  strikes.append(ab_list['playEvents'][n]['count']['balls'] if 'balls' in ab_list['playEvents'][n]['count'] else None)
538
  balls.append(ab_list['playEvents'][n]['count']['strikes'] if 'strikes' in ab_list['playEvents'][n]['count'] else None)
539
  outs.append(ab_list['playEvents'][n]['count']['outs'] if 'outs' in ab_list['playEvents'][n]['count'] else None)
 
612
  'pitcher_team':pitcher_team,
613
  'pitcher_team_id':pitcher_team_id,
614
  'ab_number':ab_number,
615
+ 'inning':inning,
616
  'play_description':play_description,
617
  'play_code':play_code,
618
  'in_play':in_play,