Spaces:

CoreyMorris
/

MMLU-by-task-Leaderboard

Running

MMLU-by-task-Leaderboard / details_data_processor.py

Corey Morris

updated pipeline and init

7f2d984 almost 2 years ago

4.04 kB

	import pandas as pd
	import os
	import fnmatch
	import json
	import re
	import numpy as np
	import requests
	from urllib.parse import quote
	from datetime import datetime
	import uuid



	class DetailsDataProcessor:
	# Download
	#url example https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/64bits/LexPodLM-13B/details_harness%7ChendrycksTest-moral_scenarios%7C5_2023-07-25T13%3A41%3A51.227672.json

	# def __init__(self, directory='results', pattern='results*.json'):
	# self.directory = directory
	# self.pattern = pattern

	def __init__(self, directory='results', pattern='results*.json'):
	self.directory = directory
	self.pattern = pattern
	if not os.path.exists('details_data'):
	os.makedirs('details_data')


	def _find_files(self, directory='results', pattern='results*.json'):
	matching_files = [] # List to hold matching filenames
	for root, dirs, files in os.walk(directory):
	for basename in files:
	if fnmatch.fnmatch(basename, pattern):
	filename = os.path.join(root, basename)
	matching_files.append(filename) # Append the matching filename to the list
	return matching_files # Return the list of matching filenames

	@staticmethod
	def download_file(url, directory='details_data'):
	# Extract relevant parts from the URL
	segments = url.split('/')
	organization = segments[-3]
	model_name = segments[-2]
	task = url.split('%7ChendrycksTest-')[1].split('%7C')[0]

	# Construct the filename
	safe_file_name = f"{organization}_{model_name}_{task}.json"

	# Create the full save file path
	save_file_path = os.path.join(directory, safe_file_name)

	error_count = 0
	success_count = 0
	try:
	# Sending a GET request
	r = requests.get(url, allow_redirects=True)
	r.raise_for_status()

	# Writing the content to the specified file
	with open(save_file_path, 'wb') as file:
	file.write(r.content)
	print(save_file_path)

	success_count += 1
	except requests.ConnectionError as e:
	error_count += 1
	except requests.HTTPError as e:
	error_count += 1
	except FileNotFoundError as e:
	error_count += 1
	except Exception as e:
	error_count += 1

	return error_count, success_count



	@staticmethod
	def single_file_pipeline(url, filename):
	DetailsDataProcessor.download_file(url, filename)
	# read file
	with open(filename) as f:
	data = json.load(f)
	# convert to dataframe
	df = pd.DataFrame(data)
	return df

	@staticmethod
	def build_url(file_path):
	segments = file_path.split('/')
	bits = segments[1]
	model_name = segments[2]

	try:
	timestamp = segments[3].split('_')[1]
	except IndexError:
	print(f"Error: {file_path}")
	return None

	url = f'https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/{bits}/{model_name}/details_harness%7ChendrycksTest-moral_scenarios%7C5_{quote(timestamp, safe="")}'
	return url

	def pipeline(self):
	error_count = 0
	success_count = 0
	file_paths = self._find_files(self.directory, self.pattern)
	for file_path in file_paths:
	print(f"Processing file path: {file_path}")
	url = self.build_url(file_path)
	if url:
	errors, successes = self.download_file(url)
	error_count += errors
	success_count += successes
	else:
	print(f"Error building URL for file path: {file_path}")
	error_count += 1

	print(f"Downloaded {success_count} files successfully. Encountered {error_count} errors.")
	return success_count, error_count