import json

from huggingface_hub import login
from datasets import load_dataset, Dataset, concatenate_datasets

from src.services.util import HF_TOKEN, DATASET_NAME


def init_huggingface():
    """Initialize Hugging Face authentication."""
    if HF_TOKEN is None:
        raise ValueError(
            "Hugging Face token not found in environment variables.")
    login(token=HF_TOKEN)


def update_dataset(json_data):
    """Update the Hugging Face dataset with new data."""
    if json_data is None or json_data.startswith("The following fields are required"):
        return json_data or "No data to submit. Please fill in all required fields."

    try:
        data = json.loads(json_data)
    except json.JSONDecodeError:
        return "Invalid JSON data. Please ensure all required fields are filled correctly."

    try:
        dataset = load_dataset(DATASET_NAME, split="train")
        print(dataset)
    except Exception:
        # The dataset does not exist yet (or could not be loaded): start from an empty one.
        dataset = Dataset.from_dict({})

    new_data = create_flattened_data(data)
    new_dataset = Dataset.from_dict(new_data)

    if len(dataset) > 0:
        print("initial dataset:")
        print(dataset)
        print("data to add:")
        print(new_dataset)
        updated_dataset = concatenate_datasets([dataset, new_dataset])
    else:
        updated_dataset = new_dataset

    updated_dataset.push_to_hub(DATASET_NAME)
    return "Data submitted successfully and dataset updated!"


def create_flattened_data(data):
    """Flatten the nested report JSON into a flat {column: [value]} mapping.

    Repeated sub-objects (algorithms, dataset entries, measures, components)
    are concatenated field by field with a "| " separator.
    """
    # Handle algorithms
    algorithms = data.get("task", {}).get("algorithms", [])
    fields = ["trainingType", "algorithmType", "algorithmName", "algorithmUri",
              "foundationModelName", "foundationModelUri", "parametersNumber",
              "framework", "frameworkVersion", "classPath", "layersNumber",
              "epochsNumber", "optimizer", "quantization"]
    algorithms_data = {field: "| ".join(str(algo.get(field))
                                        for algo in algorithms if algo.get(field)) or ""
                       for field in fields}
    trainingType_str = algorithms_data["trainingType"]
    algorithmType_str = algorithms_data["algorithmType"]
    algorithmName_str = algorithms_data["algorithmName"]
    algorithmUri_str = algorithms_data["algorithmUri"]
    foundationModelName_str = algorithms_data["foundationModelName"]
    foundationModelUri_str = algorithms_data["foundationModelUri"]
    parametersNumber_str = algorithms_data["parametersNumber"]
    framework_str = algorithms_data["framework"]
    frameworkVersion_str = algorithms_data["frameworkVersion"]
    classPath_str = algorithms_data["classPath"]
    layersNumber_str = algorithms_data["layersNumber"]
    epochsNumber_str = algorithms_data["epochsNumber"]
    optimizer_str = algorithms_data["optimizer"]
    quantization_str = algorithms_data["quantization"]
"""Create a flattened data structure for the dataset."""
# Handle dataset
dataset = data.get("task", {}).get("dataset", [])
fields = ["dataUsage", "dataType", "dataFormat", "dataSize",
"dataQuantity", "shape", "source", "sourceUri", "owner"]
"""Create a flattened data structure for the dataset."""
dataset_data = {field: "| ".join(
str(d.get(field)) for d in dataset if d.get(field)) or "" for field in fields}
dataUsage_str = dataset_data["dataUsage"]
dataType_str = dataset_data["dataType"]
dataFormat_str = dataset_data["dataFormat"]
dataSize_str = dataset_data["dataSize"]
dataQuantity_str = dataset_data["dataQuantity"]
shape_str = dataset_data["shape"]
source_str = dataset_data["source"]
sourceUri_str = dataset_data["sourceUri"]
owner_str = dataset_data["owner"]
"""Create a flattened data structure for the measures."""
# Handle measures
measures = data.get("measures", [])
fields = ["measurementMethod", "manufacturer", "version", "cpuTrackingMode", "gpuTrackingMode", "averageUtilizationCpu", "averageUtilizationGpu",
"powerCalibrationMeasurement", "durationCalibrationMeasurement", "powerConsumption", "measurementDuration", "measurementDateTime"]
"""Create a flattened data structure for the measures."""
measures_data = {field: "| ".join(str(measure.get(
field)) for measure in measures if measure.get(field)) or "" for field in fields}
measurementMethod_str = measures_data["measurementMethod"]
manufacturer_str = measures_data["manufacturer"]
version_str = measures_data["version"]
cpuTrackingMode_str = measures_data["cpuTrackingMode"]
gpuTrackingMode_str = measures_data["gpuTrackingMode"]
averageUtilizationCpu_str = measures_data["averageUtilizationCpu"]
averageUtilizationGpu_str = measures_data["averageUtilizationGpu"]
powerCalibrationMeasurement_str = measures_data["powerCalibrationMeasurement"]
durationCalibrationMeasurement_str = measures_data["durationCalibrationMeasurement"]
powerConsumption_str = measures_data["powerConsumption"]
measurementDuration_str = measures_data["measurementDuration"]
measurementDateTime_str = measures_data["measurementDateTime"]
    # Handle components
    components = data.get("infrastructure", {}).get("components", [])
    fields = ["componentName", "componentType", "nbComponent", "memorySize",
              "manufacturer", "family", "series", "share"]
    # Generate concatenated strings for each field
    component_data = {field: "| ".join(str(comp.get(field))
                                       for comp in components if comp.get(field)) or ""
                      for field in fields}
    componentName_str = component_data["componentName"]
    componentType_str = component_data["componentType"]
    nbComponent_str = component_data["nbComponent"]
    memorySize_str = component_data["memorySize"]
    manufacturer_infra_str = component_data["manufacturer"]
    family_str = component_data["family"]
    series_str = component_data["series"]
    share_str = component_data["share"]
    return {
        # Header
        "licensing": [data.get("header", {}).get("licensing", "")],
        "formatVersion": [data.get("header", {}).get("formatVersion", "")],
        "formatVersionSpecificationUri": [data.get("header", {}).get("formatVersionSpecificationUri", "")],
        "reportId": [data.get("header", {}).get("reportId", "")],
        "reportDatetime": [data.get("header", {}).get("reportDatetime", "")],
        "reportStatus": [data.get("header", {}).get("reportStatus", "")],
        "publisher_name": [data.get("header", {}).get("publisher", {}).get("name", "")],
        "publisher_division": [data.get("header", {}).get("publisher", {}).get("division", "")],
        "publisher_projectName": [data.get("header", {}).get("publisher", {}).get("projectName", "")],
        "publisher_confidentialityLevel": [data.get("header", {}).get("publisher", {}).get("confidentialityLevel", "")],
        "publisher_publicKey": [data.get("header", {}).get("publisher", {}).get("publicKey", "")],
        # Task
        "taskStage": [data.get("task", {}).get("taskStage", "")],
        "taskFamily": [data.get("task", {}).get("taskFamily", "")],
        "nbRequest": [data.get("task", {}).get("nbRequest", "")],
        # Algorithms
        "trainingType": [trainingType_str],
        "algorithmType": [algorithmType_str],
        "algorithmName": [algorithmName_str],
        "algorithmUri": [algorithmUri_str],
        "foundationModelName": [foundationModelName_str],
        "foundationModelUri": [foundationModelUri_str],
        "parametersNumber": [parametersNumber_str],
        "framework": [framework_str],
        "frameworkVersion": [frameworkVersion_str],
        "classPath": [classPath_str],
        "layersNumber": [layersNumber_str],
        "epochsNumber": [epochsNumber_str],
        "optimizer": [optimizer_str],
        "quantization": [quantization_str],
        # Dataset
        "dataUsage": [dataUsage_str],
        "dataType": [dataType_str],
        "dataFormat": [dataFormat_str],
        "dataSize": [dataSize_str],
        "dataQuantity": [dataQuantity_str],
        "shape": [shape_str],
        "source": [source_str],
        "sourceUri": [sourceUri_str],
        "owner": [owner_str],
        "measuredAccuracy": [data.get("task", {}).get("measuredAccuracy", "")],
        "estimatedAccuracy": [data.get("task", {}).get("estimatedAccuracy", "")],
        "taskDescription": [data.get("task", {}).get("taskDescription", "")],
        # Measures
        "measurementMethod": [measurementMethod_str],
        "manufacturer": [manufacturer_str],
        "version": [version_str],
        "cpuTrackingMode": [cpuTrackingMode_str],
        "gpuTrackingMode": [gpuTrackingMode_str],
        "averageUtilizationCpu": [averageUtilizationCpu_str],
        "averageUtilizationGpu": [averageUtilizationGpu_str],
        "powerCalibrationMeasurement": [powerCalibrationMeasurement_str],
        "durationCalibrationMeasurement": [durationCalibrationMeasurement_str],
        "powerConsumption": [powerConsumption_str],
        "measurementDuration": [measurementDuration_str],
        "measurementDateTime": [measurementDateTime_str],
        # System
        "os": [data.get("system", {}).get("os", "")],
        "distribution": [data.get("system", {}).get("distribution", "")],
        "distributionVersion": [data.get("system", {}).get("distributionVersion", "")],
        # Software
        "language": [data.get("software", {}).get("language", "")],
        "version_software": [data.get("software", {}).get("version_software", "")],
        # Infrastructure
        "infraType": [data.get("infrastructure", {}).get("infra_type", "")],
        "cloudProvider": [data.get("infrastructure", {}).get("cloudProvider", "")],
        "cloudInstance": [data.get("infrastructure", {}).get("cloudInstance", "")],
        "cloudService": [data.get("infrastructure", {}).get("cloudService", "")],
        "componentName": [componentName_str],
        "componentType": [componentType_str],
        "nbComponent": [nbComponent_str],
        "memorySize": [memorySize_str],
        "manufacturer_infra": [manufacturer_infra_str],
        "family": [family_str],
        "series": [series_str],
        "share": [share_str],
        # Environment
        "country": [data.get("environment", {}).get("country", "")],
        "latitude": [data.get("environment", {}).get("latitude", "")],
        "longitude": [data.get("environment", {}).get("longitude", "")],
        "location": [data.get("environment", {}).get("location", "")],
        "powerSupplierType": [data.get("environment", {}).get("powerSupplierType", "")],
        "powerSource": [data.get("environment", {}).get("powerSource", "")],
        "powerSourceCarbonIntensity": [data.get("environment", {}).get("powerSourceCarbonIntensity", "")],
        # Quality
        "quality": [data.get("quality", "")],
    }
"""
def create_flattened_data(data):
out = {}
def flatten(x, name=''):
if type(x) is dict:
for a in x:
flatten(x[a], name + a + '_')
elif type(x) is list:
i = 0
for a in x:
flatten(a, name + str(i) + '_')
i += 1
else:
out[name[:-1]] = x
flatten(data)
return out
"""