|
from huggingface_hub import login |
|
from datasets import load_dataset, Dataset, concatenate_datasets |
|
import json |
|
from src.services.util import HF_TOKEN, DATASET_NAME |
|
|
|
|
|
def init_huggingface():
    """Authenticate this process against the Hugging Face Hub.

    Raises:
        ValueError: if ``HF_TOKEN`` was not found in the environment.
    """
    token = HF_TOKEN
    if token is None:
        raise ValueError(
            "Hugging Face token not found in environment variables.")
    login(token=token)
|
|
|
|
|
def update_dataset(json_data):
    """Validate submitted JSON, append it to the Hub dataset and push it.

    Args:
        json_data: JSON document produced by the submission form, or an
            upstream validation message starting with "The following fields
            are required", or ``None`` when nothing was submitted.

    Returns:
        str: a human-readable status message for the UI.
    """
    # Pass upstream validation messages through unchanged; guard against an
    # empty submission.
    if json_data is None or json_data.startswith("The following fields are required"):
        return json_data or "No data to submit. Please fill in all required fields."

    try:
        data = json.loads(json_data)
    except json.JSONDecodeError:
        return "Invalid JSON data. Please ensure all required fields are filled correctly."

    try:
        dataset = load_dataset(DATASET_NAME, split="train")
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # propagate. Typically reached on first submission, when the remote
        # dataset does not exist yet.
        dataset = Dataset.from_dict({})

    new_dataset = Dataset.from_dict(create_flattened_data(data))

    if len(dataset) > 0:
        updated_dataset = concatenate_datasets([dataset, new_dataset])
    else:
        updated_dataset = new_dataset

    updated_dataset.push_to_hub(DATASET_NAME)
    return "Data submitted successfully and dataset updated!"
|
|
|
|
|
def _join_section(items, fields):
    """Collapse a list of dicts into one dict of '| '-joined strings.

    For each field, every non-empty (truthy) value found across *items* is
    stringified and joined with "| "; a field absent or falsy in all items
    maps to "".
    """
    return {
        field: "| ".join(
            str(item.get(field)) for item in items if item.get(field)
        ) or ""
        for field in fields
    }


def create_flattened_data(data):
    """Flatten a nested report dict into the flat single-row dataset schema.

    Every value is wrapped in a one-element list so the result can be fed
    directly to ``Dataset.from_dict``. List-valued sections (algorithms,
    dataset entries, measures, infrastructure components) are collapsed
    field-by-field into '| '-separated strings via :func:`_join_section`.

    Args:
        data: the parsed report (nested dict, possibly missing sections).

    Returns:
        dict[str, list]: column name -> single-element list of values.
    """
    header = data.get("header", {})
    publisher = header.get("publisher", {})
    task = data.get("task", {})
    system = data.get("system", {})
    software = data.get("software", {})
    infrastructure = data.get("infrastructure", {})
    environment = data.get("environment", {})

    algorithms = _join_section(task.get("algorithms", []), [
        "trainingType", "algorithmType", "algorithmName", "algorithmUri",
        "foundationModelName", "foundationModelUri", "parametersNumber",
        "framework", "frameworkVersion", "classPath", "layersNumber",
        "epochsNumber", "optimizer", "quantization",
    ])
    dataset = _join_section(task.get("dataset", []), [
        "dataUsage", "dataType", "dataFormat", "dataSize", "dataQuantity",
        "shape", "source", "sourceUri", "owner",
    ])
    measures = _join_section(data.get("measures", []), [
        "measurementMethod", "manufacturer", "version", "cpuTrackingMode",
        "gpuTrackingMode", "averageUtilizationCpu", "averageUtilizationGpu",
        "powerCalibrationMeasurement", "durationCalibrationMeasurement",
        "powerConsumption", "measurementDuration", "measurementDateTime",
    ])
    components = _join_section(infrastructure.get("components", []), [
        "componentName", "componentType", "nbComponent", "memorySize",
        "manufacturer", "family", "series", "share",
    ])

    return {
        # --- header ---
        "licensing": [header.get("licensing", "")],
        "formatVersion": [header.get("formatVersion", "")],
        "formatVersionSpecificationUri": [header.get("formatVersionSpecificationUri", "")],
        "reportId": [header.get("reportId", "")],
        "reportDatetime": [header.get("reportDatetime", "")],
        "reportStatus": [header.get("reportStatus", "")],
        "publisher_name": [publisher.get("name", "")],
        "publisher_division": [publisher.get("division", "")],
        "publisher_projectName": [publisher.get("projectName", "")],
        "publisher_confidentialityLevel": [publisher.get("confidentialityLevel", "")],
        "publisher_publicKey": [publisher.get("publicKey", "")],

        # --- task ---
        "taskStage": [task.get("taskStage", "")],
        "taskFamily": [task.get("taskFamily", "")],
        "nbRequest": [task.get("nbRequest", "")],

        # --- task.algorithms (joined) ---
        "trainingType": [algorithms["trainingType"]],
        "algorithmType": [algorithms["algorithmType"]],
        "algorithmName": [algorithms["algorithmName"]],
        "algorithmUri": [algorithms["algorithmUri"]],
        "foundationModelName": [algorithms["foundationModelName"]],
        "foundationModelUri": [algorithms["foundationModelUri"]],
        "parametersNumber": [algorithms["parametersNumber"]],
        "framework": [algorithms["framework"]],
        "frameworkVersion": [algorithms["frameworkVersion"]],
        "classPath": [algorithms["classPath"]],
        "layersNumber": [algorithms["layersNumber"]],
        "epochsNumber": [algorithms["epochsNumber"]],
        "optimizer": [algorithms["optimizer"]],
        "quantization": [algorithms["quantization"]],

        # --- task.dataset (joined) ---
        "dataUsage": [dataset["dataUsage"]],
        "dataType": [dataset["dataType"]],
        "dataFormat": [dataset["dataFormat"]],
        "dataSize": [dataset["dataSize"]],
        "dataQuantity": [dataset["dataQuantity"]],
        "shape": [dataset["shape"]],
        "source": [dataset["source"]],
        "sourceUri": [dataset["sourceUri"]],
        "owner": [dataset["owner"]],
        "measuredAccuracy": [task.get("measuredAccuracy", "")],
        "estimatedAccuracy": [task.get("estimatedAccuracy", "")],
        "taskDescription": [task.get("taskDescription", "")],

        # --- measures (joined) ---
        "measurementMethod": [measures["measurementMethod"]],
        "manufacturer": [measures["manufacturer"]],
        "version": [measures["version"]],
        "cpuTrackingMode": [measures["cpuTrackingMode"]],
        "gpuTrackingMode": [measures["gpuTrackingMode"]],
        "averageUtilizationCpu": [measures["averageUtilizationCpu"]],
        "averageUtilizationGpu": [measures["averageUtilizationGpu"]],
        "powerCalibrationMeasurement": [measures["powerCalibrationMeasurement"]],
        "durationCalibrationMeasurement": [measures["durationCalibrationMeasurement"]],
        "powerConsumption": [measures["powerConsumption"]],
        "measurementDuration": [measures["measurementDuration"]],
        "measurementDateTime": [measures["measurementDateTime"]],

        # --- system / software ---
        "os": [system.get("os", "")],
        "distribution": [system.get("distribution", "")],
        "distributionVersion": [system.get("distributionVersion", "")],
        "language": [software.get("language", "")],
        "version_software": [software.get("version_software", "")],

        # --- infrastructure ---
        # NOTE: source key is "infra_type" while the column is "infraType".
        "infraType": [infrastructure.get("infra_type", "")],
        "cloudProvider": [infrastructure.get("cloudProvider", "")],
        "cloudInstance": [infrastructure.get("cloudInstance", "")],
        "cloudService": [infrastructure.get("cloudService", "")],
        "componentName": [components["componentName"]],
        "componentType": [components["componentType"]],
        "nbComponent": [components["nbComponent"]],
        "memorySize": [components["memorySize"]],
        # "manufacturer" column is taken by measures, hence the suffix here.
        "manufacturer_infra": [components["manufacturer"]],
        "family": [components["family"]],
        "series": [components["series"]],
        "share": [components["share"]],

        # --- environment ---
        "country": [environment.get("country", "")],
        "latitude": [environment.get("latitude", "")],
        "longitude": [environment.get("longitude", "")],
        "location": [environment.get("location", "")],
        "powerSupplierType": [environment.get("powerSupplierType", "")],
        "powerSource": [environment.get("powerSource", "")],
        "powerSourceCarbonIntensity": [environment.get("powerSourceCarbonIntensity", "")],

        # --- quality ---
        "quality": [data.get("quality", "")],
    }
|
|
|
|
|
""" |
|
def create_flattened_data(data): |
|
out = {} |
|
|
|
def flatten(x, name=''): |
|
if type(x) is dict: |
|
for a in x: |
|
flatten(x[a], name + a + '_') |
|
elif type(x) is list: |
|
i = 0 |
|
for a in x: |
|
flatten(a, name + str(i) + '_') |
|
i += 1 |
|
else: |
|
out[name[:-1]] = x |
|
|
|
flatten(data) |
|
return out |
|
""" |
|
|