from huggingface_hub import login
from datasets import load_dataset, Dataset, concatenate_datasets
import json

from src.services.util import HF_TOKEN, DATASET_NAME


def init_huggingface():
    """Initialize Hugging Face authentication."""
    if HF_TOKEN is None:
        raise ValueError(
            "Hugging Face token not found in environment variables.")
    login(token=HF_TOKEN)


def update_dataset(json_data):
    """Update the Hugging Face dataset with new data."""
    if json_data is None or json_data.startswith("The following fields are required"):
        return json_data or "No data to submit. Please fill in all required fields."

    try:
        data = json.loads(json_data)
    except json.JSONDecodeError:
        return "Invalid JSON data. Please ensure all required fields are filled correctly."

    # Load the existing dataset; fall back to an empty one if it does not exist yet.
    try:
        dataset = load_dataset(DATASET_NAME, split="train")
        print(dataset)
    except Exception:
        dataset = Dataset.from_dict({})

    new_data = create_flattened_data(data)
    new_dataset = Dataset.from_dict(new_data)

    if len(dataset) > 0:
        print("initial dataset")
        print(dataset)
        print("data to add")
        print(new_dataset)
        updated_dataset = concatenate_datasets([dataset, new_dataset])
    else:
        updated_dataset = new_dataset

    updated_dataset.push_to_hub(DATASET_NAME)
    return "Data submitted successfully and dataset updated!"


def create_flattened_data(data):
    """Create a flattened, single-row data structure from the nested report."""
    # Handle algorithms: concatenate each field across all algorithm entries,
    # separated by "| ".
    algorithms = data.get("task", {}).get("algorithms", [])
    fields = ["trainingType", "algorithmType", "algorithmName", "algorithmUri",
              "foundationModelName", "foundationModelUri", "parametersNumber",
              "framework", "frameworkVersion", "classPath", "layersNumber",
              "epochsNumber", "optimizer", "quantization"]
    algorithms_data = {field: "| ".join(str(algo.get(field))
                                        for algo in algorithms
                                        if algo.get(field)) or ""
                       for field in fields}
    trainingType_str = algorithms_data["trainingType"]
    algorithmType_str = algorithms_data["algorithmType"]
    algorithmName_str = algorithms_data["algorithmName"]
    algorithmUri_str = algorithms_data["algorithmUri"]
    foundationModelName_str = algorithms_data["foundationModelName"]
    foundationModelUri_str = algorithms_data["foundationModelUri"]
    parametersNumber_str = algorithms_data["parametersNumber"]
    framework_str = algorithms_data["framework"]
    frameworkVersion_str = algorithms_data["frameworkVersion"]
    classPath_str = algorithms_data["classPath"]
    layersNumber_str = algorithms_data["layersNumber"]
    epochsNumber_str = algorithms_data["epochsNumber"]
    optimizer_str = algorithms_data["optimizer"]
    quantization_str = algorithms_data["quantization"]

    # Handle dataset entries the same way.
    dataset = data.get("task", {}).get("dataset", [])
    fields = ["dataUsage", "dataType", "dataFormat", "dataSize", "dataQuantity",
              "shape", "source", "sourceUri", "owner"]
    dataset_data = {field: "| ".join(str(d.get(field))
                                     for d in dataset
                                     if d.get(field)) or ""
                    for field in fields}
    dataUsage_str = dataset_data["dataUsage"]
    dataType_str = dataset_data["dataType"]
    dataFormat_str = dataset_data["dataFormat"]
    dataSize_str = dataset_data["dataSize"]
    dataQuantity_str = dataset_data["dataQuantity"]
    shape_str = dataset_data["shape"]
    source_str = dataset_data["source"]
    sourceUri_str = dataset_data["sourceUri"]
    owner_str = dataset_data["owner"]

    # Handle measures.
    measures = data.get("measures", [])
"gpuTrackingMode", "averageUtilizationCpu", "averageUtilizationGpu", "powerCalibrationMeasurement", "durationCalibrationMeasurement", "powerConsumption", "measurementDuration", "measurementDateTime"] """Create a flattened data structure for the measures.""" measures_data = {field: "| ".join(str(measure.get( field)) for measure in measures if measure.get(field)) or "" for field in fields} measurementMethod_str = measures_data["measurementMethod"] manufacturer_str = measures_data["manufacturer"] version_str = measures_data["version"] cpuTrackingMode_str = measures_data["cpuTrackingMode"] gpuTrackingMode_str = measures_data["gpuTrackingMode"] averageUtilizationCpu_str = measures_data["averageUtilizationCpu"] averageUtilizationGpu_str = measures_data["averageUtilizationGpu"] powerCalibrationMeasurement_str = measures_data["powerCalibrationMeasurement"] durationCalibrationMeasurement_str = measures_data["durationCalibrationMeasurement"] powerConsumption_str = measures_data["powerConsumption"] measurementDuration_str = measures_data["measurementDuration"] measurementDateTime_str = measures_data["measurementDateTime"] # Handle components components = data.get("infrastructure", {}).get("components", []) fields = ["componentName", "componentType", "nbComponent", "memorySize", "manufacturer", "family", "series", "share"] # Generate concatenated strings for each field component_data = {field: "| ".join(str(comp.get( field)) for comp in components if comp.get(field)) or "" for field in fields} componentName_str = component_data["componentName"] componentType_str = component_data["componentType"] nbComponent_str = component_data["nbComponent"] memorySize_str = component_data["memorySize"] manufacturer_infra_str = component_data["manufacturer"] family_str = component_data["family"] series_str = component_data["series"] share_str = component_data["share"] return { # Header "licensing": [data.get("header", {}).get("licensing", "")], "formatVersion": [data.get("header", {}).get("formatVersion", "")], "formatVersionSpecificationUri": [data.get("header", {}).get("formatVersionSpecificationUri", "")], "reportId": [data.get("header", {}).get("reportId", "")], "reportDatetime": [data.get("header", {}).get("reportDatetime", "")], "reportStatus": [data.get("header", {}).get("reportStatus", "")], "publisher_name": [data.get("header", {}).get("publisher", {}).get("name", "")], "publisher_division": [data.get("header", {}).get("publisher", {}).get("division", "")], "publisher_projectName": [data.get("header", {}).get("publisher", {}).get("projectName", "")], "publisher_confidentialityLevel": [data.get("header", {}).get("publisher", {}).get("confidentialityLevel", "")], "publisher_publicKey": [data.get("header", {}).get("publisher", {}).get("publicKey", "")], # Task "taskStage": [data.get("task", {}).get("taskStage", "")], "taskFamily": [data.get("task", {}).get("taskFamily", "")], "nbRequest": [data.get("task", {}).get("nbRequest", "")], # Algorithms "trainingType": [trainingType_str], "algorithmType": [algorithmType_str], "algorithmName": [algorithmName_str], "algorithmUri": [algorithmUri_str], "foundationModelName": [foundationModelName_str], "foundationModelUri": [foundationModelUri_str], "parametersNumber": [parametersNumber_str], "framework": [framework_str], "frameworkVersion": [frameworkVersion_str], "classPath": [classPath_str], "layersNumber": [layersNumber_str], "epochsNumber": [epochsNumber_str], "optimizer": [optimizer_str], "quantization": [quantization_str], # Dataset "dataUsage": [dataUsage_str], 
"dataType": [dataType_str], "dataFormat": [dataFormat_str], "dataSize": [dataSize_str], "dataQuantity": [dataQuantity_str], "shape": [shape_str], "source": [source_str], "sourceUri": [sourceUri_str], "owner": [owner_str], "measuredAccuracy": [data.get("task", {}).get("measuredAccuracy", "")], "estimatedAccuracy": [data.get("task", {}).get("estimatedAccuracy", "")], "taskDescription": [data.get("task", {}).get("taskDescription", "")], # Measures "measurementMethod": [measurementMethod_str], "manufacturer": [manufacturer_str], "version": [version_str], "cpuTrackingMode": [cpuTrackingMode_str], "gpuTrackingMode": [gpuTrackingMode_str], "averageUtilizationCpu": [averageUtilizationCpu_str], "averageUtilizationGpu": [averageUtilizationGpu_str], "powerCalibrationMeasurement": [powerCalibrationMeasurement_str], "durationCalibrationMeasurement": [durationCalibrationMeasurement_str], "powerConsumption": [powerConsumption_str], "measurementDuration": [measurementDuration_str], "measurementDateTime": [measurementDateTime_str], # System "os": [data.get("system", {}).get("os", "")], "distribution": [data.get("system", {}).get("distribution", "")], "distributionVersion": [data.get("system", {}).get("distributionVersion", "")], # Software "language": [data.get("software", {}).get("language", "")], "version_software": [data.get("software", {}).get("version_software", "")], # Infrastructure "infraType": [data.get("infrastructure", {}).get("infra_type", "")], "cloudProvider": [data.get("infrastructure", {}).get("cloudProvider", "")], "cloudInstance": [data.get("infrastructure", {}).get("cloudInstance", "")], "cloudService": [data.get("infrastructure", {}).get("cloudService", "")], "componentName": [componentName_str], "componentType": [componentType_str], "nbComponent": [nbComponent_str], "memorySize": [memorySize_str], "manufacturer_infra": [manufacturer_infra_str], "family": [family_str], "series": [series_str], "share": [share_str], # Environment "country": [data.get("environment", {}).get("country", "")], "latitude": [data.get("environment", {}).get("latitude", "")], "longitude": [data.get("environment", {}).get("longitude", "")], "location": [data.get("environment", {}).get("location", "")], "powerSupplierType": [data.get("environment", {}).get("powerSupplierType", "")], "powerSource": [data.get("environment", {}).get("powerSource", "")], "powerSourceCarbonIntensity": [data.get("environment", {}).get("powerSourceCarbonIntensity", "")], # Quality "quality": [data.get("quality", "")], } """ def create_flattened_data(data): out = {} def flatten(x, name=''): if type(x) is dict: for a in x: flatten(x[a], name + a + '_') elif type(x) is list: i = 0 for a in x: flatten(a, name + str(i) + '_') i += 1 else: out[name[:-1]] = x flatten(data) return out """