import json
import os
import pickle
from pathlib import Path
from tempfile import mkdtemp, mkstemp
from uuid import uuid4

import numpy as np
import xgboost
from xgboost import XGBClassifier
import sklearn
from huggingface_hub import HfApi
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV, train_test_split
import shutil

from skops import card, hub_utils

from data_loader import get_data_splits_from_clean_data

# Paths to the pre-trained booster file and its accompanying label file.
model_path = "xgb_model_bayse_optimization_00000.bin"
label_path = "labels.csv"

# Load a reference dataset only to print summary statistics; the sample
# data actually recorded in the hub repo comes from
# get_data_splits_from_clean_data below.
# (A train_test_split and a HalvingGridSearchCV param_grid previously
# defined here were dead code — their results were never used — and have
# been removed.)
X, y = load_breast_cancer(as_frame=True, return_X_y=True)
print("X's summary: ", X.describe())
print("y's summary: ", y.describe())

# Load the project's own data split; only the feature matrix (index 0)
# is needed, as the example data stored in the hub repo config.
train_tuple = get_data_splits_from_clean_data(
    "./cifs_test_s_trained_model", label_path, simple_load=True, n_data=-1
)
print(train_tuple)
X_test = train_tuple[0]

# Load the pre-trained low-level Booster and wrap it in a
# scikit-learn-style XGBClassifier so skops can build a model card.
booster = xgboost.Booster({"nthread": 8})
booster.load_model(model_path)
model = XGBClassifier()
# NOTE(review): _Booster is a private xgboost attribute; this bypasses
# the sklearn fit API on purpose to attach an externally trained model.
model._Booster = booster

# Stage all files in a temporary local repo directory for skops.
local_repo = mkdtemp(prefix="skops-")
hub_utils.init(
    model=model_path,  # skops accepts a path to the serialized model file
    requirements=[f"xgboost={xgboost.__version__}"],
    dst=local_repo,
    task="tabular-classification",
    data=X_test,
)
shutil.copy(label_path, os.path.join(local_repo, label_path))

if "__file__" in locals():  # __file__ not defined during docs build
    # Add this script itself to the files to be uploaded for reproducibility
    hub_utils.add_files(__file__, dst=local_repo)

print(os.listdir(local_repo))
print(type(model))
print(card.metadata_from_config(Path(local_repo)))
print(type(card.metadata_from_config(Path(local_repo))))

model_card = card.Card(model, metadata=card.metadata_from_config(Path(local_repo)))
# Write the model card into the staged repo, then push everything to the
# Hugging Face Hub as a private repo under the authenticated user.
model_card.save(Path(local_repo) / "README.md")

# You can put your own token here, or set it as an environment variable
# before running this script. Raises KeyError if HF_HUB_TOKEN is unset.
token = os.environ["HF_HUB_TOKEN"]

repo_name = "MLstructureMining"  # was f"..." with no placeholders (F541)
user_name = HfApi().whoami(token=token)["name"]
repo_id = f"{user_name}/{repo_name}"
print(f"Creating and pushing to repo: {repo_id}")

hub_utils.push(
    repo_id=repo_id,
    source=local_repo,
    token=token,
    commit_message="pushing files to the repo from the example!",
    create_remote=True,  # create the remote repo if it does not exist yet
    private=True,
)