nbroad committed
Commit aca33e8 · 1 Parent(s): ad93c5f

Upload 4 files

Browse files
Files changed (4) hide show
  1. README.md +0 -13
  2. app.py +197 -0
  3. requirements.txt +8 -0
  4. utils.py +452 -0
README.md CHANGED
@@ -1,13 +0,0 @@
- ---
- title: Bulk Embeddings
- emoji: 🐠
- colorFrom: purple
- colorTo: indigo
- sdk: gradio
- sdk_version: 3.36.1
- app_file: app.py
- pinned: false
- license: apache-2.0
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,197 @@
+ import gradio as gr
+
+ from utils import load_hf_dataset, get_model_and_tokenizer, batch_embed
+
+
+ # TODO: add instructor models
+ # "hkunlp/instructor-xl",
+ # "hkunlp/instructor-large",
+ # "hkunlp/instructor-base",
+
+ # model ids and hidden sizes
+ models_and_hidden_sizes = [
+     ("intfloat/e5-small-v2", 384),
+     ("intfloat/e5-base-v2", 768),
+     ("intfloat/e5-large-v2", 1024),
+     ("intfloat/multilingual-e5-small", 384),
+     ("intfloat/multilingual-e5-base", 768),
+     ("intfloat/multilingual-e5-large", 1024),
+     ("sentence-transformers/all-MiniLM-L6-v2", 384),
+     ("sentence-transformers/all-MiniLM-L12-v2", 384),
+     ("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", 384),
+ ]
+
+ model_options = [
+     f"{model_name} (hidden_size = {hidden_size})"
+     for model_name, hidden_size in models_and_hidden_sizes
+ ]
+
+
+ opt2desc = {
+     "O2": "Most precise, slowest (O2: basic and extended general optimizations, transformers-specific fusions)",
+     "O3": "Less precise, faster (O3: O2 + gelu approx)",
+     "O4": "Least precise, fastest (O4: O3 + fp16/bf16)",
+ }
+
+ desc2opt = {v: k for k, v in opt2desc.items()}
+
+
+ optimization_options = list(opt2desc.values())
+
+
+ def run(
+     ds_name,
+     ds_config,
+     column_name,
+     ds_split,
+     model_choice,
+     opt_desc,
+     new_dataset_id,
+     num2skip,
+     num2embed,
+     progress=gr.Progress(),
+ ):
+     if progress is not None:
+         progress(0.5, "Loading dataset...")
+     ds = load_hf_dataset(ds_name, ds_config, ds_split)
+
+     opt_level = desc2opt[opt_desc]
+
+     model_name = model_choice.split()[0]
+
+     if progress is not None:
+         progress(0.2, "Downloading model and tokenizer...")
+     model, tokenizer = get_model_and_tokenizer(model_name, opt_level, progress)
+
+     doc_count, seconds_taken = batch_embed(
+         ds,
+         model,
+         tokenizer,
+         model_name=model_name,
+         column_name=column_name,
+         new_dataset_id=new_dataset_id,
+         opt_level=opt_level,
+         num2skip=num2skip,
+         num2embed=num2embed,
+         progress=progress,
+     )
+
+     return f"Embedded {doc_count} docs in {seconds_taken/60:.2f} minutes ({doc_count/seconds_taken:.1f} docs/sec)"
+
+
+ with gr.Blocks(title="Bulk embeddings") as demo:
+     gr.Markdown(
+         """
+ This Space allows you to embed a large dataset easily. For instance, this can easily create vectors for Wikipedia \
+ articles -- taking about __ hours and costing approximately $__.
+
+
+ This utilizes state-of-the-art open-source embedding models, \
+ and optimizes them for inference using Hugging Face [optimum](https://github.com/huggingface/optimum). There are various \
+ levels of optimization that can be applied - the quality of the embeddings will degrade as the optimizations increase.
+
+ Currently available options: O2/O3/O4 on T4/A10 GPUs using ONNX Runtime.
+
+ Future options:
+ - OpenVINO for CPU inference
+ - TensorRT for GPU inference
+ - Quantized models
+ - Instructor models
+ - Text splitting options
+ - More control over which rows to embed (skip some, stop early)
+ - Dynamic padding
+
+ ## Steps
+
+ 1. Upload the dataset to the Hugging Face Hub.
+ 2. Enter dataset details into the form below.
+ 3. Choose a model. These are taken from the top of the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard).
+ 4. Enter optimization level. See [here](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization#optimization-configuration) for details.
+ 5. Choose a name for the new dataset.
+ 6. Hit run!
+
+
+ ### Note:
+
+ If you have short documents, O3 will be faster than O4. If you have long documents, O4 will be faster than O3. \
+ O4 requires the tokenized documents to be padded to max length.
+
+     """
+     )
+
+     with gr.Row():
+         ds_name = gr.Textbox(
+             lines=1,
+             label="Dataset to load from Hugging Face Hub",
+             value="nbroad/basic_text_dataset",
+         )
+         ds_config = gr.Textbox(
+             lines=1, label="Dataset config (leave blank to use default)", value=""
+         )
+
+         column_name = gr.Textbox(lines=1, label="Enter column to embed", value="text")
+         ds_split = gr.Dropdown(
+             choices=["train", "validation", "test"],
+             label="Dataset split",
+             value="train",
+         )
+         # TODO: idx column
+         # TODO: text splitting options
+
+     with gr.Row():
+         model_choice = gr.Dropdown(
+             choices=model_options, label="Embedding model", value=model_options[0]
+         )
+         opt_desc = gr.Dropdown(
+             choices=optimization_options,
+             label="Optimization level",
+             value=optimization_options[0],
+         )
+
+     with gr.Row():
+         new_dataset_id = gr.Textbox(
+             lines=1,
+             label="New dataset name, including username",
+             value="nbroad/test-embeds",
+         )
+
+         num2skip = gr.Slider(
+             value=0,
+             minimum=0,
+             maximum=10_000_000,
+             step=1,
+             label="Number of rows to skip",
+         )
+
+         num2embed = gr.Slider(
+             value=-1,
+             minimum=-1,
+             maximum=10_000_000,
+             step=1,
+             label="Number of rows to embed (-1 = all)",
+         )
+
+     with gr.Row():
+         btn = gr.Button(value="Embed texts!")
+
+     last = gr.Textbox(value="")
+
+     btn.click(
+         fn=run,
+         inputs=[
+             ds_name,
+             ds_config,
+             column_name,
+             ds_split,
+             model_choice,
+             opt_desc,
+             new_dataset_id,
+             num2skip,
+             num2embed,
+         ],
+         outputs=last,
+     )
+
+
+ if __name__ == "__main__":
+     demo.queue(concurrency_count=20).launch(show_error=True, debug=True, share=True)
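The O3-vs-O4 note in the app text comes down to padding: `utils.tokenize` (added below) pads to the longest sequence in the batch for O2/O3 (`padding=True`) but to the full 512 tokens for O4 (`padding="max_length"`). A minimal sketch of that difference, using the app's default model and two made-up documents:

```python
from transformers import AutoTokenizer

# Hypothetical example: two short documents tokenized the way utils.tokenize does.
texts = ["a short document", "another slightly longer example document"]
tok = AutoTokenizer.from_pretrained("intfloat/e5-small-v2")

# O2/O3 path: pad only to the longest sequence in the batch.
dynamic = tok(texts, truncation=True, padding=True, max_length=512)

# O4 path: pad every sequence out to max_length, as the app does for O4.
fixed = tok(texts, truncation=True, padding="max_length", max_length=512)

print(len(dynamic["input_ids"][0]))  # a handful of tokens
print(len(fixed["input_ids"][0]))    # always 512
```

For short documents, the extra padding in the O4 path is wasted compute, which is why the note suggests O3 in that case.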
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ datasets==2.13.1
+ tokenizers>=0.11.1,!=0.11.3,<0.14
+ optimum[onnxruntime-gpu]==1.8.8
+ transformers==4.30.1
+ accelerate==0.20.3
+ gradio==3.35.2
+ --extra-index-url https://download.pytorch.org/whl/cu118
+ torch==2.0.1
utils.py ADDED
@@ -0,0 +1,452 @@
+ import os
+ import time
+ import shutil
+ from pathlib import Path
+ from typing import Union, Dict, List
+
+ import torch
+ import datasets
+ from datasets import load_dataset, Dataset
+ from transformers import AutoTokenizer, PreTrainedTokenizer
+ from huggingface_hub import Repository, create_repo, HfApi
+ from optimum.onnxruntime import (
+     AutoOptimizationConfig,
+     ORTModelForFeatureExtraction,
+     ORTOptimizer,
+ )
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+
+ opt_configs = {
+     "O2": AutoOptimizationConfig.O2(),
+     "O3": AutoOptimizationConfig.O3(),
+     "O4": AutoOptimizationConfig.O4(),
+ }
+
+
+ def get_batch_size(device_name: str, model_name: str, opt_level: str):
+     """
+     TODO: run actual tests
+
+     T4 has 16GB
+     A10 has 24GB
+
+     Args:
+         device_name (`str`):
+             The name of the GPU device in use.
+         model_name (`str`):
+             The name of the model in use.
+         opt_level (`str`):
+             The optimization level in use.
+
+     Returns:
+         `int`:
+             The batch size to use.
+     """
+
+     if "small" in model_name:
+         bs = 192
+     elif "base" in model_name:
+         bs = 128
+     elif "large" in model_name:
+         bs = 64
+     else:
+         bs = 32
+
+     if "A10" in device_name:
+         bs *= 2
+
+     if opt_level == "O4":
+         bs *= 2
+
+     return bs
+
+
+ def mean_pooling(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor):
+     """
+     Mean pool the token embeddings.
+
+     Args:
+         last_hidden_state (`tuple`):
+             The output of the model.
+         attention_mask (`torch.Tensor`):
+             The attention mask.
+
+     Returns:
+         `torch.Tensor`:
+             The mean pooled embeddings.
+     """
+     input_mask_expanded = (
+         attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
+     )
+     return torch.sum(last_hidden_state * input_mask_expanded, 1) / torch.clamp(
+         input_mask_expanded.sum(1), min=1e-9
+     )
+
+
+ def load_hf_dataset(ds_name: str, ds_config: str = None, ds_split: str = "train"):
+     """
+     Load a dataset from the HuggingFace Hub. Will be streaming so
+     as to not load the whole dataset to local storage.
+
+     Args:
+         ds_name (`str`):
+             The name of the dataset to load.
+         ds_config (`str`, *optional*, Defaults to `None`):
+             The configuration of the dataset to load.
+         ds_split (`str`, *optional*, Defaults to `"train"`):
+             The split of the dataset to load.
+
+     Returns:
+         ds (`datasets.IterableDataset`):
+             The loaded dataset.
+     """
+
+     if ds_config == "":
+         ds_config = None
+
+     ds = load_dataset(ds_name, ds_config, split=ds_split, streaming=True)
+
+     return ds
+
+
+ def get_model_and_tokenizer(model_name: str, optimization_level: str, progress):
+     """
+     Load the model and tokenizer from the HuggingFace Hub.
+
+     If the model is not already optimized, optimize it and save it to the local directory.
+
+     Args:
+         model_name (`str`):
+             The name of the model to load.
+         optimization_level (`str`):
+             The optimization level to use. Should be one of `"O2"`, `"O3"`, or `"O4"`.
+
+     Returns:
+         model (`ORTModelForFeatureExtraction`):
+             The optimized model.
+         tokenizer (`PreTrainedTokenizer`):
+             The tokenizer.
+     """
+     optimized_model_name = f"model_optimized_{optimization_level}.onnx"
+
+     model_dir = Path(model_name.replace("/", "_"))
+     if not (model_dir / optimized_model_name).exists():
+         if progress is not None:
+             progress(0.2, "Downloading tokenizer...")
+
+         tokenizer = AutoTokenizer.from_pretrained(model_name)
+         tokenizer.save_pretrained(model_dir)
+
+         if progress is not None:
+             progress(0.4, "Downloading model...")
+
+         model = ORTModelForFeatureExtraction.from_pretrained(model_name, export=True)
+         model.save_pretrained(model_dir)
+
+         optimizer = ORTOptimizer.from_pretrained(model)
+         optimization_config = opt_configs[optimization_level]
+
+         if progress is not None:
+             progress(0.6, "Optimizing model...")
+
+         optimizer.optimize(save_dir=model_dir, optimization_config=optimization_config)
+         Path(model_dir / "model_optimized.onnx").rename(
+             model_dir / optimized_model_name
+         )
+
+     else:
+         tokenizer = AutoTokenizer.from_pretrained(model_dir)
+
+     if progress is not None:
+         progress(0.8, "Loading optimized model and tokenizer...")
+
+     return (
+         ORTModelForFeatureExtraction.from_pretrained(
+             model_dir,
+             file_name=optimized_model_name,
+             provider="CUDAExecutionProvider",
+         ),
+         tokenizer,
+     )
+
+
+ def tokenize(
+     examples: Dict[str, List[str]],
+     tokenizer: PreTrainedTokenizer,
+     column_name: str = "text",
+     padding: Union[bool, str] = True,
+     max_length: int = 512,
+ ):
+     """
+     Tokenize the examples using the tokenizer.
+
+     Args:
+         examples (`Dict[str, List[str]]`):
+             examples to tokenize
+         tokenizer (`PreTrainedTokenizer`):
+             tokenizer to use
+         column_name (`str`, *optional*, defaults to `text`):
+             column name to use for tokenization. Defaults to `text`
+         padding (`bool`, *optional*, defaults to `True`):
+             whether to pad the examples. Defaults to `True`
+             Use `"max_length"` if using `O4` optimization level
+             If `True`, the batch will be padded to the longest in the batch.
+         max_length (`int`, *optional*, Defaults to `512`):
+             max length to use for the model. Defaults to `512`.
+             Any sequences longer will be truncated.
+             If padding is `"max_length"`, the padding will be added until the sequence
+             is of length `max_length`.
+
+     Returns:
+         `Dict[str, List[List[int]]]`:
+             tokenized examples
+     """
+     # TODO: add lengths, sort by length, use dynamic padding
+     # TODO: option for controlling length for models that can go shorter/longer than 512
+     return tokenizer(
+         examples[column_name], truncation=True, padding=padding, max_length=max_length
+     )
+
+
+ @torch.inference_mode()
+ def batch_embed(
+     ds: datasets.IterableDataset,
+     model: ORTModelForFeatureExtraction,
+     tokenizer: PreTrainedTokenizer,
+     model_name: str,
+     column_name: str,
+     new_dataset_id: str,
+     opt_level: str,
+     upload_batch_size: int = 10_000,
+     map_batch_size: int = 2000,
+     num2skip: int = 0,
+     num2embed: int = -1,
+     progress=None,
+ ):
+     """
+     Run the model on the dataset and upload the embeddings to the hub.
+
+     Args:
+         ds (`datasets.Dataset`):
+             dataset to embed. From `load_hf_dataset`
+         model (`ORTModelForFeatureExtraction`):
+             model to use for embedding. From `get_model_and_tokenizer`
+         tokenizer (`AutoTokenizer`):
+             tokenizer to use for embedding. From `get_model_and_tokenizer`
+         model_name (`str`):
+             name of the model to use. Used to determine batch size.
+         column_name (`str`):
+             column name to use for embedding. Default option in gradio app is `text`
+         new_dataset_id (`str`):
+             id of the new dataset to create. Should include username or organization.
+             e.g. nbroad/new-embeddings
+         opt_level (`str`):
+             optimization level to use. Should be one of `O2`, `O3`, `O4`
+             See here for more details on optimization levels:
+             https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization#optimization-configuration
+         upload_batch_size (`int`, *optional*, defaults to `10_000`):
+             number of embeddings to upload at once. Defaults to 10,000.
+         map_batch_size (`int`, *optional*, defaults to `2000`):
+             number of examples to tokenize at once. Defaults to 2000.
+         num2skip (`int`, *optional*, defaults to `0`):
+             number of examples to skip. Defaults to 0.
+         num2embed (`int`, *optional*, defaults to `-1`):
+             number of examples to embed. Defaults to -1, which means all examples.
+
+     Returns:
+         current_count (`int`):
+             number of examples embedded so far
+         time_taken (`float`):
+             time taken to embed the examples in seconds
+
+     """
+
+     api = HfApi(
+         token=os.environ["HF_TOKEN"],
+     )
+
+     username = api.whoami()["name"]
+
+     if "/" not in new_dataset_id:
+         new_dataset_id = username + "/" + new_dataset_id
+
+     repo = init_git_repo(new_dataset_id)
+
+     iterator = iter(
+         ds.map(
+             tokenize,
+             batched=True,
+             batch_size=map_batch_size,
+             fn_kwargs={
+                 "tokenizer": tokenizer,
+                 "column_name": column_name,
+                 "padding": "max_length" if opt_level == "O4" else True,
+             },
+             remove_columns=ds.column_names,
+         )
+     )
+
+     embeds = []
+     texts = []
+
+     # last_count keeps track of how many had been embedded since last push
+     last_count = 0
+     # current count keeps track of how many have been embedded in total
+     current_count = 0
+
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+     inference_bs = get_batch_size(torch.cuda.get_device_name(0), model_name, opt_level)
+
+     loop = True
+
+     # skip through some examples
+     if num2skip > 0:
+         [next(iterator) for _ in range(num2skip)]
+
+     start_time = time.time()
+     while loop:
+         batch = [next(iterator, None) for _ in range(inference_bs)]
+
+         # batch will have None values when iterator runs out
+         if batch[-1] is None:
+             batch = [x for x in batch if x is not None]
+             loop = False
+             if len(batch) == 0:
+                 break
+
+         ids = torch.tensor([b["input_ids"] for b in batch], device=device)
+         mask = torch.tensor([b["attention_mask"] for b in batch], device=device)
+         t_ids = torch.zeros_like(ids)
+
+         outputs = model(input_ids=ids, attention_mask=mask, token_type_ids=t_ids)
+
+         embeds.extend(mean_pooling(outputs[0], mask).cpu().tolist())
+         texts.extend([b[column_name] for b in batch])
+
+         current_count += len(batch)
+
+         # Check if we have embedded enough examples (num2embed == -1 means embed everything)
+         if num2embed > 0 and current_count >= num2embed:
+             diff = current_count - num2embed
+             if diff > 0:
+                 embeds = embeds[:-diff]
+                 texts = texts[:-diff]
+             current_count = num2embed
+             break
+
+         # Periodically upload to the hub
+         if len(embeds) > upload_batch_size:
+             push_to_repo(repo, last_count, current_count, embeds, texts)
+             embeds = []
+             texts = []
+             last_count = current_count
+
+         # Provide updates
+         if progress is not None:
+             progress(
+                 (current_count, None),
+                 "Embedding docs...",
+                 total=None,
+                 unit="Docs Embedded",
+             )
+
+     time_taken = time.time() - start_time
+
+     # If there are any remaining embeddings, upload them
+     if len(embeds) > 0:
+         push_to_repo(repo, last_count, current_count, embeds, texts)
+
+     return current_count, time_taken
+
+
+ def init_git_repo(repo_id: str):
+     """
+     Initialize a git repo for the new dataset.
+
+     ***Removes existing local folder if exists***
+
+     Args:
+         repo_id (`str`):
+             id of the new dataset to create. Should include username or organization.
+             e.g. nbroad/new-embeddings
+     """
+     local_dir = repo_id.replace("/", "_")
+
+     create_repo(
+         repo_id,
+         repo_type="dataset",
+         token=os.environ["HF_TOKEN"],
+         private=True,
+         exist_ok=True,
+     )
+     try:
+         repo = Repository(
+             local_dir=local_dir,
+             clone_from=repo_id,
+             repo_type="dataset",
+             token=os.environ["HF_TOKEN"],
+             skip_lfs_files=True,
+         )
+     except EnvironmentError:
+         shutil.rmtree(local_dir)
+         repo = Repository(
+             local_dir=local_dir,
+             clone_from=repo_id,
+             repo_type="dataset",
+             token=os.environ["HF_TOKEN"],
+             skip_lfs_files=True,
+         )
+
+     if repo is not None:
+         repo.git_pull()
+
+     return repo
+
+
+ def push_to_repo(
408
+ repo: str,
409
+ last_count: int,
410
+ current_count: int,
411
+ embeds: List[List[float]],
412
+ texts: List[str],
413
+ ):
414
+ """
415
+ Push embeddings to the repo.
416
+
417
+ Args:
418
+ repo (`huggingface_hub.Repository`):
419
+ repo to push to
420
+ last_count (`int`):
421
+ last count of embeddings.
422
+ This is the number of embeddings that have already been pushed.
423
+ current_count (`int`):
424
+ current count of embeddings.
425
+ This is the number of embeddings that have been pushed after this batch.
426
+ embeds (`List[List[float]]`):
427
+ list of embeddings to push to the repo
428
+ texts (`List[str]`):
429
+ list of texts to push to the repo
430
+ """
431
+
432
+ # TODO: write dataset loading script as well
433
+
434
+ temp_ds = Dataset.from_dict(
435
+ {
436
+ "embedding": embeds,
437
+ "text": texts,
438
+ }
439
+ )
440
+
441
+ data_dir = Path(repo.local_dir) / "data"
442
+ data_dir.mkdir(exist_ok=True, parents=True)
443
+
444
+ temp_ds.to_parquet(
445
+ str(data_dir / f"embeddings_{last_count}_{current_count}.parquet")
446
+ )
447
+
448
+ repo.push_to_hub(
449
+ commit_message=f"Embedded examples {last_count} thru {current_count}",
450
+ blocking=False,
451
+ auto_lfs_prune=True,
452
+ )
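Since `push_to_repo` writes plain parquet shards under `data/` in the new dataset repo (there is no loading script yet, per the TODO), the embeddings can later be read back with `datasets` alone. A rough sketch, assuming the app's default `nbroad/test-embeds` repo id and an authenticated session (the repo is created private):

```python
from datasets import load_dataset

# Hypothetical follow-up: read back the parquet shards that push_to_repo uploaded.
# The repo is private, so log in first (huggingface-cli login) or set a token.
embeds_ds = load_dataset("nbroad/test-embeds", split="train", use_auth_token=True)

print(embeds_ds)                        # two columns: "embedding" and "text"
first_vector = embeds_ds[0]["embedding"]
print(len(first_vector))                # hidden_size of the chosen model, e.g. 384
```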