from fastapi import FastAPI, Request, Depends
from fastapi.responses import FileResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
from typing import Optional, Any
import os
import sys
import torch
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextStreamer,
    GenerationConfig,
)
from huggingface_hub import login
# Check whether we are executing inside a Hugging Face Space
SPACE_NAME = os.getenv("SPACE_NAME", default=None)

# The access token must be provided through the Space's secrets / environment variables
try:
    HUGGINGFACE_TOKEN = os.environ["HF_ACCESS_TOKEN"]
except KeyError:
    print('The environment variable "HF_ACCESS_TOKEN" is not found. Please configure it correctly in your Space.')
    sys.exit(1)

if SPACE_NAME is not None:
    print(f"Running inside {SPACE_NAME} Space.")
    try:
        # Log in to the Hugging Face Hub using the token from the Space's environment
        login(token=HUGGINGFACE_TOKEN)
    except Exception as e:
        print(f"Failed to log in ({str(e)}). Manually check the HF_ACCESS_TOKEN environment variable.")
        sys.exit(1)
# Model loading: 4-bit NF4 quantization so the 70B model fits in GPU memory
base_model_id = "152334H/miqu-1-70b-sf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
# Tokenizer loading: miqu-1-70b-sf reuses the Llama-2-70B tokenizer (gated repo, token required)
eval_tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-70b-hf",
    add_bos_token=True,
    trust_remote_code=True,
    token=HUGGINGFACE_TOKEN,
)
# Streamer that prints tokens to stdout as they are generated
streamer = TextStreamer(eval_tokenizer)
# App definition
app = FastAPI()

# Helper dependency that reads the raw request body as bytes
async def parse_raw(request: Request) -> bytes:
    return await request.body()
# Generate text from a prompt with the quantized model
def generate_text(prompt: str) -> str:
    model_input = eval_tokenizer(prompt, return_tensors="pt").to("cuda")
    base_model.eval()
    with torch.no_grad():
        generated_sequences = base_model.generate(
            **model_input,
            max_new_tokens=4096,
            repetition_penalty=1.1,
            do_sample=True,
            temperature=1.0,
            streamer=streamer,
        )
    return eval_tokenizer.decode(generated_sequences[0], skip_special_tokens=True)
# Route for generating text (the /generate path is an assumption; adjust it to your app)
@app.post("/generate")
async def generate_text_route(data: bytes = Depends(parse_raw)):
    input_text = data.decode("utf-8")
    if not input_text:
        return JSONResponse({"error": "Empty input received."}, status_code=400)
    return {"output": generate_text(input_text)}
# Mount static files
app.mount("/static", StaticFiles(directory="static"), name="static")