from fastapi import FastAPI, Request, Depends
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles
import os
import sys

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextStreamer
from huggingface_hub import login

# Check whether we are executing inside a Hugging Face Space
SPACE_NAME = os.getenv("SPACE_NAME")
if SPACE_NAME is not None:
    print(f"Running inside the {SPACE_NAME} Space.")

# The Hub access token must be provided as a (secret) environment variable
try:
    HUGGINGFACE_TOKEN = os.environ["HF_ACCESS_TOKEN"]
except KeyError:
    print('The environment variable "HF_ACCESS_TOKEN" was not found. Please configure it in your Space settings.')
    sys.exit(1)

try:
    # Log in to the Hugging Face Hub with the token from the environment
    login(token=HUGGINGFACE_TOKEN)
except Exception as e:
    print(f"Failed to log in to the Hugging Face Hub ({e}). Check the HF_ACCESS_TOKEN environment variable.")
    sys.exit(1)

# Model loading: 4-bit NF4 quantization via bitsandbytes so the 70B model fits in GPU memory
base_model_id = "152334H/miqu-1-70b-sf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Tokenizer loading (the model reuses the Llama-2-70B tokenizer, which is a gated repo)
eval_tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-70b-hf",
    add_bos_token=True,
    trust_remote_code=True,
    token=HUGGINGFACE_TOKEN,
)

# Streamer: prints tokens to stdout as they are generated (useful for Space logs)
streamer = TextStreamer(eval_tokenizer)

# App definition
app = FastAPI()


# Helper dependency that reads the raw request body as bytes
async def parse_raw(request: Request) -> bytes:
    return await request.body()


# Generate text from a plain-text prompt
def generate_text(prompt: str) -> str:
    model_input = eval_tokenizer(prompt, return_tensors="pt").to("cuda")
    base_model.eval()
    with torch.no_grad():
        generated_sequences = base_model.generate(
            **model_input,
            max_new_tokens=4096,
            repetition_penalty=1.1,
            do_sample=True,
            temperature=1.0,
            streamer=streamer,
        )
    return eval_tokenizer.decode(generated_sequences[0], skip_special_tokens=True)


# Route for generating text from the raw request body
@app.post("/generate_text")
async def generate_text_route(data: bytes = Depends(parse_raw)):
    input_text = data.decode("utf-8")
    if not input_text:
        return JSONResponse({"error": "Empty input received."}, status_code=400)
    return {"output": generate_text(input_text)}


# Mount static files
app.mount("/static", StaticFiles(directory="static"), name="static")
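
# Example usage (a sketch; assumes this file is named main.py and the server is run
# with uvicorn on port 7860, the port Hugging Face Spaces expects -- adjust to your setup):
#
#   uvicorn main:app --host 0.0.0.0 --port 7860
#   curl -X POST http://localhost:7860/generate_text --data "Write a haiku about GPUs."
#
# The endpoint treats the raw request body as UTF-8 text and responds with
# JSON of the form {"output": "<generated text>"}.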