Spaces:
Sleeping
Sleeping
File size: 2,794 Bytes
0551907 6f96f84 27a4aad d8a1f28 e5591d2 6f96f84 c2fe3af 03ee1c6 b7c9474 b837945 b7c9474 080b041 03ee1c6 080b041 03ee1c6 64a93dc 03ee1c6 080b041 79e06e3 1b710b0 03ee1c6 adea8c1 03ee1c6 779fc66 03ee1c6 6f96f84 03ee1c6 79e06e3 03ee1c6 79e06e3 c5c4414 03ee1c6 c5c4414 79e06e3 03ee1c6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
import os
import sys
from typing import Optional, Any

import torch
from fastapi import FastAPI, Request, Depends
from fastapi.responses import FileResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from huggingface_hub import login
from pydantic import BaseModel
from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextStreamer, GenerationConfig
# Check whether we are executing inside a Hugging Face Space.
# NOTE(review): the variables Spaces document as built-in are SPACE_ID,
# SPACE_REPO_NAME, etc. -- confirm SPACE_NAME is really set for this Space.
SPACE_NAME = os.getenv("SPACE_NAME", default=None)

# Resolve the access token first: it is needed both for the Hub login below
# and by the rest of the module, so a missing token is fatal everywhere.
try:
    HUGGINGFACE_TOKEN = os.environ['HF_ACCESS_TOKEN']
except KeyError:
    print('The environment variable "HF_ACCESS_TOKEN" is not found. Please configure it correctly in your Space.')
    sys.exit(1)

if SPACE_NAME is not None:
    print(f"Running inside {SPACE_NAME} Space.")
    try:
        # login() has no "automatically" flag; authenticate explicitly with
        # the token taken from the Space's environment.
        login(token=HUGGINGFACE_TOKEN)
    except Exception as e:
        # Startup boundary: any login failure is reported and aborts the app.
        print(f"Failed to auto-login ({str(e)}). Manually check the HF_ACCESS_TOKEN environment variable.")
        sys.exit(1)
# Packages and model loading
import torch

base_model_id = "152334H/miqu-1-70b-sf"

# Quantization settings: 4-bit NF4 weights with double quantization,
# computing in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# The quantized base model; device placement is left to device_map="auto".
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

# Tokenizer loading (taken from a different repository than the model weights)
eval_tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-70b-hf",
    add_bos_token=True,
    trust_remote_code=True,
    use_auth_token=True,
)

# Streamer bound to the tokenizer; handed to generate() by generate_text
streamer = TextStreamer(eval_tokenizer)

# App definition
app = FastAPI()
# Helper function to read raw request bodies
async def parse_raw(request: Request):
    """Await the body of *request* and return it exactly as received."""
    raw_body = await request.body()
    return raw_body
# Generate text
def generate_text(prompt: str, max_new_tokens: int = 4096) -> str:
    """Sample a completion for *prompt* from the module-level model.

    Args:
        prompt: Text to tokenize and feed to the model.
        max_new_tokens: Upper bound on the number of tokens to generate.
            Defaults to 4096, the value that used to be hard-coded.

    Returns:
        The first generated sequence decoded with special tokens removed.
        NOTE(review): for a causal LM this presumably starts with the prompt
        itself, since generate() returns prompt + continuation -- confirm
        that callers expect that.
    """
    # The model is placed with device_map="auto", so follow the model's own
    # device instead of assuming the inputs belong on "cuda".
    model_input = eval_tokenizer(prompt, return_tensors="pt").to(base_model.device)
    base_model.eval()
    with torch.no_grad():
        generated_sequences = base_model.generate(
            **model_input,
            max_new_tokens=max_new_tokens,
            repetition_penalty=1.1,
            do_sample=True,
            temperature=1,
            # Shared module-level streamer receives tokens during generation.
            streamer=streamer,
        )
    return eval_tokenizer.decode(generated_sequences[0], skip_special_tokens=True)
# Route for generating text
@app.post("/generate_text")
async def generate_text_route(data: bytes = Depends(parse_raw)):
    """Generate text for the prompt sent as the raw request body.

    The ``parse_raw`` dependency supplies the body as bytes. It is decoded
    as UTF-8 and passed to ``generate_text``.

    Returns:
        ``{"output": <generated text>}`` on success, or a 400 JSON response
        with an ``"error"`` message when the body is empty or is not valid
        UTF-8.
    """
    try:
        # parse_raw yields plain bytes, so decode it directly.
        input_text = data.decode("utf-8")
    except UnicodeDecodeError:
        # A malformed body is a client error, not a server failure.
        return JSONResponse({"error": "Request body is not valid UTF-8."}, status_code=400)
    if not input_text:
        return JSONResponse({"error": "Empty input received."}, status_code=400)
    return {"output": generate_text(input_text)}
# Mount static files: serve the local "static" directory under /static
app.mount("/static", StaticFiles(directory="static"), name="static")