from fastapi import FastAPI, Request, Depends
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles
import os
import sys

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextStreamer
from huggingface_hub import login

# Check whether we are executing inside a Hugging Face Space
SPACE_NAME = os.getenv("SPACE_NAME", default=None)

# The Hugging Face access token must be provided via the environment (e.g. as a Space secret)
try:
    HUGGINGFACE_TOKEN = os.environ["HF_ACCESS_TOKEN"]
except KeyError:
    print('The environment variable "HF_ACCESS_TOKEN" is not set. Please configure it in your Space.')
    sys.exit(1)

if SPACE_NAME is not None:
    print(f"Running inside {SPACE_NAME} Space.")
    try:
        # Log in to the Hugging Face Hub with the token from the environment
        login(token=HUGGINGFACE_TOKEN)
    except Exception as e:
        print(f"Failed to log in ({e}). Check the HF_ACCESS_TOKEN environment variable.")
        sys.exit(1)

# Model loading: miqu-1-70b in 4-bit NF4 quantization
base_model_id = "152334H/miqu-1-70b-sf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Tokenizer from Llama-2-70b (gated; requires the access token)
eval_tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-70b-hf",
    add_bos_token=True,
    trust_remote_code=True,
    token=HUGGINGFACE_TOKEN,
)

# Stream generated tokens to stdout as they are produced
streamer = TextStreamer(eval_tokenizer)

# App definition
app = FastAPI()

# Helper function to read raw request bodies
async def parse_raw(request: Request):
    return await request.body()

# Generate a completion for the given prompt
def generate_text(prompt: str) -> str:
    model_input = eval_tokenizer(prompt, return_tensors="pt").to("cuda")

    base_model.eval()
    with torch.no_grad():
        generated_sequences = base_model.generate(
            **model_input,
            max_new_tokens=4096,
            repetition_penalty=1.1,
            do_sample=True,
            temperature=1,
            streamer=streamer,
        )

    return eval_tokenizer.decode(generated_sequences[0], skip_special_tokens=True)

# Route for generating text from a raw text request body
@app.post("/generate_text")
async def generate_text_route(data: bytes = Depends(parse_raw)):
    input_text = data.decode("utf-8")
    if not input_text:
        return JSONResponse({"error": "Empty input received."}, status_code=400)

    return {"output": generate_text(input_text)}

# Mount static files
app.mount("/static", StaticFiles(directory="static"), name="static")
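
# Minimal usage sketch (illustrative, not part of the app itself): once the server is
# running, e.g. with `uvicorn app:app --host 0.0.0.0 --port 7860` (the module name
# "app" and port 7860 are assumptions), the endpoint accepts a raw text body:
#
#   curl -X POST http://localhost:7860/generate_text \
#        --data "Explain 4-bit NF4 quantization in one paragraph."
#
# The body is decoded as UTF-8, passed to generate_text(), and the completion is
# returned as {"output": "<generated text>"}.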