from fastapi import FastAPI, Request, Depends
from fastapi.responses import FileResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
from typing import Optional, Any
import os
import sys
import torch
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextStreamer,
    GenerationConfig,
)
from huggingface_hub import login
# Check whether we are executing inside a Hugging Face Space
SPACE_NAME = os.getenv("SPACE_NAME", default=None)

# The access token must be provided through the Space's secrets / environment variables
try:
    HUGGINGFACE_TOKEN = os.environ["HF_ACCESS_TOKEN"]
except KeyError:
    print('The environment variable "HF_ACCESS_TOKEN" is not found. Please configure it correctly in your Space.')
    sys.exit(1)

if SPACE_NAME is not None:
    print(f"Running inside {SPACE_NAME} Space.")
    try:
        # Log in to the Hugging Face Hub using the token from the Space's environment
        login(token=HUGGINGFACE_TOKEN)
    except Exception as e:
        print(f"Failed to log in ({str(e)}). Manually check the HF_ACCESS_TOKEN environment variable.")
        sys.exit(1)
# Model loading: 4-bit NF4 quantization so the 70B model fits in GPU memory
base_model_id = "152334H/miqu-1-70b-sf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
# Tokenizer loading: miqu-1-70b-sf reuses the Llama-2-70B tokenizer (gated repo, token required)
eval_tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-70b-hf",
    add_bos_token=True,
    trust_remote_code=True,
    token=HUGGINGFACE_TOKEN,
)
# Streamer that prints tokens to stdout as they are generated
streamer = TextStreamer(eval_tokenizer)
# App definition
app = FastAPI()

# Helper dependency that reads the raw request body as bytes
async def parse_raw(request: Request) -> bytes:
    return await request.body()
# Generate text from a prompt with the quantized model
def generate_text(prompt: str) -> str:
    model_input = eval_tokenizer(prompt, return_tensors="pt").to("cuda")
    base_model.eval()
    with torch.no_grad():
        generated_sequences = base_model.generate(
            **model_input,
            max_new_tokens=4096,
            repetition_penalty=1.1,
            do_sample=True,
            temperature=1.0,
            streamer=streamer,
        )
    return eval_tokenizer.decode(generated_sequences[0], skip_special_tokens=True)
# Route for generating text (the /generate path is an assumption; adjust it to your app)
@app.post("/generate")
async def generate_text_route(data: bytes = Depends(parse_raw)):
    input_text = data.decode("utf-8")
    if not input_text:
        return JSONResponse({"error": "Empty input received."}, status_code=400)
    return {"output": generate_text(input_text)}
# Mount static files
app.mount("/static", StaticFiles(directory="static"), name="static")