Spaces:

freQuensy23
/

LLMhistory

Running

App Files Files Community

LLMhistory / generators.py

freQuensy23

Fix torch

e4bfc4a 11 months ago

raw

history blame

3.59 kB

	import asyncio
	import json
	import os

	import aiohttp
	import gradio as gr
	import numpy as np
	import spaces
	from huggingface_hub import InferenceClient

	import random
	import torch
	from huggingface_hub import AsyncInferenceClient
	from transformers import LlamaTokenizer, LlamaForCausalLM, AutoTokenizer


	async def query_llm(payload, model_name):
	    """POST ``payload`` as JSON to the hosted inference endpoint of ``model_name``.

	    The request carries an ``Authorization: Bearer`` header built from the
	    ``HF_TOKEN`` environment variable, and the decoded JSON body of the
	    response is returned.
	    """
	    auth = {"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"}
	    endpoint = f"https://api-inference.huggingface.co/models/{model_name}"
	    # A single statement opens the session and issues the request; both are
	    # released again once the response body has been decoded.
	    async with aiohttp.ClientSession() as http, http.post(
	        endpoint, headers=auth, json=payload
	    ) as reply:
	        return await reply.json()


	async def generate_mistral_7bvo1(system_input, user_input):
	    """Stream a chat reply from ``mistralai/Mistral-7B-Instruct-v0.1``.

	    The conversation consists of one system message and one user message.
	    For every streamed chunk, the ``delta.content`` of its first choice is
	    yielded as-is. The request is made with ``max_tokens=256``.
	    """
	    conversation = [
	        {"role": "system", "content": system_input},
	        {"role": "user", "content": user_input},
	    ]
	    mistral = AsyncInferenceClient(
	        "mistralai/Mistral-7B-Instruct-v0.1",
	        token=os.getenv('HF_TOKEN'),
	    )
	    # With stream=True the awaited call hands back an async iterator of chunks.
	    stream = await mistral.chat_completion(
	        messages=conversation,
	        max_tokens=256,
	        stream=True,
	    )
	    async for chunk in stream:
	        yield chunk.choices[0].delta.content


	async def generate_gpt2(system_input, user_input):
	    """Yield a single GPT-2 completion for the system and user text.

	    The two inputs are joined with a newline and sent through ``query_llm``
	    to ``openai-community/gpt2``. Every occurrence of that prompt is removed
	    from the returned ``generated_text`` before it is yielded as one item.
	    """
	    prompt = f"{system_input}\n{user_input}"
	    response = await query_llm({"inputs": prompt}, "openai-community/gpt2")
	    generated = response[0]["generated_text"]
	    yield generated.replace(prompt, '')


	async def generate_llama2(system_input, user_input):
	    """Stream a chat reply from ``meta-llama/Llama-2-7b-chat-hf``.

	    Sends one system message and one user message, requests a streamed
	    completion with ``max_tokens=256``, and yields the ``delta.content`` of
	    the first choice of each chunk in arrival order.
	    """
	    hf_token = os.getenv('HF_TOKEN')
	    llama = AsyncInferenceClient("meta-llama/Llama-2-7b-chat-hf", token=hf_token)
	    request = {
	        "messages": [
	            {"role": "system", "content": system_input},
	            {"role": "user", "content": user_input},
	        ],
	        "max_tokens": 256,
	        "stream": True,
	    }
	    chunks = await llama.chat_completion(**request)
	    async for piece in chunks:
	        yield piece.choices[0].delta.content


	@spaces.GPU(duration=120)
	async def generate_openllama(system_input, user_input):
	    """Run ``openlm-research/open_llama_3b_v2`` on CUDA and return its output.

	    The tokenizer and the float16 model are loaded on every call. The prompt
	    is the system text, a newline, then the user text. ``generate`` is called
	    with ``max_length=128`` and the first returned sequence is decoded with
	    special tokens skipped.
	    """
	    checkpoint = 'openlm-research/open_llama_3b_v2'
	    tok = LlamaTokenizer.from_pretrained(checkpoint)
	    llm = LlamaForCausalLM.from_pretrained(
	        checkpoint,
	        torch_dtype=torch.float16,
	        device_map='cuda',
	    )
	    prompt = f"{system_input}\n{user_input}"
	    encoded = tok(prompt, return_tensors="pt")
	    # The token ids must live on the same device as the model.
	    prompt_ids = encoded.input_ids.to("cuda")
	    sequences = llm.generate(prompt_ids, max_length=128)
	    return tok.decode(sequences[0], skip_special_tokens=True)


	@spaces.GPU(duration=120)
	async def generate_bloom(system_input, user_input):
	    """Run ``bigscience/bloom-7b1`` on CUDA and return the decoded output.

	    The tokenizer and the float16 model are loaded on every call. The prompt
	    is the system text, a newline, then the user text. ``generate`` is called
	    with ``max_length=128`` and the first returned sequence is decoded with
	    special tokens skipped.

	    Args:
	        system_input: Text placed first in the prompt.
	        user_input: Text placed after the newline in the prompt.

	    Returns:
	        The decoded text of the first generated sequence.
	    """
	    # Local import keeps this function self-contained; ``transformers`` is
	    # already a dependency of this module.
	    from transformers import AutoModelForCausalLM

	    model_path = 'bigscience/bloom-7b1'
	    tokenizer = AutoTokenizer.from_pretrained(model_path)
	    # BLOOM is not a LLaMA architecture, so the checkpoint must not be loaded
	    # through LlamaForCausalLM. The Auto class picks the model class that
	    # matches the checkpoint's own configuration.
	    model = AutoModelForCausalLM.from_pretrained(
	        model_path, torch_dtype=torch.float16, device_map='cuda',
	    )
	    input_text = f"{system_input}\n{user_input}"
	    # The token ids must live on the same device as the model.
	    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
	    output = model.generate(input_ids, max_length=128)
	    return tokenizer.decode(output[0], skip_special_tokens=True)



	async def generate_llama3(system_input, user_input):
	    """Stream a chat reply from ``meta-llama/Meta-Llama-3.1-8B-Instruct``.

	    Sends one system message and one user message, requests a streamed
	    completion with ``max_tokens=256``, and yields the ``delta.content`` of
	    the first choice of each chunk.

	    A ``json.JSONDecodeError`` raised while consuming the stream ends the
	    generator without propagating. Everything yielded so far stays valid,
	    and the failure is logged instead of being dropped silently.

	    Args:
	        system_input: Content of the system message.
	        user_input: Content of the user message.

	    Yields:
	        The ``delta.content`` value of each streamed chunk.
	    """
	    # Local import keeps this function self-contained.
	    import logging

	    client = AsyncInferenceClient(
	        "meta-llama/Meta-Llama-3.1-8B-Instruct",
	        token=os.getenv('HF_TOKEN')
	    )
	    try:
	        async for message in await client.chat_completion(
	            messages=[
	                {"role": "system", "content": system_input},
	                {"role": "user", "content": user_input},
	            ],
	            max_tokens=256,
	            stream=True,
	        ):
	            yield message.choices[0].delta.content
	    except json.JSONDecodeError as exc:
	        # Deliberately best effort: an undecodable chunk just ends the stream.
	        # Logging it keeps truncated replies diagnosable.
	        logging.getLogger(__name__).warning(
	            "Llama 3.1 stream stopped on an undecodable chunk: %s", exc
	        )