Spaces:

freQuensy23
/

LLMhistory

Sleeping

App Files Files Community

LLMhistory / generators.py

freQuensy23

INIT

c0be431 4 months ago

raw

history blame

3.56 kB

	import asyncio
	import json
	import os

	import aiohttp
	import gradio as gr
	import numpy as np
	import spaces
	from huggingface_hub import InferenceClient

	import random
	import torch
	from huggingface_hub import AsyncInferenceClient
	from transformers import LlamaTokenizer, LlamaForCausalLM, AutoTokenizer


	async def query_llm(payload, model_name):
	    """POST ``payload`` as JSON to the hosted inference endpoint of ``model_name``.

	    The bearer token is read from the ``HF_TOKEN`` environment variable on
	    every call. The decoded JSON body of the response is returned as-is;
	    the HTTP status code is not inspected here.
	    """
	    auth = {"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"}
	    async with aiohttp.ClientSession() as http:
	        request = http.post(
	            f"https://api-inference.huggingface.co/models/{model_name}",
	            headers=auth,
	            json=payload,
	        )
	        async with request as reply:
	            return await reply.json()


	async def generate_mistral_7bvo1(system_input, user_input):
	    """Stream a chat completion from ``mistralai/Mistral-7B-Instruct-v0.1``.

	    ``system_input`` and ``user_input`` are sent as the system and user
	    messages with ``max_tokens=256`` and ``stream=True``. For every streamed
	    chunk, the ``delta.content`` of its first choice is yielded, in order.
	    """
	    hf_client = AsyncInferenceClient(
	        "mistralai/Mistral-7B-Instruct-v0.1",
	        token=os.getenv('HF_TOKEN'),
	    )
	    conversation = [
	        {"role": "system", "content": system_input},
	        {"role": "user", "content": user_input},
	    ]
	    stream = await hf_client.chat_completion(
	        messages=conversation,
	        max_tokens=256,
	        stream=True,
	    )
	    async for chunk in stream:
	        first_choice = chunk.choices[0]
	        yield first_choice.delta.content


	async def generate_gpt2(system_input, user_input):
	    """Generate a continuation with ``openai-community/gpt2`` and yield it once.

	    The prompt is ``system_input`` and ``user_input`` joined by a newline.
	    It is sent through ``query_llm`` and the ``generated_text`` of the first
	    result is yielded with the echoed prompt removed from its start.

	    Only a leading copy of the prompt is stripped. Removing every occurrence
	    (as ``str.replace`` would) corrupts the output for short prompts: with
	    two empty inputs the prompt is a bare newline, and every line break in
	    the generated text would be deleted.
	    """
	    prompt = f"{system_input}\n{user_input}"
	    output = await query_llm({"inputs": prompt}, "openai-community/gpt2")
	    text = output[0]["generated_text"]
	    if text.startswith(prompt):
	        # Drop the echoed prompt; leave later repetitions of it untouched.
	        text = text[len(prompt):]
	    yield text


	async def generate_llama2(system_input, user_input):
	    """Stream a chat completion from ``meta-llama/Llama-2-7b-chat-hf``.

	    ``system_input`` becomes the system message and ``user_input`` the user
	    message. The request uses ``max_tokens=256`` with streaming enabled, and
	    each chunk's first-choice ``delta.content`` is yielded as it arrives.
	    """
	    llama_client = AsyncInferenceClient(
	        "meta-llama/Llama-2-7b-chat-hf",
	        token=os.getenv('HF_TOKEN')
	    )
	    system_turn = {"role": "system", "content": system_input}
	    user_turn = {"role": "user", "content": user_input}
	    chunks = await llama_client.chat_completion(
	        messages=[system_turn, user_turn],
	        max_tokens=256,
	        stream=True,
	    )
	    async for piece in chunks:
	        yield piece.choices[0].delta.content


	@spaces.GPU
	async def generate_openllama(system_input, user_input):
	    """Run ``openlm-research/open_llama_3b_v2`` on CUDA and return the decoded text.

	    The tokenizer and model are loaded with ``from_pretrained`` on every call
	    (float16 weights, ``device_map='cuda'``). The prompt is ``system_input``
	    and ``user_input`` joined by a newline. Generation runs with
	    ``max_length=128`` and the first returned sequence is decoded with
	    special tokens skipped.
	    """
	    checkpoint = 'openlm-research/open_llama_3b_v2'
	    tok = LlamaTokenizer.from_pretrained(checkpoint)
	    llm = LlamaForCausalLM.from_pretrained(
	        checkpoint,
	        torch_dtype=torch.float16,
	        device_map='cuda',
	    )
	    prompt = f"{system_input}\n{user_input}"
	    encoded = tok(prompt, return_tensors="pt")
	    prompt_ids = encoded.input_ids.to("cuda")
	    generated = llm.generate(prompt_ids, max_length=128)
	    first_sequence = generated[0]
	    return tok.decode(first_sequence, skip_special_tokens=True)


	@spaces.GPU
	async def generate_bloom(system_input, user_input):
	    """Run ``bigscience/bloom-7b1`` on CUDA and return the decoded text.

	    The tokenizer and model are loaded with ``from_pretrained`` on every call
	    (float16 weights, ``device_map='cuda'``). The prompt is ``system_input``
	    and ``user_input`` joined by a newline. Generation runs with
	    ``max_length=128`` and the first returned sequence is decoded with
	    special tokens skipped.
	    """
	    # Local import: the file's visible top-level imports do not bring in
	    # AutoModelForCausalLM.
	    from transformers import AutoModelForCausalLM

	    model_path = 'bigscience/bloom-7b1'
	    tokenizer = AutoTokenizer.from_pretrained(model_path)
	    # BLOOM is not a Llama-architecture checkpoint, so let the Auto class
	    # select the model class from the checkpoint's config instead of forcing
	    # LlamaForCausalLM onto it.
	    model = AutoModelForCausalLM.from_pretrained(
	        model_path, torch_dtype=torch.float16, device_map='cuda',
	    )
	    input_text = f"{system_input}\n{user_input}"
	    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
	    # NOTE(review): per the transformers generation docs, max_length counts
	    # the prompt tokens as well as the newly generated ones -- confirm that
	    # 128 is the intended total budget.
	    output = model.generate(input_ids, max_length=128)
	    return tokenizer.decode(output[0], skip_special_tokens=True)



	async def generate_llama3(system_input, user_input):
	    """Stream a chat completion from ``meta-llama/Meta-Llama-3.1-8B-Instruct``.

	    ``system_input`` and ``user_input`` are sent as the system and user
	    messages with ``max_tokens=256`` and ``stream=True``. Each chunk's
	    first-choice ``delta.content`` is yielded in order. A
	    ``json.JSONDecodeError`` raised while opening or consuming the stream
	    ends the generator quietly instead of propagating.
	    """
	    llama_client = AsyncInferenceClient(
	        "meta-llama/Meta-Llama-3.1-8B-Instruct",
	        token=os.getenv('HF_TOKEN')
	    )
	    conversation = [
	        {"role": "system", "content": system_input},
	        {"role": "user", "content": user_input},
	    ]
	    try:
	        stream = await llama_client.chat_completion(
	            messages=conversation,
	            max_tokens=256,
	            stream=True,
	        )
	        async for chunk in stream:
	            yield chunk.choices[0].delta.content
	    except json.JSONDecodeError:
	        # NOTE(review): presumably a workaround for a stream chunk that is
	        # not valid JSON -- confirm before removing. Output yielded so far
	        # is kept and the stream simply stops here.
	        return