import asyncio
import json
import logging
import os
import random

import aiohttp
import gradio as gr
import numpy as np
# NOTE(review): `spaces` is kept ahead of `torch`, matching the original
# import order -- confirm before letting a formatter re-sort these.
import spaces
import torch
from huggingface_hub import AsyncInferenceClient, InferenceClient
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    LlamaForCausalLM,
    LlamaTokenizer,
)

logger = logging.getLogger(__name__)


async def query_llm(payload, model_name):
    """POST *payload* to the hosted Inference API and return the decoded JSON.

    Args:
        payload: JSON-serialisable request body.
        model_name: Hub repository id appended to the inference endpoint URL.

    Returns:
        Whatever JSON document the endpoint answered with. The HTTP status is
        not inspected here, so this may be an error object.
    """
    token = os.getenv("HF_TOKEN")
    # Without a token, send no Authorization header at all rather than the
    # literal string "Bearer None".
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    url = f"https://api-inference.huggingface.co/models/{model_name}"
    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, json=payload) as response:
            return await response.json()


def _first_generated_text(output, model_name):
    """Return ``output[0]["generated_text"]`` or raise a descriptive error."""
    try:
        return output[0]["generated_text"]
    except (KeyError, IndexError, TypeError) as err:
        # Anything that is not a non-empty list of dicts carrying
        # "generated_text" ends up here; include the payload in the message
        # instead of surfacing a bare KeyError/TypeError.
        raise RuntimeError(
            f"Unexpected response from {model_name}: {output!r}"
        ) from err


async def _stream_chat(model_name, system_input, user_input, max_tokens=256):
    """Stream a chat completion from *model_name*, yielding text deltas."""
    client = AsyncInferenceClient(model_name, token=os.getenv("HF_TOKEN"))
    stream = await client.chat_completion(
        messages=[
            {"role": "system", "content": system_input},
            {"role": "user", "content": user_input},
        ],
        max_tokens=max_tokens,
        stream=True,
    )
    async for message in stream:
        content = message.choices[0].delta.content
        # Skip deltas that carry no text so consumers only ever receive str.
        if content is not None:
            yield content


async def generate_mistral_7bvo1(system_input, user_input):
    """Stream a reply from ``mistralai/Mistral-7B-Instruct-v0.1``.

    Yields:
        Text fragments of the completion, in arrival order.
    """
    async for chunk in _stream_chat(
        "mistralai/Mistral-7B-Instruct-v0.1", system_input, user_input
    ):
        yield chunk


async def generate_t5(system_input, user_input):
    """Yield one completion from ``google/flan-t5-large``.

    The prompt is the system and user inputs joined by a newline.

    Raises:
        RuntimeError: If the response has no ``generated_text`` entry.
    """
    model_name = "google/flan-t5-large"
    output = await query_llm(
        {"inputs": f"{system_input}\n{user_input}"},
        model_name,
    )
    yield _first_generated_text(output, model_name)


async def generate_gpt2(system_input, user_input):
    """Yield one completion from ``openai-community/gpt2``.

    The prompt is the system and user inputs joined by a newline. The
    returned text is cut to its first 532 characters.

    Raises:
        RuntimeError: If the response has no ``generated_text`` entry.
    """
    model_name = "openai-community/gpt2"
    output = await query_llm(
        {"inputs": f"{system_input}\n{user_input}"},
        model_name,
    )
    yield _first_generated_text(output, model_name)[:532]


async def generate_llama2(system_input, user_input):
    """Stream a reply from ``meta-llama/Llama-2-7b-chat-hf``.

    Yields:
        Text fragments of the completion, in arrival order.
    """
    async for chunk in _stream_chat(
        "meta-llama/Llama-2-7b-chat-hf", system_input, user_input
    ):
        yield chunk


def _complete_on_gpu(tokenizer, model, system_input, user_input):
    """Tokenise the joined prompt, generate on CUDA, and decode the result."""
    input_text = f"{system_input}\n{user_input}"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
    # NOTE(review): per the transformers docs, max_length counts the prompt
    # tokens as well as the generated ones -- confirm 128 is the intended cap.
    output = model.generate(input_ids, max_length=128)
    return tokenizer.decode(output[0], skip_special_tokens=True)


@spaces.GPU(duration=120)
def generate_openllama(system_input, user_input):
    """Generate text with ``openlm-research/open_llama_3b_v2`` on CUDA.

    The tokenizer and fp16 model are loaded on every call.

    Returns:
        The decoded output sequence with special tokens removed.
    """
    model_path = 'openlm-research/open_llama_3b_v2'
    tokenizer = LlamaTokenizer.from_pretrained(model_path)
    model = LlamaForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device_map='cuda',
    )
    print('model openllama loaded')
    return _complete_on_gpu(tokenizer, model, system_input, user_input)


@spaces.GPU(duration=120)
def generate_bloom(system_input, user_input):
    """Generate text with ``bigscience/bloom-7b1`` on CUDA.

    The tokenizer and fp16 model are loaded on every call.

    Returns:
        The decoded output sequence with special tokens removed.
    """
    model_path = 'bigscience/bloom-7b1'
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device_map='cuda',
    )
    return _complete_on_gpu(tokenizer, model, system_input, user_input)


async def generate_llama3(system_input, user_input):
    """Stream a reply from ``meta-llama/Meta-Llama-3.1-8B-Instruct``.

    A ``json.JSONDecodeError`` raised while streaming ends the stream instead
    of propagating; it is logged so the truncation is visible.

    Yields:
        Text fragments of the completion, in arrival order.
    """
    try:
        async for chunk in _stream_chat(
            "meta-llama/Meta-Llama-3.1-8B-Instruct", system_input, user_input
        ):
            yield chunk
    except json.JSONDecodeError as err:
        # Best-effort: keep whatever was already streamed, but leave a trace.
        logger.warning("Llama 3.1 stream ended on undecodable JSON: %s", err)