# src/models.py
"""Configuration-driven access to Llama chat models hosted on Hugging Face,
backing the application's end-user test chat page."""

import json
import os
from typing import List, Optional

import requests

from src.common import config_dir


class HFLlamaChatModel:
    """A chat model hosted on the Hugging Face Inference API, as described by
    an entry in the models.json config file."""

    models = None  # Lazily-loaded list of configured HFLlamaChatModel instances

    @classmethod
    def load_configs(cls):
        config_file = os.path.join(config_dir, "models.json")
        with open(config_file, "r") as f:
            configs = json.load(f)['models']
        cls.models = []
        for cfg in configs:
            # Only register each configured model name once
            if cls.get_model(cfg['name']) is None:
                cls.models.append(HFLlamaChatModel(cfg['name'], cfg['id'], cfg['description']))
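
    # For reference, a sketch of the expected config file shape, inferred from
    # the keys read above ('name', 'id', 'description'); the real
    # config/models.json may differ, and the model id here is only illustrative:
    #
    # {
    #     "models": [
    #         {
    #             "name": "llama-2-7b-chat",
    #             "id": "meta-llama/Llama-2-7b-chat-hf",
    #             "description": "Llama 2 7B chat-tuned model"
    #         }
    #     ]
    # }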

    @classmethod
    def get_model(cls, model: str) -> Optional["HFLlamaChatModel"]:
        # Guard against lookups before the configs have been loaded
        if cls.models is None:
            cls.load_configs()
        for m in cls.models:
            if m.name == model:
                return m

    @classmethod
    def available_models(cls) -> List[str]:
        if cls.models is None:
            cls.load_configs()
        return [m.name for m in cls.models]

    def __init__(self, name: str, id: str, description: str):
        self.name = name
        self.id = id
        self.description = description

    def __call__(self,
                 query: str,
                 auth_token: str,
                 system_prompt: Optional[str] = None,
                 max_new_tokens: int = 256,
                 temperature: float = 1.0):
        headers = {"Authorization": f"Bearer {auth_token}"}
        api_url = f"https://api-inference.huggingface.co/models/{self.id}"
        if system_prompt is None:
            system_prompt = "You are a helpful assistant."
        # Llama 2 chat prompt format; note the system block closes with <</SYS>>
        query_input = f"[INST] <<SYS>> {system_prompt} <</SYS>> {query} [/INST] "
        query_payload = {
            "inputs": query_input,
            "parameters": {"max_new_tokens": max_new_tokens, "temperature": temperature}
        }
        print(query_payload)
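        # For a text-generation model the API is expected to return a JSON
        # list like [{"generated_text": "<prompt followed by completion>"}],
        # which the handling below relies on.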
        response = requests.post(api_url, headers=headers, json=query_payload)
        if response.status_code == 200:
            resp_json = response.json()
            llm_text = resp_json[0]['generated_text']
            # The prompt is echoed back in generated_text, so strip it off
            query_len = len(query_input)
            llm_text = llm_text[query_len:].strip()
            return llm_text
        else:
            error_detail = f"Error from Hugging Face API: {response.status_code}: {response.reason} ({response.content})"
            raise ValueError(error_detail)
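

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. Assumes at least
    # one model is configured in config/models.json and that a valid Hugging
    # Face API token is available in the (hypothetical) HF_API_TOKEN
    # environment variable.
    token = os.environ["HF_API_TOKEN"]
    names = HFLlamaChatModel.available_models()
    print(f"Available models: {names}")
    model = HFLlamaChatModel.get_model(names[0])
    print(model("What is the capital of France?", auth_token=token))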