Removed now-unused code which calls Hugging Face via the free HTTP interface - moved everything to endpoints to be equivalent.
- config/models.json +0 -14
- pages/005_LLM_Models.py +0 -61
- src/architectures.py +0 -38
- src/models.py +0 -73
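For context, the deleted `HFLlamaChatModel.__call__` (src/models.py below) posted prompts to the free serverless route at `https://api-inference.huggingface.co/models/<model-id>`. A dedicated Inference Endpoint accepts the same kind of text-generation payload at its own URL, which is what the commit message means by "moved everything to endpoints". The following is a minimal sketch of that endpoint-style call; the endpoint URL and function name are placeholders rather than code from this repo, and the response handling is hedged because the exact shape depends on the endpoint's container.

```python
# Minimal sketch of an endpoint-style call, assuming a text-generation model
# deployed as a dedicated Hugging Face Inference Endpoint. ENDPOINT_URL and
# query_endpoint are placeholders, not identifiers from this repository.
import requests

ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"  # placeholder


def query_endpoint(prompt: str, api_token: str,
                   max_new_tokens: int = 256, temperature: float = 1.0) -> str:
    headers = {"Authorization": f"Bearer {api_token}"}
    payload = {
        "inputs": prompt,
        "parameters": {"max_new_tokens": max_new_tokens, "temperature": temperature},
    }
    response = requests.post(ENDPOINT_URL, headers=headers, json=payload)
    response.raise_for_status()
    data = response.json()
    # Serverless-style containers return a list of generations; TGI containers
    # return a single dict, so handle both shapes here.
    if isinstance(data, list):
        data = data[0]
    return data["generated_text"].strip()
```

Since the payload format is shared between the two routes, the switch mainly changes which URL is called and how the model is hosted.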
config/models.json
DELETED
@@ -1,14 +0,0 @@
-{
-  "models": [
-    {
-      "name": "Llama2 Chat 7B",
-      "id": "meta-llama/Llama-2-7b-chat-hf",
-      "description": "The unmodified 7 billion parameter version of the llama 2 chat model from meta."
-    },
-    {
-      "name": "Llama2 Chat 13B",
-      "id": "meta-llama/Llama-2-13b-chat-hf",
-      "description": "The unmodified 13 billion parameter version of the llama 2 chat model from meta."
-    }
-  ]
-}
pages/005_LLM_Models.py
DELETED
@@ -1,61 +0,0 @@
-import streamlit as st
-
-from src.models import HFLlamaChatModel
-from src.st_helpers import st_setup
-
-if st_setup('LLM Models'):
-    st.write("# LLM Models")
-    st.write("The project uses a number of different models which are deployed with other components to form a variety of architectures. This page lists those models, and allows users to interact in isolation just with the model directly, excluding any other architecture components.")
-
-    if st.button('Force reload of models config'):
-        HFLlamaChatModel.load_configs()
-
-    SESSION_KEY_CHAT_SERVER = 'chat_server'
-    button_count = 0
-
-
-    def button_key() -> str:
-        global button_count
-        button_count += 1
-        return f"btn_{button_count}"
-
-    server_container = st.container()
-    chat_container = st.container()
-
-    with server_container:
-        server_count = len(HFLlamaChatModel.available_models())
-        if server_count == 1:
-            st.write(f'### 1 model configured')
-        else:
-            st.write(f'### {server_count} models configured')
-
-        with st.container():
-            st.divider()
-            for i, m in enumerate(HFLlamaChatModel.models):
-                with st.container():  # row
-                    content, actions = st.columns([4, 1])
-                    with content:
-                        st.write(f'**{m.name}** \n\n _{m.description}_')
-
-                    with actions:
-                        if st.button("Chat with this model", key=button_key()):
-                            st.session_state[SESSION_KEY_CHAT_SERVER] = m.name
-                            st.rerun()
-
-                st.divider()
-
-    if SESSION_KEY_CHAT_SERVER in st.session_state:
-        with chat_container:
-            st.write(f"### Chatting with {st.session_state[SESSION_KEY_CHAT_SERVER]}")
-            st.write(
-                "Note this is a simple single prompt call back to the relevant chat server. This is just a toy so you can interact with it and does not manage a chat session history.")
-            with st.chat_message("assistant"):
-                st.write("Chat with me in the box below")
-    if prompt := st.chat_input("Ask a question"):
-        with chat_container:
-            with st.chat_message("user"):
-                st.write(prompt)
-            chat_model = HFLlamaChatModel.for_name(st.session_state[SESSION_KEY_CHAT_SERVER])
-            response = chat_model(prompt)
-            with st.chat_message("assistant"):
-                st.write(response)
src/architectures.py
CHANGED
@@ -17,7 +17,6 @@ from typing import List, Optional
 from better_profanity import profanity
 
 from src.common import config_dir, data_dir, hf_api_token, escape_dollars
-from src.models import HFLlamaChatModel
 
 
 class ArchitectureRequest:
@@ -343,43 +342,6 @@ class RetrievalAugmentor(ArchitectureComponent):
         return desc
 
 
-class HFLlamaHttpRequestor(ArchitectureComponent):
-    """
-    A concrete pipeline component which sends the user text to a given llama chat based
-    model on hugging face.
-    """
-    description = "Passes the request to a model hosted on hugging face hub"
-
-    def __init__(self, model: str, system_prompt: str, max_tokens: int, temperature: float = 1.0):
-        self.model: str = model
-        self.system_prompt: str = system_prompt
-        self.max_tokens = max_tokens
-        self.api_token = hf_api_token()
-        self.temperature = temperature
-
-    def config_description(self) -> str:
-        """
-        Custom config details as markdown
-        """
-        desc = f"Model: {self.model}; "
-        desc += f"Max tokens: {self.max_tokens}; "
-        desc += f"Temperature: {self.temperature}; "
-        desc += f"System prompt: {self.system_prompt}"
-        return desc
-
-    def process_request(self, request: ArchitectureRequest) -> None:
-        """
-        Main processing method for this function. Calls the HTTP service for the model
-        by port if provided or attempting to lookup by name, and then adds this to the
-        response element of the request.
-        """
-        llm = HFLlamaChatModel.for_model(self.model)
-        if llm is None:
-            raise ValueError(f'No model {self.model} configured in the environment')
-        response = llm(request.request, system_prompt=self.system_prompt, max_new_tokens=self.max_tokens, temperature=self.temperature)
-        request.response = response
-
-
 class HFInferenceEndpoint(ArchitectureComponent):
     """
     A concrete pipeline component which sends the user text to a given llama chat based
src/models.py
DELETED
@@ -1,73 +0,0 @@
-import json
-import os
-import requests
-from typing import List
-
-from src.common import config_dir, hf_api_token
-
-
-class HFLlamaChatModel:
-    models = None
-
-    @classmethod
-    def load_configs(cls):
-        config_file = os.path.join(config_dir, "models.json")
-        with open(config_file, "r") as f:
-            configs = json.load(f)['models']
-        cls.models = []
-        for cfg in configs:
-            if cls.for_name(cfg['name']) is None:
-                cls.models.append(HFLlamaChatModel(cfg['name'], cfg['id'], cfg['description']))
-
-    @classmethod
-    def for_name(cls, name: str):
-        if cls.models is None:
-            cls.load_configs()
-        for m in cls.models:
-            if m.name == name:
-                return m
-
-    @classmethod
-    def for_model(cls, model: str):
-        if cls.models is None:
-            cls.load_configs()
-        for m in cls.models:
-            if m.id == model:
-                return m
-
-    @classmethod
-    def available_models(cls) -> List[str]:
-        if cls.models is None:
-            cls.load_configs()
-        return [m.name for m in cls.models]
-
-    def __init__(self, name: str, id: str, description: str):
-        self.name = name
-        self.id = id
-        self.description = description
-
-    def __call__(self,
-                 query: str,
-                 auth_token: str = None,
-                 system_prompt: str = None,
-                 max_new_tokens: str = 256,
-                 temperature: float = 1.0):
-        if auth_token is None:
-            auth_token = hf_api_token()  # Attempt look up if not provided
-        headers = {"Authorization": f"Bearer {auth_token}"}
-        api_url = f"https://api-inference.huggingface.co/models/{self.id}"
-        if system_prompt is None:
-            system_prompt = "You are a helpful assistant."
-        query_input = f"[INST] <<SYS>> {system_prompt} <<SYS>> {query} [/INST] "
-        query_payload = {
-            "inputs": query_input,
-            "parameters": {"max_new_tokens": max_new_tokens, "temperature": temperature}
-        }
-        response = requests.post(api_url, headers=headers, json=query_payload)
-        if response.status_code == 200:
-            resp_json = json.loads(response.text)
-            llm_text = resp_json[0]['generated_text'].strip()
-            return llm_text
-        else:
-            error_detail = f"Error from hugging face code: {response.status_code}: {response.reason} ({response.content})"
-            raise ValueError(error_detail)
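One detail worth noting in the deleted `__call__` above: `query_input` opens and closes the system block with the same `<<SYS>>` token. The reference Llama 2 chat format closes the block with `<</SYS>>`, roughly as in this sketch (the helper name is illustrative, not from the repo):

```python
# Reference Llama 2 chat prompt layout, for comparison with the deleted
# query_input string above; note the closing <</SYS>> tag and the newlines.
def llama2_prompt(system_prompt: str, user_message: str) -> str:
    return f"[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{user_message} [/INST]"
```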