Removed now-unused code which calls Hugging Face via the free HTTP interface - moved everything to endpoints to be equivalent.
- config/models.json +0 -14
- pages/005_LLM_Models.py +0 -61
- src/architectures.py +0 -38
- src/models.py +0 -73
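For context, the deleted `HFLlamaChatModel.__call__` (src/models.py below) posted prompts to the free serverless route at `https://api-inference.huggingface.co/models/<model-id>`. A dedicated Inference Endpoint accepts the same kind of text-generation payload at its own URL, which is what the commit message means by "moved everything to endpoints". The following is a minimal sketch of that endpoint-style call; the endpoint URL and function name are placeholders rather than code from this repo, and the response handling is hedged because the exact shape depends on the endpoint's container.

```python
# Minimal sketch of an endpoint-style call, assuming a text-generation model
# deployed as a dedicated Hugging Face Inference Endpoint. ENDPOINT_URL and
# query_endpoint are placeholders, not identifiers from this repository.
import requests

ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"  # placeholder


def query_endpoint(prompt: str, api_token: str,
                   max_new_tokens: int = 256, temperature: float = 1.0) -> str:
    headers = {"Authorization": f"Bearer {api_token}"}
    payload = {
        "inputs": prompt,
        "parameters": {"max_new_tokens": max_new_tokens, "temperature": temperature},
    }
    response = requests.post(ENDPOINT_URL, headers=headers, json=payload)
    response.raise_for_status()
    data = response.json()
    # Serverless-style containers return a list of generations; TGI containers
    # return a single dict, so handle both shapes here.
    if isinstance(data, list):
        data = data[0]
    return data["generated_text"].strip()
```

Since the payload format is shared between the two routes, the switch mainly changes which URL is called and how the model is hosted.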
config/models.json
DELETED
@@ -1,14 +0,0 @@
-{
-  "models": [
-    {
-      "name": "Llama2 Chat 7B",
-      "id": "meta-llama/Llama-2-7b-chat-hf",
-      "description": "The unmodified 7 billion parameter version of the llama 2 chat model from meta."
-    },
-    {
-      "name": "Llama2 Chat 13B",
-      "id": "meta-llama/Llama-2-13b-chat-hf",
-      "description": "The unmodified 13 billion parameter version of the llama 2 chat model from meta."
-    }
-  ]
-}
pages/005_LLM_Models.py
DELETED
@@ -1,61 +0,0 @@
-import streamlit as st
-
-from src.models import HFLlamaChatModel
-from src.st_helpers import st_setup
-
-if st_setup('LLM Models'):
-    st.write("# LLM Models")
-    st.write("The project uses a number of different models which are deployed with other components to form a variety of architectures. This page lists those models, and allows users to interact in isolation just with the model directly, excluding any other architecture components.")
-
-    if st.button('Force reload of models config'):
-        HFLlamaChatModel.load_configs()
-
-    SESSION_KEY_CHAT_SERVER = 'chat_server'
-    button_count = 0
-
-
-    def button_key() -> str:
-        global button_count
-        button_count += 1
-        return f"btn_{button_count}"
-
-    server_container = st.container()
-    chat_container = st.container()
-
-    with server_container:
-        server_count = len(HFLlamaChatModel.available_models())
-        if server_count == 1:
-            st.write(f'### 1 model configured')
-        else:
-            st.write(f'### {server_count} models configured')
-
-        with st.container():
-            st.divider()
-            for i, m in enumerate(HFLlamaChatModel.models):
-                with st.container():  # row
-                    content, actions = st.columns([4, 1])
-                    with content:
-                        st.write(f'**{m.name}** \n\n _{m.description}_')
-
-                    with actions:
-                        if st.button("Chat with this model", key=button_key()):
-                            st.session_state[SESSION_KEY_CHAT_SERVER] = m.name
-                            st.rerun()
-
-                st.divider()
-
-    if SESSION_KEY_CHAT_SERVER in st.session_state:
-        with chat_container:
-            st.write(f"### Chatting with {st.session_state[SESSION_KEY_CHAT_SERVER]}")
-            st.write(
-                "Note this is a simple single prompt call back to the relevant chat server. This is just a toy so you can interact with it and does not manage a chat session history.")
-            with st.chat_message("assistant"):
-                st.write("Chat with me in the box below")
-    if prompt := st.chat_input("Ask a question"):
-        with chat_container:
-            with st.chat_message("user"):
-                st.write(prompt)
-            chat_model = HFLlamaChatModel.for_name(st.session_state[SESSION_KEY_CHAT_SERVER])
-            response = chat_model(prompt)
-            with st.chat_message("assistant"):
-                st.write(response)
src/architectures.py
CHANGED
@@ -17,7 +17,6 @@ from typing import List, Optional
 from better_profanity import profanity
 
 from src.common import config_dir, data_dir, hf_api_token, escape_dollars
-from src.models import HFLlamaChatModel
 
 
 class ArchitectureRequest:
@@ -343,43 +342,6 @@ class RetrievalAugmentor(ArchitectureComponent):
         return desc
 
 
-class HFLlamaHttpRequestor(ArchitectureComponent):
-    """
-    A concrete pipeline component which sends the user text to a given llama chat based
-    model on hugging face.
-    """
-    description = "Passes the request to a model hosted on hugging face hub"
-
-    def __init__(self, model: str, system_prompt: str, max_tokens: int, temperature: float = 1.0):
-        self.model: str = model
-        self.system_prompt: str = system_prompt
-        self.max_tokens = max_tokens
-        self.api_token = hf_api_token()
-        self.temperature = temperature
-
-    def config_description(self) -> str:
-        """
-        Custom config details as markdown
-        """
-        desc = f"Model: {self.model}; "
-        desc += f"Max tokens: {self.max_tokens}; "
-        desc += f"Temperature: {self.temperature}; "
-        desc += f"System prompt: {self.system_prompt}"
-        return desc
-
-    def process_request(self, request: ArchitectureRequest) -> None:
-        """
-        Main processing method for this function. Calls the HTTP service for the model
-        by port if provided or attempting to lookup by name, and then adds this to the
-        response element of the request.
-        """
-        llm = HFLlamaChatModel.for_model(self.model)
-        if llm is None:
-            raise ValueError(f'No model {self.model} configured in the environment')
-        response = llm(request.request, system_prompt=self.system_prompt, max_new_tokens=self.max_tokens, temperature=self.temperature)
-        request.response = response
-
-
 class HFInferenceEndpoint(ArchitectureComponent):
     """
     A concrete pipeline component which sends the user text to a given llama chat based
src/models.py
DELETED
@@ -1,73 +0,0 @@
-import json
-import os
-import requests
-from typing import List
-
-from src.common import config_dir, hf_api_token
-
-
-class HFLlamaChatModel:
-    models = None
-
-    @classmethod
-    def load_configs(cls):
-        config_file = os.path.join(config_dir, "models.json")
-        with open(config_file, "r") as f:
-            configs = json.load(f)['models']
-        cls.models = []
-        for cfg in configs:
-            if cls.for_name(cfg['name']) is None:
-                cls.models.append(HFLlamaChatModel(cfg['name'], cfg['id'], cfg['description']))
-
-    @classmethod
-    def for_name(cls, name: str):
-        if cls.models is None:
-            cls.load_configs()
-        for m in cls.models:
-            if m.name == name:
-                return m
-
-    @classmethod
-    def for_model(cls, model: str):
-        if cls.models is None:
-            cls.load_configs()
-        for m in cls.models:
-            if m.id == model:
-                return m
-
-    @classmethod
-    def available_models(cls) -> List[str]:
-        if cls.models is None:
-            cls.load_configs()
-        return [m.name for m in cls.models]
-
-    def __init__(self, name: str, id: str, description: str):
-        self.name = name
-        self.id = id
-        self.description = description
-
-    def __call__(self,
-                 query: str,
-                 auth_token: str = None,
-                 system_prompt: str = None,
-                 max_new_tokens: str = 256,
-                 temperature: float = 1.0):
-        if auth_token is None:
-            auth_token = hf_api_token()  # Attempt look up if not provided
-        headers = {"Authorization": f"Bearer {auth_token}"}
-        api_url = f"https://api-inference.huggingface.co/models/{self.id}"
-        if system_prompt is None:
-            system_prompt = "You are a helpful assistant."
-        query_input = f"[INST] <<SYS>> {system_prompt} <<SYS>> {query} [/INST] "
-        query_payload = {
-            "inputs": query_input,
-            "parameters": {"max_new_tokens": max_new_tokens, "temperature": temperature}
-        }
-        response = requests.post(api_url, headers=headers, json=query_payload)
-        if response.status_code == 200:
-            resp_json = json.loads(response.text)
-            llm_text = resp_json[0]['generated_text'].strip()
-            return llm_text
-        else:
-            error_detail = f"Error from hugging face code: {response.status_code}: {response.reason} ({response.content})"
-            raise ValueError(error_detail)
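One detail worth noting in the deleted `__call__` above: `query_input` opens and closes the system block with the same `<<SYS>>` token. The reference Llama 2 chat format closes the block with `<</SYS>>`, roughly as in this sketch (the helper name is illustrative, not from the repo):

```python
# Reference Llama 2 chat prompt layout, for comparison with the deleted
# query_input string above; note the closing <</SYS>> tag and the newlines.
def llama2_prompt(system_prompt: str, user_message: str) -> str:
    return f"[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{user_message} [/INST]"
```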