alfraser committed on
Commit 6000142 · 1 Parent(s): 0cd8882

Added a new architecture component which calls Hugging Face via a dedicated inference endpoint rather than the HTTP interface - needed due to the size of the fine-tuned model

Files changed (1)
src/architectures.py +47 -1
src/architectures.py CHANGED
@@ -7,6 +7,7 @@ import chromadb
 import json
 import os
 import regex as re
+import requests
 import traceback
 
 from abc import ABC, abstractmethod
@@ -363,6 +364,52 @@ class HFLlamaHttpRequestor(ArchitectureComponent):
         request.response = response
 
 
+class HFInferenceEndpoint(ArchitectureComponent):
+    """
+    A concrete pipeline component which sends the user text to a given Llama-chat-based
+    inference endpoint on Hugging Face
+    """
+    def __init__(self, endpoint_url: str, system_prompt: str, max_new_tokens: int, temperature: float = 1.0):
+        self.endpoint_url: str = endpoint_url
+        self.system_prompt: str = system_prompt
+        self.max_new_tokens = max_new_tokens
+        self.api_token = hf_api_token()
+        self.temperature = temperature
+
+    def config_description(self) -> str:
+        """
+        Custom config details as markdown
+        """
+        desc = f"Endpoint: {self.endpoint_url}; "
+        desc += f"Max new tokens: {self.max_new_tokens}; "
+        desc += f"Temperature: {self.temperature}; "
+        desc += f"System prompt: {self.system_prompt}"
+        return desc
+
+    def process_request(self, request: ArchitectureRequest) -> None:
+        """
+        Main processing method for this component. Posts the user text to the
+        configured Hugging Face inference endpoint and writes the generated
+        text into the response element of the request.
+        """
+        headers = {
+            "Accept": "application/json",
+            "Authorization": f"Bearer {self.api_token}",
+            "Content-Type": "application/json"
+        }
+        query_input = f"[INST] <<SYS>> {self.system_prompt} <</SYS>> {request.request} [/INST] "
+        payload = {
+            "inputs": query_input,
+            "parameters": {
+                "temperature": self.temperature,
+                "max_new_tokens": self.max_new_tokens
+            }
+        }
+        response = requests.post(self.endpoint_url, headers=headers, json=payload)
+        generated_text = response.json()[0]['generated_text'].strip()
+        request.response = generated_text
+
+
 class ResponseTrimmer(ArchitectureComponent):
     """
     A concrete pipeline component which trims the response based on a regex match,
@@ -384,4 +431,3 @@ class ResponseTrimmer(ArchitectureComponent):
 
     def config_description(self) -> str:
         return f"Regexes: {self.regex_display}"
-