Daniel Marques committed
Commit 0b74b4d
1 Parent(s): dca490a

fix: add websocket in handlerToken

Files changed (4)
  1. Dockerfile +7 -1
  2. main.py +10 -10
  3. prompt_template_utils.py +7 -8
  4. run.sh +1 -1
Dockerfile CHANGED
@@ -4,7 +4,6 @@
 FROM nvidia/cuda:12.1.1-devel-ubuntu22.04
 
 
-
 RUN apt-get update && apt-get upgrade -y \
     && apt-get install -y git build-essential libpq-dev gcc \
     wget ocl-icd-opencl-dev opencl-headers clinfo \
@@ -14,11 +13,14 @@ RUN apt-get update && apt-get upgrade -y \
 
 WORKDIR /app
 
+RUN mkdir "./cache"
+
 COPY . .
 
 # setting build related env vars
 ENV CUDA_DOCKER_ARCH=all
 ENV LLAMA_CUBLAS=1
+ENV TRANSFORMERS_CACHE="./cache"
 
 # Install depencencies
 RUN python -m pip install --upgrade pip pytest cmake \
@@ -29,6 +31,10 @@ RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
 RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 PIP_ROOT_USER_ACTION=ignore pip install --ignore-installed --timeout 100 -r requirements.txt
 RUN pip install uvicorn
 
+
+
+
+
 # RUN useradd -m -u 1000 user
 # USER user
 
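The two Dockerfile additions work together: RUN mkdir "./cache" creates a writable directory under the /app workdir, and ENV TRANSFORMERS_CACHE="./cache" points the Hugging Face transformers cache at it, so model downloads land inside the image's working directory rather than the default user cache. A minimal sketch of what the running container would see (the print is illustrative, not code from the repo):

    import os

    # WORKDIR is /app, so "./cache" resolves to /app/cache,
    # the directory created by the new RUN mkdir step.
    cache_dir = os.environ.get("TRANSFORMERS_CACHE", "~/.cache/huggingface")
    print(f"Hugging Face downloads will be cached under: {cache_dir}")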
main.py CHANGED
@@ -4,7 +4,6 @@ import shutil
 import subprocess
 import asyncio
 
-
 from typing import Any, Dict, List
 
 from fastapi import FastAPI, HTTPException, UploadFile, WebSocket, WebSocketDisconnect
@@ -20,6 +19,8 @@ from langchain.memory import ConversationBufferMemory
 from langchain.callbacks.base import BaseCallbackHandler
 from langchain.schema import LLMResult
 
+from prompt_template_utils import get_prompt_template
+
 # from langchain.embeddings import HuggingFaceEmbeddings
 from load_models import load_model
 
@@ -77,15 +78,13 @@ handlerToken = MyCustomSyncHandler()
 
 LLM = load_model(device_type=DEVICE_TYPE, model_id=MODEL_ID, model_basename=MODEL_BASENAME, stream=True, callbacks=[handlerToken])
 
-template = """You are a helpful, respectful and honest assistant.
-Always answer in the most helpful and safe way possible without trying to make up an answer, if you don't know the answer just say "I don't know" and don't share false information or topics that were not provided in your training. Use a maximum of 15 sentences. Your answer should be as concise and clear as possible. Always say "thank you for asking!" at the end of your answer.
-Context: {context}
-Question: {question}
-"""
-
-memory = ConversationBufferMemory(input_key="question", memory_key="history")
+# template = """You are a helpful, respectful and honest assistant.
+# Always answer in the most helpful and safe way possible without trying to make up an answer, if you don't know the answer just say "I don't know" and don't share false information or topics that were not provided in your training. Use a maximum of 15 sentences. Your answer should be as concise and clear as possible. Always say "thank you for asking!" at the end of your answer.
+# Context: {context}
+# Question: {question}
+# """
 
-QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"], template=template)
+prompt, memory = get_prompt_template(promptTemplate_type="llama", history=True)
 
 QA = RetrievalQA.from_chain_type(
     llm=LLM,
@@ -93,7 +92,8 @@ QA = RetrievalQA.from_chain_type(
     retriever=RETRIEVER,
     return_source_documents=SHOW_SOURCES,
     chain_type_kwargs={
-        "prompt": QA_CHAIN_PROMPT,
+        "prompt": prompt,
+        "memory": memory
     },
 )
 
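The commit title says the websocket is wired into handlerToken, but the diff only shows handlerToken being created and passed to load_model as a streaming callback. A minimal sketch of that pattern, assuming MyCustomSyncHandler hands each generated token to an asyncio.Queue that a FastAPI websocket endpoint drains (the queue, the /ws route and the endpoint body are assumptions, not code from this commit):

    import asyncio
    from typing import Any

    from fastapi import FastAPI, WebSocket
    from langchain.callbacks.base import BaseCallbackHandler

    app = FastAPI()
    token_queue: "asyncio.Queue[str]" = asyncio.Queue()


    class MyCustomSyncHandler(BaseCallbackHandler):
        def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
            # Called for every token the LLM streams; queue it for the websocket.
            token_queue.put_nowait(token)


    @app.websocket("/ws")
    async def stream_tokens(websocket: WebSocket) -> None:
        await websocket.accept()
        while True:
            await websocket.send_text(await token_queue.get())

The other main.py change is that chain_type_kwargs now receives both the prompt and the memory returned by get_prompt_template, replacing the module-level template string, ConversationBufferMemory and QA_CHAIN_PROMPT that were deleted.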
prompt_template_utils.py CHANGED
@@ -1,6 +1,6 @@
 """
-This file implements prompt template for llama based models.
-Modify the prompt template based on the model you select.
+This file implements prompt template for llama based models.
+Modify the prompt template based on the model you select.
 This seems to have significant impact on the output of the LLM.
 """
 
@@ -10,10 +10,9 @@ from langchain.prompts import PromptTemplate
 # this is specific to Llama-2.
 
 system_prompt = """You are a helpful assistant, you will use the provided context to answer user questions.
-Read the given context before answering questions and think step by step. If you can not answer a user question based on
+Read the given context before answering questions and think step by step. If you can not answer a user question based on
 the provided context, inform the user. Do not use any other information for answering user. Provide a detailed answer to the question."""
 
-
 def get_prompt_template(system_prompt=system_prompt, promptTemplate_type=None, history=False):
     if promptTemplate_type == "llama":
         B_INST, E_INST = "[INST]", "[/INST]"
@@ -40,7 +39,7 @@ def get_prompt_template(system_prompt=system_prompt, promptTemplate_type=None, h
             B_INST
             + system_prompt
             + """
-
+
             Context: {history} \n {context}
             User: {question}"""
             + E_INST
@@ -51,7 +50,7 @@ def get_prompt_template(system_prompt=system_prompt, promptTemplate_type=None, h
             B_INST
             + system_prompt
             + """
-
+
             Context: {context}
             User: {question}"""
             + E_INST
@@ -63,7 +62,7 @@ def get_prompt_template(system_prompt=system_prompt, promptTemplate_type=None, h
        prompt_template = (
            system_prompt
            + """
-
+
            Context: {history} \n {context}
            User: {question}
            Answer:"""
@@ -73,7 +72,7 @@ def get_prompt_template(system_prompt=system_prompt, promptTemplate_type=None, h
        prompt_template = (
            system_prompt
            + """
-
+
            Context: {context}
            User: {question}
            Answer:"""
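The changes here are whitespace and the restored docstring, but main.py now depends on the shape of get_prompt_template's return value. A usage sketch, assuming the function pairs the llama-style PromptTemplate with a ConversationBufferMemory keyed on "history" when history=True (the expected values are inferred from the templates above and the old main.py, not captured output):

    from prompt_template_utils import get_prompt_template

    # history=True selects the template that interpolates {history}, {context}
    # and {question}, plus a memory object for RetrievalQA to fill in.
    prompt, memory = get_prompt_template(promptTemplate_type="llama", history=True)
    print(prompt.input_variables)  # expected: ['history', 'context', 'question']
    print(memory.memory_key)       # expected: 'history'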
run.sh CHANGED
@@ -1,5 +1,5 @@
 redis-cli --version
 
-sudo service redis-server start
+service redis-server start
 
 uvicorn "main:app" --port 7860 --host 0.0.0.0
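run.sh now starts redis-server without sudo, the usual form inside a container that already runs as root. If the application talks to that local instance (an assumption; the script only shows the service being started), a quick connectivity check with the redis-py client would look like:

    import redis

    client = redis.Redis(host="localhost", port=6379)
    client.ping()  # raises redis.exceptions.ConnectionError if redis-server is not running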