AI-Interface_memRAG

Runtime error

App Files Files Community

AFischer1985 commited on Feb 22, 2024

Commit

8691fee

verified ·

1 Parent(s): 8d8b439

Update run.py

Browse files

Files changed (1) hide show

run.py +292 -74

run.py CHANGED Viewed

@@ -1,19 +1,38 @@
-#############################################################################################################
-# Title:  Gradio Interface to LLM-chatbot (for recommending AI) with RAG-funcionality and ChromaDB on HF-Hub
 # Author: Andreas Fischer
-# Date:   December 30th, 2023
-# Last update: January 2nd, 2023
-##############################################################################################################
 # Chroma-DB
 #-----------
 import os
 import chromadb
-dbPath="/home/af/Schreibtisch/gradio/Chroma/db"
-if(os.path.exists(dbPath)==False):
-  dbPath="/home/user/app/db"
 print(dbPath)
 #client = chromadb.Client()
 path=dbPath
 client = chromadb.PersistentClient(path=path)
@@ -22,69 +41,213 @@ print(client.get_version())
 print(client.list_collections())
 from chromadb.utils import embedding_functions
 default_ef = embedding_functions.DefaultEmbeddingFunction()
-sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="T-Systems-onsite/cross-en-de-roberta-sentence-transformer")
 #instructor_ef = embedding_functions.InstructorEmbeddingFunction(model_name="hkunlp/instructor-large", device="cuda")
 print(str(client.list_collections()))
 global collection
-if("name=ChromaDB1" in str(client.list_collections())):
-  print("ChromaDB1 found!")
-  collection = client.get_collection(name="ChromaDB1", embedding_function=sentence_transformer_ef)
 else:
-  print("ChromaDB1 created!")
   collection = client.create_collection(
-    "ChromaDB1",
-    embedding_function=sentence_transformer_ef,
     metadata={"hnsw:space": "cosine"})
   collection.add(
-    documents=[
-      "Text generating AI model mistralai/Mixtral-8x7B-Instruct-v0.1: Suitable for text generation, e.g., social media content, marketing copy, blog posts, short stories, etc.",
-      "Image generating AI model stabilityai/sdxl-turbo: Suitable for image generation, e.g., illustrations, graphics, AI art, etc.",
-      "Audio transcribing AI model openai/whisper-large-v3: Suitable for audio-transcription in different languages",
-      "Speech synthesizing AI model coqui/XTTS-v2: Suitable for generating audio from text and for voice-cloning",
-      "Code generating AI model deepseek-ai/deepseek-coder-6.7b-instruct: Suitable for programming in Python, JavaScript, PHP, Bash and many other programming languages.",
-      "Translation AI model Helsinki-NLP/opus-mt: Suitable for translating text, e.g., from English to German or vice versa",
-      "Search result-integrating AI model phind/phind-v9-model: Suitable for researching current topics and for obtaining precise and up-to-date answers to questions based on web search results"
     ],
-    metadatas=[{"source": "AF"}, {"source": "AF"}, {"source": "AF"}, {"source": "AF"}, {"source": "AF"}, {"source": "AF"}, {"source": "AF"}],
-    ids=["ai1", "ai2", "ai3", "ai4", "ai5", "ai6", "ai7"],
   )
-print("Database ready!")
-print(collection.count())
 # Model
 #-------
-from huggingface_hub import InferenceClient
-import gradio as gr
-client = InferenceClient(
-    "mistralai/Mixtral-8x7B-Instruct-v0.1"
     #"mistralai/Mistral-7B-Instruct-v0.1"
-)
 # Gradio-GUI
 #------------
 import gradio as gr
 import json
-def format_prompt(message, history):
-  prompt = "<s>"
-  #for user_prompt, bot_response in history:
-  #  prompt += f"[INST] {user_prompt} [/INST]"
-  #  prompt += f" {bot_response}</s> "
-  prompt += f"[INST] {message} [/INST]"
-  return prompt
-def response(
-    prompt, history, temperature=0.9, max_new_tokens=500, top_p=0.95, repetition_penalty=1.0,
-):
-    temperature = float(temperature)
     if temperature < 1e-2: temperature = 1e-2
     top_p = float(top_p)
     generate_kwargs = dict(
@@ -95,31 +258,86 @@ def response(
         do_sample=True,
         seed=42,
     )
-    addon=""
-    results=collection.query(
-      query_texts=[prompt],
-      n_results=2,
-      #where={"source": "google-docs"}
-      #where_document={"$contains":"search_string"}
-    )
-    dists=["<br><small>(relevance: "+str(round((1-d)*100)/100)+";" for d in results['distances'][0]]
-    sources=["source: "+s["source"]+")</small>" for s in results['metadatas'][0]]
-    results=results['documents'][0]
-    combination = zip(results,dists,sources)
-    combination = [' '.join(triplets) for triplets in combination]
-    print(combination)
-    if(len(results)>1):
-      addon=" Bitte berücksichtige bei deiner Antwort ggf. folgende Auszüge aus unserer Datenbank, sofern sie für die Antwort erforderlich sind. Beantworte die Frage knapp und präzise. Ignoriere unpassende Datenbank-Auszüge OHNE sie zu kommentieren, zu erwähnen oder aufzulisten:\n"+"\n".join(results)
-    system="Du bist ein deutschsprachiges KI-basiertes Assistenzsystem, das zu jedem Anliegen möglichst geeignete KI-Tools empfiehlt."+addon+"\n\nUser-Anliegen:"
-    #body={"prompt":system+"### Instruktion:\n"+message+"\n\n### Antwort:","max_tokens":500, "echo":"False","stream":"True"} #e.g. SauerkrautLM
-    formatted_prompt = format_prompt(system+"\n"+prompt, history)
-    stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
-    output = ""
-    for response in stream:
-        output += response.token.text
-        yield output
-    output=output+"\n\n<br><details open><summary><strong>Sources</strong></summary><br><ul>"+ "".join(["<li>" + s + "</li>" for s in combination])+"</ul></details>"
-    yield output
-gr.ChatInterface(response, chatbot=gr.Chatbot(value=[[None,"Herzlich willkommen! Ich bin ein KI-basiertes Assistenzsystem, das für jede Anfrage die am besten geeigneten KI-Tools empfiehlt.<br>Aktuell bin ich wenig mehr als eine Tech-Demo und kenne nur 7 KI-Modelle - also sei bitte nicht zu streng mit mir.<br>Was ist dein Anliegen?"]],render_markdown=True),title="German AI-RAG-Interface to the Hugging Face Hub").queue().launch(share=True) #False, server_name="0.0.0.0", server_port=7864)
-print("Interface up and running!")

+#########################################################################################
+# Title:  Gradio Interface to LLM-chatbot with memory RAG on premises
 # Author: Andreas Fischer
+# Date:   October 15th, 2023
+# Last update: February 22nd, 2024
+##########################################################################################
+#https://github.com/abetlen/llama-cpp-python/issues/306
+#sudo apt install libclblast-dev
+#CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir -v
+# Prepare resources
+#-------------------
+import torch
+import gc
+# Release cached CUDA memory and run a garbage-collection pass before anything is loaded.
+torch.cuda.empty_cache()
+gc.collect()
+import os
+from datetime import datetime
+# NOTE(review): `global` at module level has no effect; `filename` is a module-level name either way.
+global filename
+filename=f"./{datetime.now().strftime('%Y%m%d')}_history.json" # where to store the history as json-file
+# A history file with today's date that already exists at start-up is deleted, so every run begins with an empty history.
+if(os.path.exists(filename)==True): os.remove(filename)
 # Chroma-DB
 #-----------
 import os
 import chromadb
+# The existence of the developer's local database directory doubles as the
+# on-prem switch: if it is missing, the app falls back to the path below
+# (presumably the Hugging Face Space layout - confirm).
+dbPath = "/home/af/Schreibtisch/Code/gradio/Chroma/db"
+onPrem = True if(os.path.exists(dbPath)) else False
+if(onPrem==False): dbPath="/home/user/app/db"
+#onPrem=False # override automatic detection
 print(dbPath)
 #client = chromadb.Client()
 path=dbPath
 client = chromadb.PersistentClient(path=path)
 print(client.list_collections())
 from chromadb.utils import embedding_functions
 default_ef = embedding_functions.DefaultEmbeddingFunction()
+#sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="T-Systems-onsite/cross-en-de-roberta-sentence-transformer")
 #instructor_ef = embedding_functions.InstructorEmbeddingFunction(model_name="hkunlp/instructor-large", device="cuda")
+# Sentence-transformer embedding function; runs on the GPU on prem and on the CPU otherwise.
+embeddingModel = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="T-Systems-onsite/cross-en-de-roberta-sentence-transformer", device="cuda" if(onPrem) else "cpu")
 print(str(client.list_collections()))
 global collection
+dbName="historicalChromaDB1"
+# An existing collection of this name is dropped first, so the "found!" branch
+# below is normally skipped and every start creates a fresh, empty collection.
+if("name="+dbName in str(client.list_collections())): client.delete_collection(name=dbName) # deletes collection
+if("name="+dbName in str(client.list_collections())):
+  print(dbName+" found!")
+  collection = client.get_collection(name=dbName, embedding_function=embeddingModel) #sentence_transformer_ef)
 else:
+  #client.delete_collection(name=dbName)
+  print(dbName+" created!")
   collection = client.create_collection(
+    dbName,
+    embedding_function=embeddingModel,
     metadata={"hnsw:space": "cosine"})
+print("Database ready!")
+print(collection.count())
+# Seed an empty collection with one greeting exchange. Both documents carry
+# the whole exchange in their "dialog" metadata.
+x=collection.get(include=[])["ids"]
+if(len(x)==0):
+  message="Ich bin der User."
+  response="Hallo User, wie kann ich dienen?"
   collection.add(
+    documents=[message,response],
+    metadatas=[
+      {"source": "ICH", "dialog": f"ICH: {message}\nDU: {response}"},
+      {"source": "DU",  "dialog": f"ICH: {message}\nDU: {response}"}
     ],
+    ids=[str(len(x)+1),str(len(x)+2)]
+  )
+  RAGResults=collection.query(
+      query_texts=[message],
+      n_results=1,
+      #where={"source": "USER"}
   )
+  # Smoke test: raises right here if the seeded dialog cannot be retrieved.
+  RAGResults["metadatas"][0][0]["dialog"]
+# The result of get() is a dict and has to be indexed one key at a time;
+# ["ids","documents"] would look up a tuple key and raise KeyError at start-up.
+# The ids are part of the result in addition to the fields named in `include`.
+snapshot=collection.get(include=["documents"])
+x=snapshot["ids"]
 # Model
 #-------
+#onPrem=False
+# Select the text-generation backend:
+#   not on prem -> Hugging Face inference API (InferenceClient)
+#   on prem     -> local llama_cpp.server subprocess serving a GGUF file on port 2600
+if(onPrem==False):
+  modelPath="mistralai/Mixtral-8x7B-Instruct-v0.1"
+  from huggingface_hub import InferenceClient
+  import gradio as gr
+  # NOTE(review): this rebinds `client`, which referred to the Chroma client until here.
+  client = InferenceClient(
+    modelPath
+    #"mistralai/Mixtral-8x7B-Instruct-v0.1"
     #"mistralai/Mistral-7B-Instruct-v0.1"
+  )
+else:
+  import os
+  import requests
+  import subprocess
+  ##modelPath="/home/af/gguf/models/phi-2.Q4_0.gguf"
+  #modelPath="/home/af/gguf/models/openchat-3.5-0106.Q4_0.gguf"
+  #modelPath="/home/af/gguf/models/decilm-7b-uniform-gqa-q8_0.gguf"
+  #modelPath="/home/af/gguf/models/wizardlm-13b-v1.2.Q4_0.gguf"
+  #modelPath="/home/af/gguf/models/SauerkrautLM-7b-HerO-q8_0.gguf"
+  #modelPath="/home/af/gguf/models/gemma-2b-it-Q4_0.gguf"
+  # Only the last assignment takes effect; the earlier ones are kept as quick alternatives.
+  modelPath="/home/af/gguf/models/discolm_german_7b_v1.Q4_0.gguf"
+  modelPath="/home/af/gguf/models/gemma-7b-it-Q4_K_M.gguf"
+  modelPath="/home/af/gguf/models/gemma-7b-it-Q4_0.gguf"
+  #modelPath="/home/af/gguf/models/sauerkrautlm-una-solar-instruct.Q4_0.gguf"
+  #modelPath="/home/af/gguf/models/mixtral-8x7b-instruct-v0.1.Q4_0.gguf"
+  #modelPath="/home/af/gguf/models/dolphin-2.5-mixtral-8x7b.Q4_0.gguf"
+  #modelPath="/home/af/gguf/models/nous-hermes-2-mixtral-8x7b-dpo.Q4_0.gguf"
+  if(os.path.exists(modelPath)==False):
+    #url="https://huggingface.co/TheBloke/WizardLM-13B-V1.2-GGUF/resolve/main/wizardlm-13b-v1.2.Q4_0.gguf"
+    #url="https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/resolve/main/mixtral-8x7b-instruct-v0.1.Q4_0.gguf?download=true"
+    #url="https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_0.gguf?download=true"
+    url="https://huggingface.co/TheBloke/DiscoLM_German_7b_v1-GGUF/resolve/main/discolm_german_7b_v1.Q4_0.gguf?download=true"
+    # Keep the model's own file name (URL basename without the query string):
+    # prompt templates and stop tokens are chosen by searching modelPath for the
+    # model name, which a generic "./model.gguf" would never match.
+    modelPath="./"+url.split("/")[-1].split("?")[0]
+    if(os.path.exists(modelPath)==False): # reuse a file downloaded by an earlier run
+      # Stream to disk in chunks instead of holding the whole file in memory,
+      # and fail loudly on an HTTP error instead of saving an error page as model.
+      with requests.get(url, stream=True) as download:
+        download.raise_for_status()
+        with open(modelPath+".part", mode="wb") as file:
+          for chunk in download.iter_content(chunk_size=1024*1024):
+            file.write(chunk)
+      os.replace(modelPath+".part", modelPath) # only a complete download gets the final name
+      print("Model downloaded")
+  print(modelPath)
+  n="20" # value passed to --n_gpu_layers
+  if("mixtral-8x7b-instruct" in modelPath): n="0" # mixtral seems to cause problems here...
+  command = ["python3", "-m", "llama_cpp.server", "--model", modelPath, "--host", "0.0.0.0", "--port", "2600", "--n_threads", "8", "--n_gpu_layers", n]
+  # NOTE(review): Popen returns as soon as the process is spawned; the message
+  # below does not prove that the server is already accepting requests.
+  subprocess.Popen(command)
+  print("Server ready!")
+#import llama_cpp
+#llama_cpp.llama_backend_init(numa=False)
+#params=llama_cpp.llama_context_default_params()
+#params.n_ctx
 # Gradio-GUI
 #------------
+def extend_prompt(message="", history=None, system=None, RAGAddon=None, system2=None, zeichenlimit=None,historylimit=4): #float("Inf")
+  """Assemble the complete prompt string in the chat format of the active model.
+
+  The format is chosen by searching the module-level `modelPath` for known
+  model-name fragments (case-insensitively, so lowercase GGUF file names match
+  too). If several fragments match, the one listed last wins; unknown models
+  use the [INST] format.
+
+  Args:
+    message: current user message; skipped when None.
+    history: list of (user_message, bot_response) pairs; only the last
+      `historylimit` pairs are rendered. None entries inside a pair are skipped.
+    system: system prompt; rendered first when not None.
+    RAGAddon: text appended to the system prompt (used as system prompt when
+      `system` is None).
+    system2: raw text appended after the final user turn.
+    zeichenlimit: maximum number of characters kept per message/response;
+      None means practically unlimited.
+    historylimit: number of most recent history pairs to include.
+
+  Returns:
+    The prompt as a single string.
+  """
+  if zeichenlimit is None: zeichenlimit=1000000000 # :-)
+  # Each entry: (system template, user-turn template, response template)
+  instFormat=("[INST] {system} [/INST]</s>", "[INST] {message} [/INST] ", "{response}</s>") #<s>?
+  chatmlFormat=("<|im_start|>system\n{system}<|im_end|>\n", "<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n", "{response}<|im_end|>\n")
+  knownFormats=[
+    ("discolm_german_7b", chatmlFormat), #https://huggingface.co/DiscoResearch/DiscoLM_German_7b_v1
+    ("mixtral-8x7b-instruct", instFormat), #https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
+    # NOTE(review): these markers ("</end_of_turn>", no line breaks) differ from the
+    # turn format in the Gemma model card; left unchanged because the stop tokens
+    # used when requesting the completion are tuned to this exact spelling.
+    ("gemma-", ("<start_of_turn>user{system}</end_of_turn>", "<start_of_turn>user{message}</end_of_turn><start_of_turn>model", "{response}</end_of_turn>")),
+    ("Mistral-7B-Instruct", instFormat), #https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2
+    ("openchat-3.5", ("GPT4 Correct User: {system}<|end_of_turn|>GPT4 Correct Assistant: Okay.<|end_of_turn|>", "GPT4 Correct User: {message}<|end_of_turn|>GPT4 Correct Assistant: ", "{response}<|end_of_turn|>")), #https://huggingface.co/TheBloke/openchat-3.5-0106-GGUF
+    ("SauerkrautLM-7b-HerO", chatmlFormat), #https://huggingface.co/VAGOsolutions/SauerkrautLM-7b-HerO
+    ("WizardLM-13B-V1.2", ("{system} ", "USER: {message} ASSISTANT: ", "{response}</s>")), #https://huggingface.co/WizardLM/WizardLM-13B-V1.2
+    ("phi-2", ("Instruct: {system}\nOutput: Okay.\n", "Instruct: {message}\nOutput:", "{response}\n")), #https://huggingface.co/TheBloke/phi-2-GGUF
+  ]
+  template0, template1, template2 = instFormat
+  modelName=modelPath.lower()
+  for fragment, templates in knownFormats: # no break: a later match overrides an earlier one
+    if fragment.lower() in modelName: template0, template1, template2 = templates
+  prompt = ""
+  if RAGAddon is not None:
+    # A missing system prompt must not turn the add-on into a TypeError (None + str).
+    system = RAGAddon if system is None else system + RAGAddon
+  if system is not None:
+    prompt += template0.format(system=system) #"<s>"
+  if history is not None:
+    for user_message, bot_response in history[-historylimit:]:
+      if user_message is not None: prompt += template1.format(message=user_message[:zeichenlimit])  #"[INST] {user_prompt} [/INST] "
+      if bot_response is not None: prompt += template2.format(response=bot_response[:zeichenlimit]) #"{bot_response}</s> "
+  if message is not None: prompt += template1.format(message=message[:zeichenlimit])                #"[INST] {message} [/INST]"
+  if system2 is not None:
+    prompt += system2
+  return prompt
 import gradio as gr
+import requests
 import json
+from datetime import datetime
+import os
+import re
+def response(message, history,customSysPrompt,settings):
+  """Chat callback: build the prompt, stream the model's answer, optionally persist the dialog.
+
+  Args:
+    message: current user input.
+    history: list of earlier (user, bot) pairs; appended to after generation.
+    customSysPrompt: system prompt; the built-in role-play prompt is used when None.
+    settings: "Permanent" loads the history from / stores it in the JSON file and
+      adds each exchange to the Chroma collection; any other value skips that.
+
+  Yields:
+    The answer text accumulated so far, once per received chunk.
+
+  NOTE(review): in this diff view the keyword arguments between
+  `generate_kwargs = dict(` and `do_sample=True` are hidden by a hunk boundary.
+  """
+  #print(str(history)) # print history
+  #system="Du bist ein KI-basierter Assistent."
+  system="Lass uns ein Rollenspiel spielen. Wir spielen Shadowrun. Du bist der Spielleiter und sprichst Deutsch." if customSysPrompt is None else customSysPrompt
+  # Remove chat-template markers from the user text before it is embedded in the prompt.
+  message=message.replace("[INST]","")
+  message=message.replace("[/INST]","")
+  message=re.sub("<[|](im_start|im_end|end_of_turn)[|]>", '', message)
+  if (settings=="Permanent"):
+    # NOTE(review): `&` is a bitwise AND of two booleans here, and the handle returned by open() is never closed explicitly.
+    if((len(history)==0)&(os.path.isfile(filename))): history=json.load(open(filename,'r',encoding="utf-8")) # retrieve history (if available)
+  x=collection.get(include=[])["ids"]
+  rag=None # RAG is turned off until history gets too long
+  historylimit=4
+  if(len(x)>(historylimit*2)): # turn on RAG when the database contains entries that are not shown within historylimit
+    # Fetch the single stored entry closest to the current message and quote its whole dialog.
+    RAGResults=collection.query(
+      query_texts=[message],
+      n_results=1,
+      #where={"source": "USER"}
+    )
+    bestMatch=str(RAGResults["metadatas"][0][0]["dialog"])
+    #print("Message: "+message+"\n\nBest Match: "+bestMatch)
+    rag="\n\n"
+    rag += "Mit Blick auf den aktuellen Stand der Session erinnerst du dich insb. an folgende Episode:\n"
+    rag += bestMatch
+    rag += "\n\nIm Folgenden siehst du den aktuellen Stand der Session."
+    rag += "Bitte beschreibe kurz den weiteren Verlauf bis zur nächsten Handlung des Spielers!"
+  else:
+    system += "\nBitte beschreibe kurz den weiteren Verlauf bis zur nächsten Handlung des Spielers!"
+  system2=None # system2 can be used as fictive first words of the AI, which are not displayed or stored
+  #print("RAG: "+rag)
+  #print("System: "+system+"\n\nMessage: "+message)
+  prompt=extend_prompt(message,history,system,rag,system2,historylimit=historylimit)
+  print("\n\n*** Prompt:\n"+prompt+"\n***\n\n")
+  ## Request response from model
+  #------------------------------
+  print("AI running on prem!" if(onPrem) else "AI running HFHub!")
+  if(onPrem==False):
+    # Hosted path: token stream from the Hugging Face inference client.
+    temperature=float(0.9)
+    max_new_tokens=500
+    top_p=0.95
+    repetition_penalty=1.0
     if temperature < 1e-2: temperature = 1e-2
     top_p = float(top_p)
     generate_kwargs = dict(
         do_sample=True,
         seed=42,
     )
+    stream = client.text_generation(prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
+    response = ""
+    #print("User: "+message+"\nAI: ")
+    for text in stream:
+        part=text.token.text
+        #print(part, end="", flush=True)
+        response += part
+        yield response
+    history.append((message, response)) # add current dialog to history
+    # Store current state in DB if settings=="Permanent"
+    if (settings=="Permanent"):
+      # NOTE(review): new ids are derived from the current number of entries (count+1, count+2).
+      x=collection.get(include=[])["ids"] # add current dialog to db
+      collection.add(
+        documents=[message,response],
+        metadatas=[
+          { "source": "ICH", "dialog": f"ICH: {message.strip()}\n DU: {response.strip()}", "type":"episode"},
+          { "source": "DU",  "dialog": f"ICH: {message.strip()}\n DU: {response.strip()}", "type":"episode"}
+        ],
+        ids=[str(len(x)+1),str(len(x)+2)]
+      )
+      json.dump(history,open(filename,'w',encoding="utf-8"),ensure_ascii=False)
+  if(onPrem==True):
+    # Local path: streamed completion from the server on port 2600.
+    # url="https://afischer1985-wizardlm-13b-v1-2-q4-0-gguf.hf.space/v1/completions"
+    url="http://0.0.0.0:2600/v1/completions"
+    body={"prompt":prompt,"max_tokens":None, "echo":"False","stream":"True"}            # e.g. Mixtral-Instruct
+    if("discolm_german_7b" in modelPath): body.update({"stop": ["<|im_end|>"]})         # fix stop-token of DiscoLM
+    if("gemma-" in modelPath): body.update({"stop": ["<|im_end|>","</end_of_turn>"]})   # fix stop-token of Gemma
+    response="" #+"("+myType+")\n"
+    buffer=""
+    #print("URL: "+url)
+    #print("User: "+message+"\nAI: ")
+    # Each loop item is a byte chunk of the HTTP response; chunks are collected in
+    # `buffer` until it contains the marker that ends an event's JSON object.
+    for text in requests.post(url, json=body, stream=True):  #-H 'accept: application/json' -H 'Content-Type: application/json'
+      if buffer is None: buffer=""
+      buffer=str("".join(buffer))
+      # print("*** Raw String: "+str(text)+"\n***\n")
+      text=text.decode('utf-8')
+      # Skip ": ping -" lines and chunks consisting only of line breaks.
+      if((text.startswith(": ping -")==False) & (len(text.strip("\n\r"))>0)): buffer=buffer+str(text)
+      # print("\n*** Buffer: "+str(buffer)+"\n***\n")
+      buffer=buffer.split('"finish_reason": null}]}')
+      if(len(buffer)==1):
+        # Marker not seen yet: keep collecting.
+        buffer="".join(buffer)
+        pass
+      if(len(buffer)==2):
+        # One complete event: restore the marker, drop the "data: " prefix and parse the JSON.
+        part=buffer[0]+'"finish_reason": null}]}'
+        if(part.lstrip('\n\r').startswith("data: ")): part=part.lstrip('\n\r').replace("data: ", "")
+        try:
+          part = str(json.loads(part)["choices"][0]["text"])
+          #print(part, end="", flush=True)
+          response=response+part
+          # NOTE(review): this also discards buffer[1], i.e. whatever followed the marker in this chunk.
+          buffer="" # reset buffer
+        except Exception as e:
+          print("Exception:"+str(e))
+          pass
+      # NOTE(review): with more than one marker in the buffer neither branch above runs.
+      yield response
+    history.append((message, response)) # add current dialog to history
+    # Store current state in DB if settings=="Permanent"
+    if (settings=="Permanent"):
+      x=collection.get(include=[])["ids"] # add current dialog to db
+      collection.add(
+        documents=[message,response],
+        metadatas=[
+          { "source": "ICH", "dialog": f"ICH: {message.strip()}\n DU: {response.strip()}", "type":"episode"},
+          { "source": "DU",  "dialog": f"ICH: {message.strip()}\n DU: {response.strip()}", "type":"episode"}
+        ],
+        ids=[str(len(x)+1),str(len(x)+2)]
+      )
+      json.dump(history,open(filename,'w',encoding="utf-8"),ensure_ascii=False)
+# Build and start the chat UI. The two additional inputs are handed to
+# response() as its third and fourth argument (system prompt, storage setting).
+gr.ChatInterface(
+  response,
+  chatbot=gr.Chatbot(render_markdown=True),
+  title="AI-Interface (on prem)" if onPrem else "AI-Interface (HFHub)",
+  additional_inputs=[
+    gr.Textbox(value="Lass uns ein Rollenspiel spielen. Wir spielen Shadowrun. Du bist der Spielleiter und sprichst Deutsch.",label="System Prompt"),
+    # The default has to be one of the listed choices ("Temorär" was a typo).
+    gr.Dropdown(["Permanent","Temporär"],value="Temporär",label="Dialog speichern?")
+  ]
+  ).queue().launch(share=True) #False, server_name="0.0.0.0", server_port=7864)
+print("Interface up and running!")