File size: 3,155 Bytes
8f22239
39043d6
751c1e6
39a25f7
da994cb
818675e
14f05b9
8278b34
 
751c1e6
 
 
251281a
818675e
50734e1
39043d6
251281a
39043d6
251281a
 
 
 
 
 
 
3be34c2
b0fc8f9
14f05b9
 
5ed96b5
251281a
 
 
 
 
b0fc8f9
 
 
251281a
b0fc8f9
 
251281a
 
 
 
 
 
 
 
 
 
 
 
b0fc8f9
 
251281a
 
 
9eedb54
251281a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import json
import subprocess
import requests
from llama_cpp import Llama
import gradio as gr

# Model to serve. Alternative GGUF builds that were used with this script:
#url="https://huggingface.co/TheBloke/WizardLM-13B-V1.2-GGUF/resolve/main/wizardlm-13b-v1.2.Q4_0.gguf"
#url="https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/resolve/main/mixtral-8x7b-instruct-v0.1.Q4_0.gguf?download=true"
url="https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_0.gguf?download=true"

# Stream the file to disk in 1 MiB pieces instead of holding the whole
# multi-gigabyte model in memory, and fail loudly on an HTTP error so that an
# error page is never saved as ./model.gguf.
with requests.get(url, stream=True, timeout=60) as download:
  download.raise_for_status()
  with open("./model.gguf", mode="wb") as file:
    for chunk in download.iter_content(chunk_size=1024 * 1024):
      file.write(chunk)
print("Model downloaded")

# Launch the llama.cpp OpenAI-compatible HTTP server as a background process.
command = ["python3", "-m", "llama_cpp.server", "--model", "./model.gguf", "--host", "0.0.0.0", "--port", "2600", "--n_threads", "2"]
server = subprocess.Popen(command)

# Popen returns immediately, before the server has opened its port, so wait
# until it answers HTTP before announcing readiness. Any HTTP response (even an
# error status) proves the port is open. Roughly 600 attempts, one per second.
for _ in range(600):
  try:
    requests.get("http://127.0.0.1:2600/v1/models", timeout=5)
  except requests.exceptions.RequestException:
    pass  # not listening yet
  else:
    print("Model ready!")
    break
  try:
    # Doubles as the one-second delay between attempts and as a liveness
    # check: wait() only returns if the server process has terminated.
    exit_code = server.wait(timeout=1)
  except subprocess.TimeoutExpired:
    continue
  raise RuntimeError("llama_cpp.server exited with code "+str(exit_code)+" before accepting requests")
else:
  # Best effort: keep going so the UI still starts; requests may fail until
  # the server finishes loading.
  print("Model server not reachable yet; continuing anyway")

#llm = Llama(model_path="./model.gguf")
#def response(input_text, history):
#    output = llm(f"Q: {input_text} A:", max_tokens=256, stop=["Q:", "\n"], echo=True)
#    return output['choices'][0]['text']

def _completion_text_from_sse_line(raw_line):
  """Return the generated text carried by one server-sent-event line.

  Args:
    raw_line: One line of the streamed HTTP body, as bytes.

  Returns:
    The ``choices[0]["text"]`` value of the JSON payload as a string, or
    ``None`` when the line carries no completion text (blank separator,
    ``: ping`` keep-alive comment, ``[DONE]`` marker, or an unparsable payload).
  """
  line=raw_line.decode("utf-8", errors="replace").strip()
  if not line.startswith("data:"):
    return None
  payload=line[len("data:"):].strip()
  if not payload or payload=="[DONE]":
    return None
  try:
    return str(json.loads(payload)["choices"][0]["text"])
  except (ValueError, KeyError, IndexError, TypeError) as e:
    # Keep streaming on a malformed event instead of aborting the whole answer.
    print("Exception:"+str(e))
    return None


def response(message, history):
  """Stream the model's answer to *message* from the local completion server.

  Sends the message, wrapped in the ``[INST]...[/INST]`` instruct template, to
  the completions endpoint on port 2600 and yields the answer as it grows, so
  the chat UI can render it incrementally. The text is also echoed to stdout.

  Args:
    message: The user's latest chat message.
    history: Previous chat turns as passed by the chat UI. Not used: only the
      current message is put into the prompt.

  Yields:
    The accumulated answer text after each received completion event, and once
    more when the stream ends.

  Raises:
    requests.exceptions.RequestException: If the server cannot be reached.
  """
  #url="https://afischer1985-wizardlm-13b-v1-2-q4-0-gguf.hf.space/v1/completions"
  url="http://0.0.0.0:2600/v1/completions"
  # Prompt template for Mistral/Mixtral-Instruct models. Other model families
  # need a different template, e.g. an Alpaca-style "### Instruction: ...
  # ### Response:" prompt (WizardLM, SauerkrautLM) or a Vicuna-style
  # "USER: ... ASSISTANT:" prompt.
  body={"prompt":"[INST]"+message+"[/INST]","max_tokens":500, "echo":False,"stream":True}
  answer=""
  print("URL: "+url)
  print("User: "+message+"\nAI: ")
  # The context manager releases the connection even if the consumer stops
  # iterating early.
  with requests.post(url, json=body, stream=True) as reply:
    # iter_lines() reassembles complete lines regardless of how the transport
    # splits the byte stream, so no event is lost or cut in half.
    for raw_line in reply.iter_lines():
      part=_completion_text_from_sse_line(raw_line)
      if part is None:
        continue
      print(part, end="", flush=True)
      answer=answer+part
      yield answer
  # Always produce at least one value, even if the server sent no events.
  yield answer

# Build the chat UI around the streaming `response` generator, enable request
# queuing, and serve it with a public share link.
chat_ui = gr.ChatInterface(response, title="Mistral-7B-Instruct-v0.2-GGUF Chatbot")
chat_ui.queue().launch(share=True) #False, server_name="0.0.0.0", server_port=7864)
# NOTE(review): if launch() blocks until the server stops (believed to be the
# case when run as a script), this message only appears on shutdown - confirm.
print("Interface up and running!")