import os
from huggingface_hub import InferenceClient
import gradio as gr
from gradio_client import Client
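
# Retrieval-augmented chatbot over the Hugging Face docs: relevant doc excerpts
# are fetched from the huggingchat/hf-docs Space and passed to Mixtral as context.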


model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
inference_client = InferenceClient(model_id, token=os.environ["HF_TOKEN"])
docs_embeddings_space_id = "huggingchat/hf-docs"
gradio_client = Client(docs_embeddings_space_id)


SYSTEM_PROMPT = "You are a Hugging Face AI expert. Use the provided context to answer user questions. If the request is not related to the Hugging Face Hub or Hugging Face open source libraries, you MUST respond with: \"I can only chat about Hugging Face\" and STOP answering."  # from https://huggingface.co/chat/settings/assistants/65f33e95d854946bb3f88dde


def generate(prompt, history):
    try:
        # step 1: get relevant docs excerpts
        rag_content, sourced_md = gradio_client.predict(
                query_text=prompt,
                output_option="RAG-friendly",
                api_name="/predict"
        )
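        # rag_content: docs excerpts to insert into the prompt; sourced_md: markdown listing the sources (shown under each reply)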
        
        # step 2: generate the answer with the retrieved context
        processed_prompt = f'''Answer the question: "{prompt}"

Here are relevant extracts from the docs that you can use to generate the answer:
=====================
{rag_content}
====================='''

        messages = [{"role": "system", "content": SYSTEM_PROMPT}]
        for user_msg, assistant_msg in history:
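            # drop the "\n\nsources: ..." footer we append to replies so it isn't fed back as history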
            assistant_msg = assistant_msg.split("\n\nsources:")[0]
            messages.extend([{"role": "user", "content": user_msg}, {"role": "assistant", "content": assistant_msg}])

        messages.append({"role": "user", "content": processed_prompt})

        generate_kwargs = dict(
            temperature=0.6,
            max_tokens=8192,
            top_p=0.95,
        )

        output = ""

        for chunk in inference_client.chat_completion(messages, stream=True, **generate_kwargs):
            # delta.content can be None (e.g. on the final chunk), so default to an empty string
            new_content = chunk.choices[0].delta.content or ""
            output += new_content
            yield output + f"\n\nsources: {sourced_md}"

        return output + f"\n\nsources: {sourced_md}"
    except Exception as e:
        raise gr.Error(str(e))



examples = ["How do I upload a model?",
            "Can I change the color of my Space?",
            "How do I finetune Stable Diffusion with LoRA?",
            "How do I run a model found on the Hugging Face Hub?"]



demo = gr.ChatInterface(
    fn=generate,
    chatbot=gr.Chatbot(show_label=False, show_share_button=False, show_copy_button=True, likeable=True, layout="panel"),
    title="HF Docs Bot 🤗",
    examples=examples,
    concurrency_limit=400,
    stop_btn=None,
    retry_btn=None,
    undo_btn=None,
    clear_btn=None,
    cache_examples=False
)

demo.launch(show_api=False)