from langchain.embeddings import TensorflowHubEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from tqdm import tqdm
import pandas as pd
import gradio as gr
import datetime
import openai
import praw
import os
import re
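
# Expected environment variables (used as fallbacks when the UI fields are left blank):
#   OPENAI_API_KEY, REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT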

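# Universal Sentence Encoder embeds each text as a 512-dimensional vector for local similarity search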
embs = TensorflowHubEmbeddings(model_url="https://tfhub.dev/google/universal-sentence-encoder/4")

def set_openai_key(key):
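    """Set the OpenAI API key globally, falling back to OPENAI_API_KEY when the field is empty."""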

    if key == "":
        key = os.environ.get("OPENAI_API_KEY")

    openai.api_key = key

def set_reddit_keys(client_id, client_secret, user_agent):
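    """Create a read-only PRAW Reddit client from the supplied (or environment) credentials."""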

    # Fall back to the environment for any credential left blank
    client_id = client_id or os.environ.get("REDDIT_CLIENT_ID")
    client_secret = client_secret or os.environ.get("REDDIT_CLIENT_SECRET")
    user_agent = user_agent or os.environ.get("REDDIT_USER_AGENT")

    reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent)

    return reddit

def generate_topics(query, model="gpt-3.5-turbo"):
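    """Ask the chat model to turn the query into ten short topics suitable for Reddit search."""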

    current_date = datetime.datetime.now().strftime("%Y-%m-%d")

    messages = [
        {"role": "user", "content": f"The current date is {current_date}. Take this query '{query}' and return a list of 10 simple to understand topics (4 words or less) to input in Search so it returns good results."}
    ]

    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0
    )

    response_message = response["choices"][0]["message"]["content"]

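    # Strip leading "1." style numbering, then split the reply into one topic per line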
    topics = re.sub(r'^\d+\.\s*', '', response_message, flags=re.MULTILINE).split("\n")

    # Post-processing GPT output

    topics = [topic.strip() for topic in topics]

    topics = [topic[1:-1] if (topic.startswith('"') and topic.endswith('"')) or (topic.startswith("'") and topic.endswith("'")) else topic for topic in topics]

    topics = [re.sub(r'[^a-zA-Z0-9\s]', ' ', topic) for topic in topics]

    return topics

def get_relevant_comments(reddit, topics):
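    """Search r/all for each topic and collect top-level comments from the top matching posts."""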

    comments = []

    for topic in tqdm(topics):
        for post in reddit.subreddit("all").search(topic, limit=5):
            
            post.comment_limit = 10
            post.comment_sort = "top"

            # Top level comments only
            post.comments.replace_more(limit=0)

            for comment in post.comments:
                author = comment.author.name if comment.author else '[deleted]'
                comments.append([post.id, comment.id, post.subreddit.display_name, post.title, author, comment.body, datetime.datetime.fromtimestamp(comment.created).strftime('%Y-%m')])

    comments = pd.DataFrame(comments,columns=['source', 'comment_id', 'subreddit', 'title', 'author', 'text', 'date'])

    # Drop empty or "[deleted]" comment bodies
    comments = comments[comments['text'].str.len() > 0]
    comments = comments[comments['text'] != "[deleted]"]

    # Drop AutoModerator comments
    comments = comments[comments['author'] != "AutoModerator"]

    # Drop duplicate comments (the same post can surface under several topics)
    comments = comments.drop_duplicates(subset=['comment_id'])

    return comments

def construct_retriever(comments, k=5):
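    """Embed the comments into a Chroma vector store and return a top-k similarity retriever."""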

    # Convert comments dataframe to a dictionary
    comments = comments.to_dict('records')

    # Combine title, date, body, and subreddit into one searchable string per comment
    texts = [comment["title"] + " " + comment["date"] + ": " + comment["text"] + " " + comment["subreddit"] for comment in comments]

    db = Chroma.from_texts(texts, embs, metadatas=[{"source": comment["source"], "comment_id": comment["comment_id"], "author": comment["author"], "subreddit": comment["subreddit"], "title": comment["title"]} for comment in comments])

    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k})

    return retriever

def construct_bot(retriever):
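    """Wrap the retriever and an OpenAI LLM in a ConversationalRetrievalChain that returns source documents."""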
    bot = ConversationalRetrievalChain.from_llm(OpenAI(openai_api_key=openai.api_key, temperature=0), retriever, return_source_documents=True, max_tokens_limit=2000)
    return bot

def get_response(bot, query, chat_history):
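    """Run the chain on the query and accumulated chat history, returning the raw response dict."""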
    # Convert chat_history to a list of tuples
    chat_history = [tuple(chat) for chat in chat_history]
    response = bot({"question": query, "chat_history": chat_history})
    return response

def restart():
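    """Reset the chat history, sources, bot state, and knowledge-base banner."""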

    print("Chat history and bot knowledge has been cleared!")

    return [], "", gr.State(), "Bot has no knowledge yet! Please enter an initial query to educate the bot."

def main(query, openAI_key, reddit_client_id, reddit_client_secret, reddit_user_agent, chat_history, bot, kb):
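    """Gradio callback: builds the knowledge base on the first query, then answers the query with sources."""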

    set_openai_key(openAI_key)

    if chat_history == []:

        reddit = set_reddit_keys(reddit_client_id, reddit_client_secret, reddit_user_agent)

        print("Bot knowledge has not been initialised yet! Generating topics...")
        topics = generate_topics(query)
        kb = "Bot now has knowledge of the following topics: [" + "".join([f"{i+1}. {topic} " for i, topic in enumerate(topics)]) + "]"

        print("Fetching relevant comments...")
        comments = get_relevant_comments(reddit, topics)

        print("Embedding relevant comments...")
        retriever = construct_retriever(comments)

        print("Educating bot...")
        bot = construct_bot(retriever)

        print("Bot has been constructed and is ready to use!")

    response = get_response(bot, query, chat_history)

    answer, source_documents = response["answer"], response["source_documents"]

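    # Each source links to https://www.reddit.com/r/<subreddit>/comments/<post_id>/comment/<comment_id>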
    source_urls = "### Sources\n\nThe following contain sources the bot might have used to answer your last query:\n\n" + "\n\n".join([f'[{x.metadata["title"]} (r/{x.metadata["subreddit"]})](https://www.reddit.com/r/{x.metadata["subreddit"]}/comments/{x.metadata["source"]}/comment/{x.metadata["comment_id"]})' for x in source_documents])

    chat_history.append((query, answer))

    print(query, answer)

    return "", kb, chat_history, source_urls, bot

# Testing only!

title = "Ask Reddit GPT πŸ“œ"


with gr.Blocks() as demo:
                
        with gr.Group():
            gr.Markdown(f'<center><h1>{title}</h1></center>')
            gr.Markdown(f"Ask Reddit GPT allow you to ask about and chat with information found on Reddit. The tool uses the Reddit API to build a database of knowledge (stored in a Chroma database) and LangChain to query it. For each response, a list of potential sources are sent back. The first query you sent will take a while as it will need to build a knowledge base based on the topics concerning such query. Subsequent queries on the same topic will be much faster. If however, you would like to ask a question concerning other topics, you will need to clear out the knowledge base. To do this, click the 'Restart knowledge base' button below.")

            with gr.Accordion("Instructions", open=False):
                gr.Markdown('''1. You will need an **OpenAI** API key! Get one [here](https://platform.openai.com/account/api-keys).



                2. You will also need **Reddit** credentials! Steps to obtain them:

                * Log in to Reddit.

                * Go [here](https://www.reddit.com/prefs/apps). 

                * Scroll to the bottom.

                * Click "create another app...".

                * Fill in the details as you wish, but make sure you select "script" as the type.

                * Click "create app".

                * Copy the client ID, client secret, and user agent name and paste them in the boxes below.  

                * All done!

                ''')  
        
        with gr.Group():

            with gr.Accordion("Credentials", open=True):
                openAI_key=gr.Textbox(label='Enter your OpenAI API key here:')
                reddit_client_id=gr.Textbox(label='Enter your Reddit client ID here:')
                reddit_client_secret=gr.Textbox(label='Enter your Reddit client secret here:')
                reddit_user_agent=gr.Textbox(label='Enter your Reddit user agent here:')

        with gr.Group():

            kb = gr.Markdown("Bot has no knowledge yet! Please enter an initial query to educate the bot.")
            chat_history = gr.Chatbot()
            bot = gr.State()

            query = gr.Textbox()
            submit = gr.Button("Submit")
            submit.style(full_width=True)

            clear = gr.Button("Restart knowledge base")
            clear.style(full_width=True)

            sources = gr.Markdown()

            submit.click(main, [query, openAI_key, reddit_client_id, reddit_client_secret, reddit_user_agent, chat_history, bot, kb], [query, kb, chat_history, sources, bot])
            clear.click(restart, None, [chat_history, sources, bot, kb], queue=False)

demo.launch()