Spaces:

andaqu
/

ask-reddit-gpt

Runtime error

App Files Files Community

wendru18 committed on May 8, 2023

Commit

7da8c71

1 Parent(s): 6ee98e6

added langchain

Browse files

Files changed (2) hide show

main.ipynb +158 -141
semantic_search.py +3 -3

main.ipynb CHANGED Viewed

@@ -2,13 +2,11 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "from semantic_search import SemanticSearch \n",
     "import pandas as pd\n",
-    "import tiktoken\n",
     "import openai\n",
     "import praw\n",
     "import os\n",
@@ -20,16 +18,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "searcher = SemanticSearch()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -50,19 +55,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "def generate_topics(query, model=\"gpt-3.5-turbo\"):\n",
     "\n",
     "    messages = [\n",
-    "        {\"role\": \"user\", \"content\": f\"Take this query '{query}' and return a list of short topics to input in Search so it returns good results. Each topic must stand on its own with respect to the relation of the question.\"},\n",
     "    ]\n",
     "\n",
     "    response = openai.ChatCompletion.create(\n",
     "        model=model,\n",
-    "        messages=messages\n",
     "    )\n",
     "\n",
     "    response_message = response[\"choices\"][0][\"message\"][\"content\"]\n",
@@ -74,21 +80,30 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "query = \"Where are some nice places where I can work remotely in Malta?\""
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
-   "outputs": [],
    "source": [
     "topics = generate_topics(query)\n",
     "topics = [topic.strip() for topic in topics]\n",
     "print(topics)"
    ]
   },
@@ -97,50 +112,37 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Relevant Subreddits Retrieval"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "posts = []\n",
     "\n",
     "for topic in topics:\n",
     "    for post in reddit.subreddit(\"all\").search(\n",
-    "    topic, limit=200):\n",
-    "        posts.append([post.title, post.subreddit, post.selftext])\n",
     "\n",
-    "posts = pd.DataFrame(posts,columns=['title', 'subreddit', 'text'])\n",
     "\n",
-    "# Segments is title, text and subreddit at the end\n",
-    "segments = (posts['title'] + ' ' + posts['subreddit'].astype(str)).tolist()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "searcher.fit(segments, n_neighbors=5)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# TODO: Add distance check here\n",
-    "subreddits = set([result.split()[-1] for result in searcher(query)])\n",
-    "\n",
-    "# Convert to string and \"+\" in between\n",
-    "subreddits = \"+\".join(subreddits)\n",
-    "\n",
-    "print(f\"Relevant subreddits: {subreddits}\")"
    ]
   },
   {
@@ -148,148 +150,163 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Relevant Posts Retrieval"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "segments = []\n",
-    "segment_length = 100\n",
-    "\n",
-    "\n",
-    "for topic in topics:\n",
-    "    for post in reddit.subreddit(subreddits).search(\n",
-    "        topic, limit=50):\n",
-    "            \n",
-    "            comments = \"\"\n",
-    "\n",
-    "            post.comments.replace_more(limit=3)\n",
-    "            for comment in post.comments.list():\n",
-    "                if comment.body != \"[deleted]\":\n",
-    "                    comments += comment.body + \"\\n\"\n",
-    "\n",
-    "            words = comments.split()\n",
-    "            segments.extend([post.title + \" \" + post.id + \"\\n\" + ' '.join(words[i:i+segment_length]) for i in range(0, len(words), segment_length)])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
-   "outputs": [],
    "source": [
-    "searcher.fit(segments, n_neighbors=5)"
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Answering the Query"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def num_tokens(text, model):\n",
-    "    encoding = tiktoken.encoding_for_model(model)\n",
-    "    return len(encoding.encode(text))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def form_query(query, model, token_budget):\n",
-    "\n",
-    "    relevant_segments = searcher(query)\n",
-    "\n",
-    "    introduction = 'Use the below segments from multiple Reddit posts to answer the subsequent question. If the answer cannot be found in the articles, write \"I could not find an answer.\" Cite each sentence using the [postid] notation found at the start of each segment. Every sentence MUST have a citation!\\n\\n'\n",
-    "\n",
-    "    message = introduction\n",
-    "\n",
-    "    query = f\"\\n\\nQuestion: {query}\"\n",
-    "\n",
-    "    evidence = []\n",
-    "\n",
-    "    for i, result in enumerate(relevant_segments):\n",
-    "        if (\n",
-    "            num_tokens(message + result + query, model=model)\n",
-    "            > token_budget\n",
-    "        ):\n",
-    "            break\n",
-    "        else:\n",
-    "            result = result + \"\\n\\n\"\n",
-    "            message += result\n",
-    "            evidence.append(result.split(\"\\n\")[0])\n",
-    "\n",
-    "    evidence = list(set(evidence))\n",
     "\n",
-    "    return message + query, evidence"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
-   "outputs": [],
    "source": [
-    "def generate_answer(query, model, token_budget, temperature):\n",
-    "    \n",
-    "    message, evidence = form_query(query, model, token_budget)\n",
-    "\n",
-    "    messages = [\n",
-    "        {\"role\": \"user\", \"content\": message},\n",
-    "    ]\n",
-    "\n",
-    "    print(message)\n",
-    "\n",
-    "    response = openai.ChatCompletion.create(\n",
-    "        model=model,\n",
-    "        messages=messages,\n",
-    "        temperature=temperature\n",
-    "    )\n",
-    "    \n",
-    "    response_message = response[\"choices\"][0][\"message\"][\"content\"]\n",
-    "\n",
-    "    return response_message, evidence"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "answer, evidence = generate_answer(query, \"gpt-3.5-turbo\", 1000, 0)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
-   "outputs": [],
    "source": [
-    "query"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
-   "outputs": [],
    "source": [
-    "answer"
    ]
   }
  ],
@@ -309,7 +326,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.0"
   },
   "orig_nbformat": 4
  },

  "cells": [
   {
    "cell_type": "code",
+   "execution_count": 94,
    "metadata": {},
    "outputs": [],
    "source": [
     "import pandas as pd\n",
     "import openai\n",
     "import praw\n",
     "import os\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 95,
    "metadata": {},
    "outputs": [],
    "source": [
+    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
+    "from langchain.text_splitter import CharacterTextSplitter\n",
+    "from langchain.vectorstores import Chroma\n",
+    "from langchain.docstore.document import Document\n",
+    "from langchain.prompts import PromptTemplate\n",
+    "from langchain.indexes.vectorstore import VectorstoreIndexCreator\n",
+    "from langchain.chains.qa_with_sources import load_qa_with_sources_chain\n",
+    "from langchain.llms import OpenAI"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 96,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 97,
    "metadata": {},
    "outputs": [],
    "source": [
     "def generate_topics(query, model=\"gpt-3.5-turbo\"):\n",
     "\n",
     "    messages = [\n",
+    "        {\"role\": \"user\", \"content\": f\"Take this query '{query}' and return a list of 10 simple to understand topics (3 words or less) to input in Search so it returns good results.\"},\n",
     "    ]\n",
     "\n",
     "    response = openai.ChatCompletion.create(\n",
     "        model=model,\n",
+    "        messages=messages,\n",
+    "        temperature=0\n",
     "    )\n",
     "\n",
     "    response_message = response[\"choices\"][0][\"message\"][\"content\"]\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 108,
    "metadata": {},
    "outputs": [],
    "source": [
+    "query = \"Are we in a recession right now?\""
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 109,
    "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['Current economic status', 'Recession indicators', 'Unemployment rates', 'GDP growth rate', 'Consumer spending trends', 'Stock market performance', 'Federal Reserve actions', 'Economic stimulus packages', 'Business closures impact', 'Housing market trends']\n"
+     ]
+    }
+   ],
    "source": [
     "topics = generate_topics(query)\n",
     "topics = [topic.strip() for topic in topics]\n",
+    "topics = [topic[1:-1] if (topic.startswith('\"') and topic.endswith('\"')) or (topic.startswith(\"'\") and topic.endswith(\"'\")) else topic for topic in topics]\n",
     "print(topics)"
    ]
   },
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "## Relevant Subreddit Retrieval"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 110,
    "metadata": {},
    "outputs": [],
    "source": [
     "posts = []\n",
+    "comments = []\n",
     "\n",
     "for topic in topics:\n",
     "    for post in reddit.subreddit(\"all\").search(\n",
+    "    topic, limit=5):\n",
+    "        posts.append([post.id, post.title, post.subreddit, post.selftext])\n",
+    "        post.comments.replace_more(limit=1)\n",
     "\n",
+    "        for comment in post.comments.list():\n",
+    "            posts.append([post.id, post.title, post.subreddit, comment.body])\n",
     "\n",
+    "posts = pd.DataFrame(posts,columns=['source', 'title', 'subreddit', 'text'])"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 111,
    "metadata": {},
    "outputs": [],
    "source": [
+    "posts[\"subreddit\"] = posts[\"subreddit\"].apply(lambda x: x.display_name)\n"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "## Answering Query with Langchain"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 112,
    "metadata": {},
    "outputs": [],
    "source": [
+    "text = posts[\"text\"].tolist()\n",
+    "text = \" \".join(text)"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 113,
    "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Created a chunk of size 1635, which is longer than the specified 1000\n",
+      "Created a chunk of size 1298, which is longer than the specified 1000\n",
+      "Created a chunk of size 1109, which is longer than the specified 1000\n",
+      "Created a chunk of size 2072, which is longer than the specified 1000\n",
+      "Created a chunk of size 1498, which is longer than the specified 1000\n",
+      "Created a chunk of size 1419, which is longer than the specified 1000\n",
+      "Created a chunk of size 1127, which is longer than the specified 1000\n",
+      "Created a chunk of size 1576, which is longer than the specified 1000\n",
+      "Created a chunk of size 1314, which is longer than the specified 1000\n",
+      "Created a chunk of size 2563, which is longer than the specified 1000\n",
+      "Created a chunk of size 1287, which is longer than the specified 1000\n",
+      "Created a chunk of size 1649, which is longer than the specified 1000\n",
+      "Created a chunk of size 1616, which is longer than the specified 1000\n",
+      "Created a chunk of size 1573, which is longer than the specified 1000\n",
+      "Created a chunk of size 1024, which is longer than the specified 1000\n",
+      "Created a chunk of size 1395, which is longer than the specified 1000\n",
+      "Created a chunk of size 1712, which is longer than the specified 1000\n",
+      "Created a chunk of size 1175, which is longer than the specified 1000\n",
+      "Created a chunk of size 3872, which is longer than the specified 1000\n",
+      "Created a chunk of size 1098, which is longer than the specified 1000\n",
+      "Created a chunk of size 1429, which is longer than the specified 1000\n",
+      "Created a chunk of size 1002, which is longer than the specified 1000\n",
+      "Created a chunk of size 2241, which is longer than the specified 1000\n",
+      "Created a chunk of size 1923, which is longer than the specified 1000\n",
+      "Created a chunk of size 1716, which is longer than the specified 1000\n",
+      "Created a chunk of size 2563, which is longer than the specified 1000\n",
+      "Created a chunk of size 1221, which is longer than the specified 1000\n",
+      "Created a chunk of size 2449, which is longer than the specified 1000\n",
+      "Created a chunk of size 1321, which is longer than the specified 1000\n",
+      "Created a chunk of size 1302, which is longer than the specified 1000\n",
+      "Created a chunk of size 2182, which is longer than the specified 1000\n",
+      "Created a chunk of size 1027, which is longer than the specified 1000\n",
+      "Created a chunk of size 1156, which is longer than the specified 1000\n",
+      "Created a chunk of size 7334, which is longer than the specified 1000\n",
+      "Created a chunk of size 1849, which is longer than the specified 1000\n",
+      "Created a chunk of size 2829, which is longer than the specified 1000\n",
+      "Created a chunk of size 1567, which is longer than the specified 1000\n",
+      "Created a chunk of size 1245, which is longer than the specified 1000\n",
+      "Created a chunk of size 1299, which is longer than the specified 1000\n",
+      "Created a chunk of size 1003, which is longer than the specified 1000\n",
+      "Created a chunk of size 1327, which is longer than the specified 1000\n",
+      "Created a chunk of size 2079, which is longer than the specified 1000\n",
+      "Created a chunk of size 2780, which is longer than the specified 1000\n",
+      "Created a chunk of size 1522, which is longer than the specified 1000\n",
+      "Created a chunk of size 1766, which is longer than the specified 1000\n",
+      "Created a chunk of size 1079, which is longer than the specified 1000\n",
+      "Created a chunk of size 1080, which is longer than the specified 1000\n",
+      "Created a chunk of size 1755, which is longer than the specified 1000\n",
+      "Created a chunk of size 1232, which is longer than the specified 1000\n",
+      "Created a chunk of size 1279, which is longer than the specified 1000\n",
+      "Created a chunk of size 3189, which is longer than the specified 1000\n",
+      "Created a chunk of size 1549, which is longer than the specified 1000\n",
+      "Created a chunk of size 1124, which is longer than the specified 1000\n",
+      "Created a chunk of size 1033, which is longer than the specified 1000\n",
+      "Created a chunk of size 1676, which is longer than the specified 1000\n",
+      "Created a chunk of size 1011, which is longer than the specified 1000\n",
+      "Created a chunk of size 1723, which is longer than the specified 1000\n"
+     ]
+    }
+   ],
    "source": [
+    "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
+    "texts = text_splitter.split_text(text)\n",
     "\n",
+    "embeddings = OpenAIEmbeddings()"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 114,
    "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Using embedded DuckDB without persistence: data will be transient\n"
+     ]
+    }
+   ],
    "source": [
+    "docsearch = Chroma.from_texts(texts, embeddings, metadatas=[{\"source\": str(i)} for i in range(len(texts))]).as_retriever()"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 115,
    "metadata": {},
    "outputs": [],
    "source": [
+    "docs = docsearch.get_relevant_documents(query)"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 116,
    "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content=\"A recession is always just around the corner, meanwhile the S&P has been making higher lows since October 2022, unemployed in Canada is 5.0%, and GDP hasn’t been negative for two quarters. It is likely to happen. And economists will have to work fast to figure out a solution. You need demand to sustain supply. And without jobs there isn't demand, nor tax revenue.\\n\\nSo short term: pain is possible.\\n\\nMedium term: we might benefit from doing less work and getting paid the same.\", metadata={'source': '39'}),\n",
+       " Document(page_content=\"Don't listen to people who say GDP is the only thing that matters for recessions.  The NBER's definition -- which hasn't changed -- requires economic downturn across the economy in a broad sense.  We probably can't have a recession and very low unemployment at the same time.  So if we have a recession, we will have layoffs, too. [deleted] Most layoffs I've seen headlines for are in tech. While tech is a huge sector, it is important to look at the market as a whole. Job growth and job demand is still high. There are many many different indicators people look at. Housing prices is another. The difference between the bond curve. 3 month compared to the 1 year is another common one. When that inverts it means the economy isn't in a great place. It's so hard to predict which is why people constantly say to dollar cost average. Continuously buy and don't try to time anything. This time unemployment rate will be affected.\", metadata={'source': '13'}),\n",
+       " Document(page_content='High tech layoffs often triggers layoff in retail as affected people no longer have the means to spend. Restaurant closing. Meta has to write off $0.67 billion dollars or more next qtr as 1 time charge.  Tweeter let go 50 pct employees so SF is already slow business wise. Housing industry is going to get hit this winter and even spring.   Be honest with ourselves we have been in recession most of the year.  Not too many mfg jobs are in the US so you do not see that many job losses.\\n\\nAfter Xmas I imagine companies like mail order delivery will slow down also. Your pocketbook. Negative GDP, inflation, decrease in purchases, growing unemployment... decrease of consumer spending; and increase of umemployment Honestly fear of a recession is the best indicator. People and companies start holding their money close to be safe and it becomes a self fulfilling prophecy. Most reliable indicator is when people are yelling for higher prices when their asset already doubled, tripled in value.', metadata={'source': '14'}),\n",
+       " Document(page_content='?? Agreed. Investors are jumping at the slightest bit of good news. But the job market is still strong, Fed wont stop raising rates until that changes. Not to mention the fact that oil is going up again. Which was a big factor in how we got to 8% - 9% inflation in the first place. Gas is almost $6 in my area now. More than it has ever been where I live. We are at the beginning of a lot of pain. Not the middle or the end. Recession is here, and it will deepen and broaden until everyone finally sees it. I mean australia raised under expectations but I dont think  that matter I think the jump was - Australia only bumped by 25 basis pts and for some reason the market is in love with the fed pivot idea, jobs openings reduced something like 10%, Bond yields went down as well.  But no, nothing fundamentally changed. The bank of England has started QE >didn\\'t interest rates hike again?\\n\\nYou \"call a trap\" but aren\\'t entirely sure if interest rates were hiked (they were)?', metadata={'source': '179'})]"
+      ]
+     },
+     "execution_count": 116,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
+    "docs"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 117,
    "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "' It is likely that we are in a recession right now.\\nSOURCES: 39, 13, 14, 179'"
+      ]
+     },
+     "execution_count": 117,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
+    "chain = load_qa_with_sources_chain(OpenAI(temperature=0), chain_type=\"stuff\")\n",
+    "chain.run(input_documents=docs, question=query)"
    ]
   }
  ],
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
+   "version": "3.10.0"
   },
   "orig_nbformat": 4
  },

semantic_search.py CHANGED Viewed

@@ -21,12 +21,12 @@ class SemanticSearch:
     def __call__(self, text, return_data=True):
         inp_emb = self.use([text])
-        neighbors, distances = self.nn.kneighbors(inp_emb, return_distance=True)[0]
         if return_data:
-            return [self.data[i] for i in neighbors], distances
         else:
-            return neighbors, distances
     def get_text_embedding(self, texts, batch=1000):

     def __call__(self, text, return_data=True):
         inp_emb = self.use([text])
+        distances, neighbors = self.nn.kneighbors(inp_emb, return_distance=True)
         if return_data:
+            return [self.data[i] for i in neighbors[0]], distances
         else:
+            return neighbors[0], distances
     def get_text_embedding(self, texts, batch=1000):