Spaces:
Sleeping
Sleeping
adding the asking functionality
Browse files- .gitignore +2 -1
- TwitterChatBot/__init__.py +0 -0
- TwitterChatBot/chat.py +55 -0
- TwitterChatBot/gpt_3_manager.py +32 -0
- TwitterChatBot/index.py +37 -0
- TwitterChatBot/main.py +28 -0
- TwitterChatBot/prompt.py +57 -0
- TwitterChatBot/tests/chat_test.py +31 -0
- TwitterChatBot/tests/gpt_3_manager_test.py +21 -0
- TwitterChatBot/tests/index_test.py +30 -0
- TwitterChatBot/tests/prompt_test.py +62 -0
- TwitterChatBot/tests/utils_test.py +14 -0
- TwitterChatBot/utils.py +17 -0
- app.py +17 -16
- index/build_index.py +37 -0
- index/index.jsonl +0 -0
- prompts/passage_summarization.txt +4 -0
- prompts/question_answering.txt +7 -0
- requirements.txt +3 -1
.gitignore
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
.env
|
2 |
-
env/
|
|
|
|
1 |
.env
|
2 |
+
env/
|
3 |
+
__pycache__/
|
TwitterChatBot/__init__.py
ADDED
File without changes
|
TwitterChatBot/chat.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import openai
|
3 |
+
from pathlib import Path
|
4 |
+
from dotenv import load_dotenv
|
5 |
+
|
6 |
+
from TwitterChatBot.index import IndexSearchEngine
|
7 |
+
from TwitterChatBot.prompt import (
|
8 |
+
QuestionAnsweringPrompt,
|
9 |
+
PassageSummarizationPrompt,
|
10 |
+
TextPromptLoader,
|
11 |
+
)
|
12 |
+
|
13 |
+
|
14 |
+
load_dotenv()
|
15 |
+
|
16 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
17 |
+
|
18 |
+
openai.api_key = OPENAI_API_KEY
|
19 |
+
|
20 |
+
|
21 |
+
class ChatBot:
    """Answers a question by retrieving similar passages from the index,
    asking GPT-3 about each, then summarizing the partial answers."""

    def __init__(
        self, index_search_engine: IndexSearchEngine, prompt_loader, gpt_manager
    ):
        self.index_search_engine = index_search_engine
        # fixed typo: attribute was previously misspelled `prompet_loader`
        self.prompt_loader = prompt_loader
        self.gpt_manager = gpt_manager

    def ask(self, question, passage_count=2):
        """Return a summarized answer to *question*.

        passage_count: how many top-ranked passages to consult
        (was a hard-coded 2; default preserves the old behavior).
        May return None if every completion request fails.
        """
        search_result = self.index_search_engine.search(
            question=question, count=passage_count
        )

        answers = []
        for result in search_result:
            question_answering_prompt = QuestionAnsweringPrompt(
                passage=result, question=question, prompt_loader=self.prompt_loader
            )
            prompt = question_answering_prompt.load(
                Path("./prompts") / "question_answering.txt"
            )

            answer = self.gpt_manager.get_completion(
                prompt=prompt, max_tokens=80, model="text-curie-001"
            )
            # get_completion returns None on failure; joining None would
            # raise TypeError below, so skip failed completions.
            if answer is not None:
                answers.append(answer)

        passage_summarization_prompt = PassageSummarizationPrompt(
            "\n".join(answers), self.prompt_loader
        )

        prompt = passage_summarization_prompt.load(
            Path("./prompts") / "passage_summarization.txt"
        )

        final_answer = self.gpt_manager.get_completion(prompt=prompt)
        return final_answer
|
TwitterChatBot/gpt_3_manager.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import openai
|
2 |
+
|
3 |
+
|
4 |
+
class Gpt3Manager:
    """Thin wrapper around the OpenAI completion and embedding endpoints."""

    def __init__(self, api_key):
        # NOTE(review): this sets module-global openai state, so the most
        # recently constructed manager wins — confirm that is intended.
        openai.api_key = api_key

    def get_completion(self, prompt, max_tokens=128, model="text-davinci-003"):
        """Return the completion text for *prompt*, or None on failure."""
        try:
            result = openai.Completion.create(
                prompt=prompt,
                max_tokens=max_tokens,
                model=model,
            )
            return result["choices"][0]["text"]
        except Exception as err:
            print(f"Sorry, There was a problem \n\n {err}")
            return None

    def get_embedding(self, prompt, model="text-similarity-ada-001"):
        """Return the embedding vector for *prompt*, or None on failure."""
        flattened = prompt.replace("\n", " ")
        try:
            response = openai.Embedding.create(input=[flattened], model=model)
            return response["data"][0]["embedding"]
        except Exception as err:
            print(f"Sorry, There was a problem {err}")
            return None
|
TwitterChatBot/index.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from abc import ABC, abstractmethod
|
2 |
+
import jsonlines
|
3 |
+
from TwitterChatBot.utils import dot_similarity
|
4 |
+
|
5 |
+
|
6 |
+
class Index(ABC):
    """Interface for anything that can load a passage index from disk."""

    @abstractmethod
    def load(self, path):
        """Return the list of index records stored at *path*."""
        pass


class JsonLinesIndex(Index):
    """Loads an index stored as one JSON object per line (.jsonl)."""

    def load(self, path):
        with jsonlines.open(path) as reader:
            return list(reader)
|
17 |
+
|
18 |
+
|
19 |
+
class IndexSearchEngine:
    """Ranks indexed passages by embedding similarity to a question."""

    def __init__(self, indexes, gpt_manager):
        self.indexes = indexes
        self.gpt_manager = gpt_manager

    def search(self, question, count=4):
        """Return the contents of the *count* passages most similar to *question*."""
        question_embedding = self.gpt_manager.get_embedding(prompt=question)

        # Score every indexed passage against the question embedding.
        scored = [
            {"index": entry, "score": dot_similarity(question_embedding, entry["embedding"])}
            for entry in self.indexes
        ]

        ranked = sorted(scored, key=lambda item: item["score"], reverse=True)

        return [hit["index"]["content"] for hit in ranked[:count]]
|
TwitterChatBot/main.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from pathlib import Path
|
3 |
+
from dotenv import load_dotenv
|
4 |
+
|
5 |
+
from TwitterChatBot.index import IndexSearchEngine
|
6 |
+
from TwitterChatBot.gpt_3_manager import Gpt3Manager
|
7 |
+
from TwitterChatBot.chat import ChatBot
|
8 |
+
from TwitterChatBot.index import JsonLinesIndex
|
9 |
+
from TwitterChatBot.prompt import TextPromptLoader
|
10 |
+
|
11 |
+
load_dotenv()
|
12 |
+
|
13 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
14 |
+
|
15 |
+
path = Path("./index") / "index.jsonl"
|
16 |
+
|
17 |
+
|
18 |
+
def ask(question):
    """Answer *question* using the JSONL passage index.

    The index, GPT manager and chatbot are expensive to build (the whole
    index file is read from disk), so they are constructed once on first
    use and cached on the function object for subsequent calls.
    """
    chatbot = getattr(ask, "_chatbot", None)
    if chatbot is None:
        index = JsonLinesIndex()
        loaded = index.load(path)
        gpt_manager = Gpt3Manager(api_key=OPENAI_API_KEY)

        engine = IndexSearchEngine(loaded, gpt_manager=gpt_manager)
        loader = TextPromptLoader()
        chatbot = ChatBot(engine, prompt_loader=loader, gpt_manager=gpt_manager)
        ask._chatbot = chatbot

    return chatbot.ask(question)
|
TwitterChatBot/prompt.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from abc import ABC, abstractmethod
|
2 |
+
|
3 |
+
# Prompt Loaders
|
4 |
+
class PromptLoader(ABC):
    """Interface for loading a raw prompt template from disk."""

    @abstractmethod
    def load_prompt(self, path):
        """Return the template text stored at *path*."""
        pass


class TextPromptLoader(PromptLoader):
    """Loads a prompt template from a plain text file."""

    def load_prompt(self, path):
        # f.read() returns the whole file verbatim; the previous
        # "".join(f.readlines()) built an intermediate list for nothing.
        with open(path) as f:
            return f.read()
|
15 |
+
|
16 |
+
|
17 |
+
# Prompts
|
18 |
+
class Prompt(ABC):
    """Base class for fill-in-the-template prompts.

    Owns a PromptLoader for reading raw templates; subclasses implement
    load() to substitute their placeholders.
    """

    def __init__(self, prompt_loader: PromptLoader):
        self.prompt_loader = prompt_loader

    def load_prompt(self, path):
        # Delegate raw template loading to the injected loader.
        return self.prompt_loader.load_prompt(path)

    @abstractmethod
    def load(self, path):
        # Subclasses return the template at *path* with placeholders filled in.
        pass
|
28 |
+
|
29 |
+
|
30 |
+
class QuestionAnsweringPrompt(Prompt):
    """Fills the question-answering template with a passage and a question."""

    def __init__(self, passage, question, prompt_loader):
        super().__init__(prompt_loader=prompt_loader)
        self.passage = passage
        self.question = question

    def load(self, path):
        """Return the template at *path* with <<PASSAGE>> and <<QUESTION>>
        replaced by this prompt's passage and question."""
        prompt = (
            self.load_prompt(path)
            .replace("<<PASSAGE>>", self.passage)
            .replace("<<QUESTION>>", self.question)
        )
        return prompt
|
46 |
+
|
47 |
+
|
48 |
+
class PassageSummarizationPrompt(Prompt):
    """Fills the summarization template with a passage."""

    def __init__(self, passage, prompt_loader):
        super().__init__(prompt_loader=prompt_loader)
        self.passage = passage

    def load(self, path):
        """Return the template at *path* with <<PASSAGE>> replaced by this
        prompt's passage."""
        prompt = self.load_prompt(path).replace("<<PASSAGE>>", self.passage)
        return prompt
|
TwitterChatBot/tests/chat_test.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from pathlib import Path
|
3 |
+
|
4 |
+
from index import IndexSearchEngine
|
5 |
+
from gpt_3_manager import Gpt3Manager
|
6 |
+
|
7 |
+
from dotenv import load_dotenv
|
8 |
+
from chat import ChatBot
|
9 |
+
from index import JsonLinesIndex
|
10 |
+
|
11 |
+
from prompt import TextPromptLoader
|
12 |
+
|
13 |
+
load_dotenv()
|
14 |
+
|
15 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
16 |
+
|
17 |
+
|
18 |
+
def test_chatbot():
    """End-to-end smoke test: the chatbot produces some answer for a question."""
    path = Path("index") / "index.jsonl"

    index = JsonLinesIndex()
    loaded = index.load(path)
    gpt_manager = Gpt3Manager(api_key=OPENAI_API_KEY)

    engine = IndexSearchEngine(loaded, gpt_manager=gpt_manager)
    loader = TextPromptLoader()
    chatbot = ChatBot(engine, prompt_loader=loader, gpt_manager=gpt_manager)

    answer = chatbot.ask("What does the twitter terms of service does")

    # `is not None` is the idiomatic None check (PEP 8); `!= None` goes
    # through __eq__ and can be fooled by custom types.
    assert answer is not None
|
TwitterChatBot/tests/gpt_3_manager_test.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from dotenv import load_dotenv
|
3 |
+
from gpt_3_manager import Gpt3Manager
|
4 |
+
|
5 |
+
load_dotenv()
|
6 |
+
|
7 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
8 |
+
|
9 |
+
|
10 |
+
def test_gpt3_completion():
    """A completion request with a small model/token budget returns text."""
    manager = Gpt3Manager(api_key=OPENAI_API_KEY)
    completion = manager.get_completion(
        prompt="This is a testing prompt", max_tokens=10, model="text-ada-001"
    )
    # get_completion returns None on failure; PEP 8 prefers `is not None`.
    assert completion is not None


def test_gpt3_embedding():
    """An embedding request returns a vector."""
    manager = Gpt3Manager(api_key=OPENAI_API_KEY)
    embedding = manager.get_embedding(prompt="This is a testing prompt")
    assert embedding is not None
|
TwitterChatBot/tests/index_test.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from index import JsonLinesIndex, IndexSearchEngine
|
3 |
+
from gpt_3_manager import Gpt3Manager
|
4 |
+
from pathlib import Path
|
5 |
+
from dotenv import load_dotenv
|
6 |
+
|
7 |
+
load_dotenv()
|
8 |
+
|
9 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
10 |
+
|
11 |
+
|
12 |
+
def test_jsonlines_index():
    """The JSONL index file loads into a non-None list of records."""
    path = Path("index") / "index.jsonl"

    index = JsonLinesIndex()
    result = index.load(path)

    # PEP 8: compare to None with `is not`, never `!=`.
    assert result is not None


# NOTE(review): "serach" is a typo for "search"; the name is kept unchanged
# so any external references to this test id keep working.
def test_index_serach_engine():
    """Searching the loaded index for a question returns some results."""
    path = Path("index") / "index.jsonl"
    gpt_manager = Gpt3Manager(OPENAI_API_KEY)
    index = JsonLinesIndex()
    loaded = index.load(path)
    engine = IndexSearchEngine(loaded, gpt_manager=gpt_manager)

    results = engine.search(question="What does the twitter tos does")

    assert results is not None
|
TwitterChatBot/tests/prompt_test.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
from prompt import QuestionAnsweringPrompt, PassageSummarizationPrompt, TextPromptLoader
|
3 |
+
|
4 |
+
|
5 |
+
def test_text_prompt_loader():
    # The loader must return the template file's text verbatim,
    # placeholders included.
    path = Path("prompts") / "question_answering.txt"
    prompt_loader = TextPromptLoader()

    prompt = prompt_loader.load_prompt(path)
    testing_prompt = (
        "Use the passage to write a detailed answer to the following question\n"
        "\n"
        "passage: <<PASSAGE>>\n"
        "\n"
        "question: <<QUESTION>>\n"
        "\n"
        "answer:"
    )

    assert prompt == testing_prompt


def test_question_answering_prompt():
    # load() must substitute both <<PASSAGE>> and <<QUESTION>> placeholders.
    path = Path("prompts") / "question_answering.txt"

    passage = "Hi, I'm foo and I love cycling and programming"
    question = "What is foo's hobby"

    prompt_loader = TextPromptLoader()
    prompt = QuestionAnsweringPrompt(passage, question, prompt_loader)
    loaded_prompt = prompt.load(path)

    testing_prompt = (
        "Use the passage to write a detailed answer to the following question\n"
        "\n"
        "passage: Hi, I'm foo and I love cycling and programming\n"
        "\n"
        "question: What is foo's hobby\n"
        "\n"
        "answer:"
    )

    assert loaded_prompt == testing_prompt


def test_passage_summarization_prompt():
    # load() must substitute the single <<PASSAGE>> placeholder.
    path = Path("prompts") / "passage_summarization.txt"

    passage = "Hi, I'm foo and I love cycling and programming"

    prompt_loader = TextPromptLoader()
    prompt = PassageSummarizationPrompt(passage, prompt_loader)
    loaded_prompt = prompt.load(path)

    testing_prompt = (
        "Summarize the following passage in detail\n"
        "passage: Hi, I'm foo and I love cycling and programming\n"
        "\n"
        "summary:"
    )

    assert loaded_prompt == testing_prompt
|
TwitterChatBot/tests/utils_test.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
from utils import load_prompt
|
3 |
+
|
4 |
+
|
5 |
+
def test_load_prompt_default():
    """load_prompt must return the template file's contents verbatim."""
    path = Path("prompts") / "question_answering.txt"

    # Build the expected text straight from the file.
    with open(path) as f:
        expected = f.read()

    assert load_prompt(path) == expected
|
TwitterChatBot/utils.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
|
3 |
+
|
4 |
+
def load_prompt(path):
    """Return the full text of the prompt template at *path*."""
    # f.read() yields the whole file in one call; the previous
    # "".join(f.readlines()) built an intermediate list for nothing.
    with open(path) as f:
        return f.read()
|
8 |
+
|
9 |
+
|
10 |
+
def cosine_similarity(emb1, emb2):
    """Cosine of the angle between two embedding vectors."""
    norm1 = np.dot(emb1, emb1) ** 0.5
    norm2 = np.dot(emb2, emb2) ** 0.5
    return np.dot(emb1, emb2) / (norm1 * norm2)
|
14 |
+
|
15 |
+
|
16 |
+
def dot_similarity(emb1, emb2):
    """Dot-product similarity between two embedding vectors."""
    return np.dot(emb1, emb2)
|
app.py
CHANGED
@@ -1,26 +1,27 @@
|
|
1 |
import os
|
2 |
import gradio as gr
|
3 |
import requests
|
4 |
-
from
|
5 |
|
6 |
-
load_dotenv()
|
7 |
|
8 |
-
|
|
|
|
|
9 |
|
10 |
|
11 |
-
def get_answer(question):
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
|
25 |
|
26 |
def predict(input, history=[]):
|
|
|
1 |
import os
|
2 |
import gradio as gr
|
3 |
import requests
|
4 |
+
from TwitterChatBot.main import ask
|
5 |
|
|
|
6 |
|
7 |
+
def get_answer(question):
    """Return a cleaned-up chatbot answer for *question*."""
    answer = ask(question=question)
    # ask() can return None when the completion request fails
    # (Gpt3Manager swallows errors and returns None); calling .strip()
    # on None would raise AttributeError in the UI.
    if answer is None:
        return "Sorry, We have a problem with our server"
    return answer.strip()
|
10 |
|
11 |
|
12 |
+
# def get_answer(question):
|
13 |
+
# try:
|
14 |
+
# answer = requests.get(
|
15 |
+
# url,
|
16 |
+
# json={"question": question},
|
17 |
+
# )
|
18 |
+
# except Exception as err:
|
19 |
+
# return f"Sorry there was a problem with {err}, please check your connection and try again."
|
20 |
+
|
21 |
+
# if answer.status_code == 200:
|
22 |
+
# return answer.json()["answer"]
|
23 |
+
|
24 |
+
# return "Sorry, We have a problem with our server"
|
25 |
|
26 |
|
27 |
def predict(input, history=[]):
|
index/build_index.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
from pathlib import Path
|
4 |
+
from dotenv import load_dotenv
|
5 |
+
|
6 |
+
import openai
|
7 |
+
import textwrap
|
8 |
+
import jsonlines
|
9 |
+
|
10 |
+
from src.utils import gpt3_embeddings
|
11 |
+
|
12 |
+
load_dotenv()
|
13 |
+
|
14 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
15 |
+
|
16 |
+
openai.api_key = OPENAI_API_KEY
|
17 |
+
|
18 |
+
path = Path("./documents")


# Build the embedding index: read the source document, normalize
# whitespace, split into ~4000-char chunks, embed each chunk, and write
# {"content", "embedding"} records to index/index.jsonl.
with open(path / "result.txt", "r") as f:
    text = f.read()
    # raw string for the regex: "\s" in a plain string is an invalid
    # escape sequence (DeprecationWarning on modern Python)
    text = re.sub(r"\s+", " ", text)  # white space normalization

result = []

chunks = textwrap.wrap(text, 4000)
for chunk in chunks:
    # NOTE(review): gpt3_embeddings is imported from src.utils while the
    # rest of the project imports from TwitterChatBot.utils (which has no
    # such function) — confirm this module path actually exists.
    embedding = gpt3_embeddings(chunk)
    info = {"content": chunk, "embedding": embedding}
    result.append(info)

result_path = Path("./index")

with jsonlines.open(result_path / "index.jsonl", "w") as writer:
    writer.write_all(result)
|
index/index.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
|
|
prompts/passage_summarization.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Summarize the following passage in detail
|
2 |
+
passage: <<PASSAGE>>
|
3 |
+
|
4 |
+
summary:
|
prompts/question_answering.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Use the passage to write a detailed answer to the following question
|
2 |
+
|
3 |
+
passage: <<PASSAGE>>
|
4 |
+
|
5 |
+
question: <<QUESTION>>
|
6 |
+
|
7 |
+
answer:
|
requirements.txt
CHANGED
@@ -1,2 +1,4 @@
|
|
1 |
gradio
|
2 |
-
python-dotenv
|
|
|
|
|
|
1 |
gradio
|
2 |
+
python-dotenv
|
3 |
+
jsonlines
|
4 |
+
openai
|