# NOTE(review): page-scrape residue preserved from the original header, not code: "Spaces: Sleeping Sleeping"
import os

import gradio as gr  # front-end (UI & UX)
import numpy as np
import PyPDF2  # pdf -> text
import weaviate  # vector DB
from openai import OpenAI  # LLM
from sentence_transformers import SentenceTransformer  # embedding
from transformers import AutoModel, AutoTokenizer  # model / tokenizer loading
# Define the class schema used in Weaviate.
def create_schema():
    """Register the ``PdfSentence`` class in the Weaviate schema.

    The class carries two properties: ``sentence`` (``text``) and
    ``embedding`` (``number[]``).  The module-level ``db_client`` is used
    to create it.
    """
    sentence_property = {"name": "sentence", "dataType": ["text"]}
    # The embedding is declared as a numeric-array property.
    embedding_property = {"name": "embedding", "dataType": ["number[]"]}

    db_client.schema.create_class(
        {
            "class": "PdfSentence",  # class name
            "properties": [sentence_property, embedding_property],
        }
    )
# Check the schema and create it if needed; skip when the class was already declared.
def ensure_schema():
    """Create the ``PdfSentence`` class unless the schema already lists it.

    The names of the classes currently in the schema are printed before
    the decision is made.
    """
    existing = []
    for entry in db_client.schema.get()["classes"]:
        existing.append(entry["class"])
    print(existing)

    if "PdfSentence" in existing:
        return
    create_schema()
# Alternative check (disabled): create the schema only when it does not exist.
# if not client.schema.contains({"class": "PdfSentence"}):
#     create_schema()
# PDF text extraction function.
def extract_text_from_pdf(pdf):
    """Return the text of every page of ``pdf``, concatenated in page order.

    ``pdf`` is handed straight to ``PyPDF2.PdfReader``.  Page texts are
    joined with no separator between them.
    """
    reader = PyPDF2.PdfReader(pdf)
    return "".join(page.extract_text() for page in reader.pages)
# Text embedding creation function.
def create_embeddings(text, model):
    """Encode ``text`` with ``model`` and return the vector as a list of floats.

    The value returned by ``model.encode`` is cast to ``float64`` and then
    converted to a plain Python list.
    """
    encoded = model.encode(text)
    as_float64 = encoded.astype(np.float64)
    return as_float64.tolist()
# Store data in Weaviate.
def store_vectors_in_weaviate(sentences, embed_model):
    """Embed each sentence and add it to the ``PdfSentence`` class in Weaviate.

    Args:
        sentences: Iterable of sentence strings.  Empty or whitespace-only
            entries are skipped instead of being embedded and stored.
        embed_model: Model forwarded to ``create_embeddings``.

    Each object is written with its embedding both as the ``embedding``
    property (as before) and as the object's own vector.  The latter is what
    a ``near_vector`` query ranks against; without it the stored embedding
    plays no part in the similarity search.

    Failures are reported per sentence and do not abort the remaining
    sentences (best-effort import).
    """
    with db_client.batch as batch:
        for sentence in sentences:
            if not sentence or not sentence.strip():
                continue
            try:
                embedding = create_embeddings(sentence, embed_model)
                batch.add_data_object(
                    {"sentence": sentence, "embedding": embedding},
                    "PdfSentence",
                    vector=embedding,
                )
                print("success")
            except Exception as e:
                # Deliberately best-effort: report and continue with the next sentence.
                print(e)
# Find the sentence most similar to the question.
def find_similar_sentence_in_weaviate(question_embedding):
    """Run a ``near_vector`` query on ``PdfSentence`` and return the raw response.

    The ``sentence`` and ``embedding`` properties are requested.  No result
    limit is applied, and the response is returned unmodified.
    """
    query = db_client.query.get("PdfSentence", ["sentence", "embedding"])
    query = query.with_near_vector({"vector": question_embedding})
    # Disabled step: .with_limit(1)
    response = query.do()
    # Disabled alternative that returns only the first hit's sentence:
    # return response['data']['Get']['PdfSentence'][0]['sentence']
    return response
def _context_from_search_result(result):
    """Join the ``sentence`` fields of a Weaviate Get response into one string.

    Falls back to ``str(result)`` when the response does not have the
    ``data -> Get -> PdfSentence`` shape or contains no usable sentences, so
    the caller always gets some context text.
    """
    try:
        hits = result["data"]["Get"]["PdfSentence"]
    except (KeyError, TypeError):
        return str(result)
    found = [
        hit["sentence"]
        for hit in hits or []
        if isinstance(hit, dict) and hit.get("sentence")
    ]
    return "\n".join(found) if found else str(result)


def generate_answer(pdf, question):
    """Answer ``question`` using the uploaded PDF as retrieval context.

    The PDF text is split into sentences, the sentences are stored in
    Weaviate, the question is embedded and used for a similarity query, and
    the matching sentences are given to the OpenAI chat model as context.

    Args:
        pdf: Value accepted by ``extract_text_from_pdf`` (the Gradio file path).
        question: The user's question.

    Returns:
        The model's answer text, or a short message asking for the missing
        input when no PDF or no question was supplied.

    Raises:
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is unset.
    """
    if not pdf:
        return "Please upload a PDF file."
    if not question or not question.strip():
        return "Please enter a question."

    # The key is read from the environment; it must never live in source.
    # Checked before the expensive embedding/import work so we fail fast.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY environment variable is not set.")

    text = extract_text_from_pdf(pdf)
    # Drop empty fragments produced by the split so they are not stored.
    sentences = [part for part in text.split('. ') if part.strip()]

    # NOTE(review): every call re-inserts the document's sentences, so asking
    # several questions about the same PDF accumulates duplicates in Weaviate.
    store_vectors_in_weaviate(sentences, embed_model)

    question_embedding = create_embeddings(question, embed_model)
    search_result = find_similar_sentence_in_weaviate(question_embedding)

    # Only the sentence text goes into the prompt; the raw response also
    # carries each hit's full embedding array, which is useless to the LLM.
    context = _context_from_search_result(search_result)

    ai_client = OpenAI(api_key=api_key)
    response = ai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": f"You are a helpful assistant. Answer based on context: {context}"},
            {"role": "user", "content": question},
        ],
    )
    return response.choices[0].message.content
# Gradio interface setup.
def interface(pdf, question):
    """Gradio click handler: forward both inputs to ``generate_answer``."""
    answer = generate_answer(pdf, question)
    return answer
if __name__ == "__main__":
    # Credentials come from the environment instead of being committed.
    # NOTE(review): the keys that were previously hard-coded in this file
    # should be treated as leaked and rotated.
    weaviate_api_key = os.environ.get("WEAVIATE_API_KEY")
    if not weaviate_api_key:
        raise SystemExit("WEAVIATE_API_KEY environment variable is not set.")
    # The cluster URL can be overridden; the previous value stays the default.
    weaviate_url = os.environ.get(
        "WEAVIATE_URL",
        "https://ildmdarvrfcox58ff2tipw.c0.us-west3.gcp.weaviate.cloud",
    )

    # Load the embedding model (module-level: read by generate_answer).
    embed_model = SentenceTransformer('xlm-r-100langs-bert-base-nli-stsb-mean-tokens')

    # Configure the Weaviate client (module-level: read by the helper functions).
    db_client = weaviate.Client(
        url=weaviate_url,  # cluster URL
        auth_client_secret=weaviate.AuthApiKey(api_key=weaviate_api_key),
        # timeout_config=(5, 150)  # timeout setting (optional)
    )
    ensure_schema()

    # Build the Gradio UI.
    with gr.Blocks() as demo:
        pdf_input = gr.File(label="Upload PDF", type="filepath")
        question_input = gr.Textbox(label="Ask a question", placeholder="What do you want to know?")
        output = gr.Textbox(label="Answer")
        submit_btn = gr.Button("Submit")
        submit_btn.click(fn=interface, inputs=[pdf_input, question_input], outputs=output)

    # Run the app.
    demo.launch()