# langchain_goethe / main.py
# Uploaded by goethe0101 via huggingface_hub (commit f021ec0, verified)
# Standard library
import os  # environment-variable lookup for API credentials

# Third-party
import gradio as gr # front-end(ui & ux)
import numpy as np
import PyPDF2 # pdf -> text
import weaviate # vector DB
from openai import OpenAI # LLM
from sentence_transformers import SentenceTransformer # embedding
from transformers import AutoModel, AutoTokenizer # Model, Tokenzier Load
# Weaviate์— ์‚ฌ์šฉํ•  ํด๋ž˜์Šค ์Šคํ‚ค๋งˆ ์ •์˜
def create_schema():
    """Create the Weaviate class that holds PDF sentences and their embeddings.

    Registers a "PdfSentence" class with two properties:
    - sentence: the raw text fragment
    - embedding: the sentence vector serialized as a number[] property
    """
    db_client.schema.create_class(
        {
            "class": "PdfSentence",  # class name
            "properties": [
                {"name": "sentence", "dataType": ["text"]},
                {"name": "embedding", "dataType": ["number[]"]},  # vector type
            ],
        }
    )
# ์Šคํ‚ค๋งˆ ํ™•์ธ ๋ฐ ์ƒ์„ฑ / ๊ธฐ์กด์— ์„ ์–ธํ•œ ์Šคํ‚ค๋งˆ๊ฐ€ ์žˆ๋Š” ๊ฒฝ์šฐ -> pass(๋„˜์–ด๊ฐ„๋‹ค.)
def ensure_schema():
    """Create the "PdfSentence" class if it does not exist yet.

    Fetches the full schema from Weaviate and only calls create_schema()
    when no class named "PdfSentence" is present, so repeated start-ups do
    not fail with a duplicate-class error.
    """
    schema = db_client.schema.get()
    # A completely empty instance may return a schema without a "classes"
    # key; default to an empty list instead of raising KeyError.
    existing = [cls["class"] for cls in schema.get("classes", [])]
    print(existing)  # debug: show which classes already exist
    if "PdfSentence" not in existing:
        create_schema()
# PDF ํ…์ŠคํŠธ ์ถ”์ถœ ํ•จ์ˆ˜
def extract_text_from_pdf(pdf):
    """Extract the plain text of every page of *pdf*.

    Parameters:
        pdf: a file path or binary file object accepted by PyPDF2.PdfReader.

    Returns:
        str: the concatenated text of all pages. Pages for which PyPDF2
        cannot extract text (extract_text() returns None) are skipped
        instead of raising a TypeError on concatenation.
    """
    pdf_reader = PyPDF2.PdfReader(pdf)
    # "".join avoids the quadratic cost of repeated += string concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
# ํ…์ŠคํŠธ ์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ ํ•จ์ˆ˜
def create_embeddings(text, model):
    """Encode *text* with the given sentence-embedding model.

    Returns the embedding as a plain Python list of float64 values so it
    can be serialized into Weaviate's number[] property.
    """
    vector = model.encode(text)
    return np.asarray(vector, dtype=np.float64).tolist()
# Weaviate์— ๋ฐ์ดํ„ฐ ์ €์žฅ
def store_vectors_in_weaviate(sentences, embed_model):
    """Embed each sentence and batch-insert it into the "PdfSentence" class.

    Parameters:
        sentences: iterable of sentence strings to store.
        embed_model: model with an encode() method (SentenceTransformer).

    Ingestion is best-effort: a failure on one sentence is printed and the
    loop continues with the rest.
    """
    with db_client.batch as batch:
        for sentence in sentences:
            # text.split('. ') can yield empty/whitespace fragments; skip them.
            if not sentence.strip():
                continue
            try:
                embedding = create_embeddings(sentence, embed_model)
                data_object = {
                    "sentence": sentence,
                    "embedding": embedding
                }
                # Pass the embedding as the object's *vector* as well:
                # near_vector queries compare against the stored vector,
                # not against the "embedding" data property, so without
                # this the similarity search has nothing to match on.
                batch.add_data_object(data_object, "PdfSentence", vector=embedding)
                print("success")
            except Exception as e:
                # Best-effort: log and keep going with the remaining sentences.
                print(e)
# ์งˆ๋ฌธ์— ๊ฐ€์žฅ ์œ ์‚ฌํ•œ ๋ฌธ์žฅ ์ฐพ๊ธฐ
def find_similar_sentence_in_weaviate(question_embedding):
    """Query Weaviate for "PdfSentence" objects near *question_embedding*.

    Returns the raw GraphQL response dict from the client; the caller is
    responsible for digging the closest sentence(s) out of it.
    """
    query = db_client.query.get("PdfSentence", ["sentence", "embedding"])
    response = query.with_near_vector({"vector": question_embedding}).do()
    return response
def generate_answer(pdf, question):
    """Answer *question* using the content of *pdf* (simple RAG pipeline).

    Steps: extract the PDF text, split it into sentences, store the
    sentence embeddings in Weaviate, retrieve the context most similar to
    the question, then ask an OpenAI chat model to answer from that context.

    Parameters:
        pdf: file path (or object) accepted by extract_text_from_pdf.
        question: the user's question as a string.

    Returns:
        str: the model's answer text.
    """
    global embed_model
    # Extract PDF text
    text = extract_text_from_pdf(pdf)
    # Split the text into sentences (naive split on ". ")
    sentences = text.split('. ')
    # Store the sentences in Weaviate
    store_vectors_in_weaviate(sentences, embed_model)
    # Create an embedding for the question
    question_embedding = create_embeddings(question, embed_model)
    # Find the most similar sentence in Weaviate
    most_similar_sentence = find_similar_sentence_in_weaviate(question_embedding)
    print("debug03")
    print(most_similar_sentence)
    # SECURITY: never commit API keys to source control. Read from the
    # environment; the hard-coded fallback only preserves the original
    # behavior — the leaked key should be revoked and the fallback removed.
    api_key = os.environ.get(
        "OPENAI_API_KEY",
        'sk-TWonV6ldIlpQzTtp5WDW3IiE1mJtQ5eP2p3arsIkDQT3BlbkFJ87T5N5D4WQFHo-QitD7sFOBL6360GxdKTNYpuPbV8A',
    )
    # Generate the response with the OpenAI API
    ai_client = OpenAI(api_key=api_key)
    response = ai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": f"You are a helpful assistant. Answer based on context: {most_similar_sentence}"},
            {"role": "user", "content": question}
        ]
    )
    return response.choices[0].message.content
# Gradio ์ธํ„ฐํŽ˜์ด์Šค ๊ตฌ์„ฑ
def interface(pdf, question):
    """Gradio callback: delegate straight to the RAG pipeline."""
    answer = generate_answer(pdf, question)
    return answer
if __name__=="__main__":
    # Load the multilingual sentence-embedding model once at start-up.
    embed_model = SentenceTransformer('xlm-r-100langs-bert-base-nli-stsb-mean-tokens')
    # SECURITY: credentials belong in the environment, not in source control.
    # The hard-coded fallback preserves the original behavior, but the leaked
    # key should be rotated and the fallback removed.
    weaviate_api_key = os.environ.get(
        "WEAVIATE_API_KEY", "SPmVOW99EWg8LkstmLlsKUSuSiHfoefcLQwS"
    )
    # Configure the Weaviate client
    db_client = weaviate.Client(
        url="https://ildmdarvrfcox58ff2tipw.c0.us-west3.gcp.weaviate.cloud",  # cluster URL
        auth_client_secret=weaviate.AuthApiKey(api_key=weaviate_api_key),
    )
    ensure_schema()
    # Build the Gradio UI
    with gr.Blocks() as demo:
        pdf_input = gr.File(label="Upload PDF", type="filepath")
        question_input = gr.Textbox(label="Ask a question", placeholder="What do you want to know?")
        output = gr.Textbox(label="Answer")
        submit_btn = gr.Button("Submit")
        submit_btn.click(fn=interface, inputs=[pdf_input, question_input], outputs=output)
    # Run the app
    demo.launch()