import weaviate  # vector DB client
from openai import OpenAI  # LLM API
import PyPDF2  # PDF -> text extraction
import numpy as np
import gradio as gr  # front-end (UI & UX)
from sentence_transformers import SentenceTransformer  # sentence embeddings

# Define the class schema used in Weaviate
def create_schema():
    class_obj = {
        "class": "PdfSentence",  # class name
        "vectorizer": "none",  # vectors are supplied manually at import time
        "properties": [
            {
                "name": "sentence",
                "dataType": ["text"]
            },
            {
                "name": "embedding",
                "dataType": ["number[]"]  # vector type
            }
        ]
    }
    # Create the schema
    db_client.schema.create_class(class_obj)

# ์Šคํ‚ค๋งˆ ํ™•์ธ ๋ฐ ์ƒ์„ฑ / ๊ธฐ์กด์— ์„ ์–ธํ•œ ์Šคํ‚ค๋งˆ๊ฐ€ ์žˆ๋Š” ๊ฒฝ์šฐ -> pass(๋„˜์–ด๊ฐ„๋‹ค.)
def ensure_schema():
    schema = db_client.schema.get()
    classes = [cls["class"] for cls in schema["classes"]]
    print(classes)
    if "PdfSentence" not in classes:
        create_schema()


# Create the schema if it does not already exist
# if not client.schema.contains({"class": "PdfSentence"}):
#     create_schema()

# Extract text from a PDF file
def extract_text_from_pdf(pdf):
    pdf_reader = PyPDF2.PdfReader(pdf)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text() or ""  # extract_text() can return None for image-only pages
    return text

# Create an embedding for a piece of text
def create_embeddings(text, model):
    result = model.encode(text)
    return result.astype(np.float64).tolist()

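# Note: a possible optimization, not part of the original flow -- SentenceTransformer.encode()
# also accepts a list of strings and returns a 2-D array, so all sentences could be embedded
# in one call instead of one call per sentence. A minimal sketch:
def create_embeddings_batch(texts, model):
    # encode() on a list returns an array of shape (len(texts), dim)
    vectors = model.encode(texts)
    return [vec.astype(np.float64).tolist() for vec in vectors]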

# Store sentences and their vectors in Weaviate
def store_vectors_in_weaviate(sentences, embed_model):
    with db_client.batch as batch:
        for sentence in sentences:
            try:
                embedding = create_embeddings(sentence, embed_model)
                # print("embedding", embedding)

                # Store the sentence (and the embedding as a property) in Weaviate
                data_object = {
                    "sentence": sentence,
                    "embedding": embedding
                }
                # Pass the vector explicitly so nearVector search can use it
                batch.add_data_object(data_object, "PdfSentence", vector=embedding)
                print("success")
            except Exception as e:
                print(e)

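# Optional tuning sketch (not used in the original flow): the v3 client's batcher can be
# configured before importing. The parameter values below are illustrative assumptions,
# not settings taken from this project.
def configure_batch(client, batch_size=100):
    # dynamic=True lets the client adjust the batch size based on observed import latency
    client.batch.configure(batch_size=batch_size, dynamic=True)
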
# Find the sentences most similar to the question
def find_similar_sentence_in_weaviate(question_embedding):
    near_vector = {
        "vector": question_embedding
    }

    result = db_client.query.get("PdfSentence", ["sentence", "embedding"]) \
        .with_near_vector(near_vector) \
        .do()
    # .with_limit(1) could be chained before .do() to return only the closest match

    return result  # the nearest sentences are returned first
    # return result['data']['Get']['PdfSentence'][0]['sentence']
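
# A small helper (sketch) for pulling just the top match out of the GraphQL response, following
# the access pattern in the commented-out return above; it assumes the standard v3 response
# shape {"data": {"Get": {"PdfSentence": [...]}}}.
def get_top_sentence(result):
    try:
        return result['data']['Get']['PdfSentence'][0]['sentence']
    except (KeyError, IndexError, TypeError):
        # no match found, or the query returned an error payload
        return ""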

def generate_answer(pdf, question):
    global embed_model
    # Extract text from the PDF
    text = extract_text_from_pdf(pdf)

    # Split the text into sentences
    sentences = text.split('. ')

    # Store the sentences in Weaviate
    store_vectors_in_weaviate(sentences, embed_model)

    # Create an embedding for the question
    question_embedding = create_embeddings(question, embed_model)

    # Find the most similar sentences in Weaviate
    most_similar_sentence = find_similar_sentence_in_weaviate(question_embedding)
    print("retrieved context:")
    print(most_similar_sentence)

    # Generate a response with the OpenAI API
    # NOTE: prefer loading the key from an environment variable rather than hard-coding it
    ai_client = OpenAI(api_key='sk-TWonV6ldIlpQzTtp5WDW3IiE1mJtQ5eP2p3arsIkDQT3BlbkFJ87T5N5D4WQFHo-QitD7sFOBL6360GxdKTNYpuPbV8A')
    response = ai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": f"You are a helpful assistant. Answer based on context: {most_similar_sentence}"},
            {"role": "user", "content": question}
        ]
    )
    result = response.choices[0].message.content

    return result

# Gradio interface
def interface(pdf, question):
    return generate_answer(pdf, question)

if __name__ == "__main__":
    # Load the embedding model
    embed_model = SentenceTransformer('xlm-r-100langs-bert-base-nli-stsb-mean-tokens')

    # Set up the Weaviate client
    db_client = weaviate.Client(
        url="https://ildmdarvrfcox58ff2tipw.c0.us-west3.gcp.weaviate.cloud",  # cluster URL
        auth_client_secret=weaviate.AuthApiKey(api_key="SPmVOW99EWg8LkstmLlsKUSuSiHfoefcLQwS"),  # API key
        # timeout_config=(5, 150)  # timeout settings (optional)
    )

    ensure_schema()

    # Build the Gradio UI
    with gr.Blocks() as demo:
        pdf_input = gr.File(label="Upload PDF", type="filepath")
        question_input = gr.Textbox(label="Ask a question", placeholder="What do you want to know?")
        output = gr.Textbox(label="Answer")

        submit_btn = gr.Button("Submit")
        submit_btn.click(fn=interface, inputs=[pdf_input, question_input], outputs=output)

    # Launch the app
    demo.launch()