################
# Load the source document, split it into chunks, vectorize the chunks, then query them
################
import tiktoken
tokenizer = tiktoken.get_encoding('cl100k_base')
# Length function for the splitter: measure chunk size in tokens, not characters
def tiktoken_len(text):
    tokens = tokenizer.encode(text)
    return len(tokens)
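
# Illustrative check (not part of the original script): confirms that chunk sizes
# below are measured in tokens rather than characters, e.g.
#   tiktoken_len("Retrieval-Augmented Generation")  # returns a small token count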
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import filter_complex_metadata
## Load the Markdown file and split it
# https://python.langchain.com/v0.2/docs/how_to/document_loader_markdown/
# Load the Markdown document and split it into elements
loader = UnstructuredMarkdownLoader('Document/Knowledge.md', mode="elements")
pages = loader.load_and_split()
# Split the text into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=80, length_function=tiktoken_len)
sourceDocs = text_splitter.split_documents(pages)
sourceDocs = filter_complex_metadata(sourceDocs)
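
# Optional inspection sketch (not part of the original script): report how many chunks
# were produced and their token lengths, to confirm they stay within the 500-token budget.
# print(len(sourceDocs))
# print([tiktoken_len(doc.page_content) for doc in sourceDocs[:5]])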
################
# Vectorize the documents with a HuggingFace embedding model, then run similarity search
################
from langchain_community.vectorstores import Chroma
# Korean sentence-embedding model; normalized embeddings so relevance scores behave like cosine similarity
model_huggingface = HuggingFaceEmbeddings(model_name='jhgan/ko-sroberta-multitask',
                                          model_kwargs={'device': 'cpu'},
                                          encode_kwargs={'normalize_embeddings': True})
## Build a Chroma vector store from the document chunks
db = Chroma.from_documents(sourceDocs, model_huggingface)
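# Hedged alternative (assumed path, not part of the original script): persist the index
# to disk so it can be reloaded later without re-embedding.
# db = Chroma.from_documents(sourceDocs, model_huggingface, persist_directory='./chroma_db')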
## Querying
def SearchDocs(question, k=4):
    # Retrieve the k most relevant chunks and merge their text into a single string
    results = db.similarity_search_with_relevance_scores(question, k=k)
    merged = ''
    for result in results:
        merged += '\n\n' + result[0].page_content
    return merged
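
# Hedged variant (illustrative, not part of the original script): keep the relevance
# score returned by similarity_search_with_relevance_scores alongside each chunk.
def SearchDocsWithScores(question, k=4):
    results = db.similarity_search_with_relevance_scores(question, k=k)
    return [(doc.page_content, score) for doc, score in results]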
# # Query test
# question = "자연어 처리란 무엇인가요?"  # "What is natural language processing?"
# print(SearchDocs(question, k=1))