Spaces:
Running
Running
File size: 2,779 Bytes
b4bdfee afc2218 b4bdfee |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
"""
Indexing with vector database
"""
from pathlib import Path
import re
import chromadb
from unidecode import unidecode
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
# Load PDF document and create doc splits
def load_doc(list_file_path, chunk_size, chunk_overlap):
    """Read the given PDF files and cut their pages into chunks.

    Args:
        list_file_path: Iterable of PDF file paths, each handed to
            ``PyPDFLoader``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        The result of ``split_documents`` applied to every loaded page.
    """
    # Build every loader up front so all of them are constructed before
    # any file is actually read.
    pdf_loaders = [PyPDFLoader(pdf_path) for pdf_path in list_file_path]
    # Flatten the per-file page lists into one list, keeping file order.
    all_pages = [
        page for pdf_loader in pdf_loaders for page in pdf_loader.load()
    ]
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.split_documents(all_pages)
# Generate collection name for vector database
# - Use the file path as input and transliterate its Unicode text to ASCII
# - Handle file names in multiple languages (e.g. Arabic, Chinese)
def create_collection_name(filepath):
    """Derive a sanitised collection name from a file path.

    The file name without its extension is transliterated to ASCII, reduced
    to letters, digits and hyphens, cut to at most 50 characters, padded to
    at least 3 characters, and forced to begin and end with an alphanumeric
    character. The input path and the resulting name are printed.

    Args:
        filepath: Path of the source file; only its stem is used.

    Returns:
        The sanitised collection name as a string.
    """
    stem = Path(filepath).stem

    # Spaces are turned into hyphens first, then any non-ASCII text is
    # transliterated to its closest ASCII form.
    ascii_stem = unidecode(stem.replace(" ", "-"))

    # Each run of characters outside A-Z, a-z, 0-9 collapses to a single
    # hyphen; the result is capped at 50 characters.
    name = re.sub("[^A-Za-z0-9]+", "-", ascii_stem)[:50]

    # Pad short names so that at least 3 characters are always present.
    if len(name) < 3:
        name += "xyz"

    # The name now has >= 3 characters, so the first and last positions are
    # distinct and can be fixed independently.
    head = name[0] if name[0].isalnum() else "A"
    tail = name[-1] if name[-1].isalnum() else "Z"
    name = head + name[1:-1] + tail

    print("\n\nFilepath: ", filepath)
    print("Collection name: ", name)
    return name
# Create vector database
def create_db(
    splits,
    collection_name,
    embedding_model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
):
    """Create embeddings for the document chunks and build a Chroma store.

    Args:
        splits: Document chunks to index, passed to
            ``Chroma.from_documents`` as ``documents``.
        collection_name: Name of the Chroma collection to create.
        embedding_model: Hugging Face model id passed to
            ``HuggingFaceEmbeddings`` as ``model_name``. Defaults to the
            multilingual model this function has always used, so existing
            two-argument calls behave exactly as before.

    Returns:
        The ``Chroma`` vector store returned by ``Chroma.from_documents``,
        backed by a freshly created ``chromadb.EphemeralClient``.
    """
    embedding = HuggingFaceEmbeddings(model_name=embedding_model)

    # NOTE(review): clearing the shared system cache presumably prevents a
    # previously created client from being reused across calls - confirm
    # against the chromadb version in use.
    chromadb.api.client.SharedSystemClient.clear_system_cache()
    new_client = chromadb.EphemeralClient()

    return Chroma.from_documents(
        documents=splits,
        embedding=embedding,
        client=new_client,
        collection_name=collection_name,
    )
|