# textSummary/utils/process_data.py
# iohanngrig's picture
# Upload process_data.py
# d1ccbf9 verified
import re

from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Default chunk length. Used as the default ``chunk_size`` of
# split_text_into_chunks (a character count per slice) and as the
# ``chunk_size`` handed to CharacterTextSplitter in process_text.
CHUNK_SIZE = 1024
# Default for the ``max_chunks`` parameter of generate_chunks. Despite the
# name, that function compares it against the number of words in a chunk.
MAX_CHUNKS = 500
def split_text_into_chunks(text, chunk_size=CHUNK_SIZE):
    """
    Splits text into consecutive fixed-size character chunks.

    The text is sliced purely by character count, so word and sentence
    boundaries are not respected. Every chunk has exactly ``chunk_size``
    characters except possibly the last one, which holds the remainder.

    Args:
        text (str): Text to be split.
        chunk_size (int, optional): Maximum number of characters per chunk.
            Defaults to CHUNK_SIZE.

    Returns:
        list[str]: Chunks in their original order. Empty when ``text`` is
            empty.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    if chunk_size <= 0:
        # range() rejects a zero step with an unrelated-looking message, and a
        # negative step yields nothing at all, which would silently drop the
        # whole text. Fail loudly and consistently instead.
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]
def generate_chunks(inp_str, max_chunks=MAX_CHUNKS):
    """
    Groups whole sentences into chunks limited by word count.

    The text is split into sentences wherever ``.``, ``?`` or ``!`` is
    followed by whitespace. Sentences are then appended to the current chunk
    until adding the next one would exceed the word limit, at which point a
    new chunk is started. Sentences are never split, so a single sentence
    longer than the limit becomes one oversized chunk of its own.

    Whitespace is normalized: words in the returned chunks are separated by
    single spaces and chunks carry no leading or trailing whitespace.

    Args:
        inp_str (str): Text to be chunked.
        max_chunks (int, optional): Maximum number of words per chunk (the
            name is historical; it is not a limit on the number of chunks).
            Defaults to MAX_CHUNKS.

    Returns:
        list[str]: Chunks in their original order. Empty when ``inp_str``
            contains no words.
    """
    # Break only where sentence punctuation is followed by whitespace. Splitting
    # on every '.', '?' or '!' tears apart decimals ("3.14"), ellipses ("...")
    # and dotted names ("example.com"), and rejoining then inserts spaces into
    # them.
    sentences = re.split(r"(?<=[.?!])\s+", inp_str.strip())

    chunks = []
    for sentence in sentences:
        # split() without arguments drops empty tokens, so no stray double
        # spaces or trailing spaces end up in the joined output.
        words = sentence.split()
        if not words:
            continue
        if chunks and len(chunks[-1]) + len(words) <= max_chunks:
            chunks[-1].extend(words)
        else:
            chunks.append(words)
    return [" ".join(chunk) for chunk in chunks]
def pdf_to_text(pdf_path):
    """
    Extracts the text of every page of a PDF as one single-line string.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: The text of all pages joined with single spaces, with every
            newline character replaced by a space.
    """
    page_texts = []
    for page in PdfReader(pdf_path).pages:
        page_texts.append(page.extract_text())
    combined = " ".join(page_texts)
    # Flatten line breaks so the result is one continuous line of text.
    return combined.replace("\n", " ")
def process_text(text):
    """
    Builds a FAISS vector store (knowledge base) from raw text.

    The text is split with LangChain's CharacterTextSplitter, using a newline
    separator, ``chunk_size=CHUNK_SIZE``, ``chunk_overlap=200`` and ``len`` as
    the length function. The resulting pieces are embedded with the
    'sentence-transformers/all-MiniLM-L6-v2' model and indexed with FAISS.

    NOTE(review): the splitter only breaks on newline characters, so the
    input is presumably expected to contain them -- confirm against callers.

    Args:
        text (str): Text to be indexed.

    Returns:
        FAISS: Vector store built from the text pieces.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=CHUNK_SIZE,
        chunk_overlap=200,
        length_function=len,
    )
    pieces = splitter.split_text(text)

    # The embedding model is created only after splitting has succeeded.
    embedder = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2'
    )
    return FAISS.from_texts(pieces, embedder)