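"""Utilities for converting a PDF into a searchable FAISS knowledge base.

Includes two standalone chunking helpers (fixed-size character windows and
sentence-aligned word-budget chunks) and a LangChain pipeline that splits
text, embeds the chunks with a sentence-transformers model, and indexes
them in a FAISS vector store.
"""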
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS


CHUNK_SIZE = 1024  # characters per chunk for the character-based splitters
MAX_CHUNKS = 500   # word budget per chunk for the sentence-based splitter


def split_text_into_chunks(text, chunk_size=CHUNK_SIZE):
    """
    Splits text into smaller chunks.
    Args:
        text (str): Text to be split.
        chunk_size (int, optional): Size of each chunk in characters.
            Defaults to CHUNK_SIZE (1024).
    Returns:
        list[str]: List of text chunks.
    """
    chunks = []
    for i in range(0, len(text), chunk_size):
        chunks.append(text[i : i + chunk_size])
    return chunks


def generate_chunks(inp_str, max_chunks=MAX_CHUNKS):
    """
    Splits text into sentence-aligned chunks of at most `max_chunks` words.
    Args:
        inp_str (str): Text to be split.
        max_chunks (int, optional): Maximum number of words per chunk.
            Defaults to MAX_CHUNKS (500).
    Returns:
        list[str]: List of text chunks, each ending on a sentence boundary.
    """
    # Mark sentence boundaries so the text can be split into whole sentences.
    inp_str = inp_str.replace('.', '.<eos>')
    inp_str = inp_str.replace('?', '?<eos>')
    inp_str = inp_str.replace('!', '!<eos>')

    sentences = inp_str.split('<eos>')
    current_chunk = 0
    chunks = []
    for sentence in sentences:
        if not sentence.strip():
            continue  # skip the empty fragment left after a trailing terminator
        words = sentence.split(' ')
        if len(chunks) == current_chunk + 1:
            # A chunk is open: extend it while it stays within the word
            # budget, otherwise start a new chunk with this sentence.
            if len(chunks[current_chunk]) + len(words) <= max_chunks:
                chunks[current_chunk].extend(words)
            else:
                current_chunk += 1
                chunks.append(words)
        else:
            # No chunk open yet: start the first one.
            chunks.append(words)
    return [' '.join(chunk) for chunk in chunks]


def pdf_to_text(pdf_path):
    """
    Converts a PDF file to text.
    Args:
        pdf_path (str): Path to the PDF file.
    Returns:
        str: Extracted text from the PDF file.
    """
    reader = PdfReader(pdf_path)
    # Guard against pages yielding no extractable text.
    extracted_texts = [page.extract_text() or "" for page in reader.pages]
    return " ".join(extracted_texts).replace("\n", " ")


def process_text(text):
    """
    Splits text into chunks and indexes them in a FAISS vector store.
    Args:
        text (str): Text to be indexed.
    Returns:
        FAISS: Vector store built from the embedded chunks.
    """
    # Split the text into overlapping chunks using LangChain's
    # CharacterTextSplitter.
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=CHUNK_SIZE,
        chunk_overlap=200,
        length_function=len,
    )
    chunks = text_splitter.split_text(text)
    # Convert the chunks of text into embeddings to form a knowledge base.
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    knowledge_base = FAISS.from_texts(chunks, embeddings)
    return knowledge_base