File size: 1,490 Bytes
3caa485
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import re

import pandas as pd
from dotenv import load_dotenv
from llama_index.core import SimpleDirectoryReader
from llama_parse import LlamaParse

# Load variables from a local .env file into the process environment at import time.
# NOTE(review): presumably this is where the LlamaParse API key comes from — confirm.
load_dotenv()
# Minimum paragraph length, in characters; extract_paragraphs drops anything shorter.
MIN_PARAGRAPH_LENGTH = 50


def extract_paragraphs(markdown_text, min_length=None):
    """
    Extract paragraphs from a markdown text.

    The text is split on blank lines. A line that contains only whitespace
    (spaces, tabs) counts as blank, and CRLF line endings are accepted, so
    "\\n \\n" and "\\r\\n\\r\\n" separate paragraphs just like "\\n\\n" does.

    Args:
        markdown_text: Markdown source to split.
        min_length: Minimum number of characters a paragraph must have to be
            kept. Defaults to the module-level MIN_PARAGRAPH_LENGTH.

    Returns:
        The kept paragraphs, stripped of surrounding whitespace, in document
        order. Chunks that are empty, shorter than ``min_length``, or that
        start with "#" are dropped.
    """
    if min_length is None:
        min_length = MIN_PARAGRAPH_LENGTH

    # "\s*" between the two newlines swallows whitespace-only lines, "\r" from
    # CRLF endings, and any run of additional blank lines in one separator.
    stripped_chunks = (chunk.strip() for chunk in re.split(r"\n\s*\n", markdown_text))

    # A chunk starting with "#" is treated as a heading and skipped. Note that
    # body text placed directly under a heading (no blank line in between)
    # belongs to the same chunk and is skipped along with it.
    paragraphs = [
        p
        for p in stripped_chunks
        if p and len(p) >= min_length and not p.startswith("#")
    ]
    print(f"created {len(paragraphs)} paragraphs\n", paragraphs)

    return paragraphs


def extract_endpoint(file_paths):
    """
    Load the given files and break each resulting document into paragraph chunks.

    LlamaParse (markdown output) is registered as the extractor for ".pdf"
    files; loading itself is delegated to SimpleDirectoryReader.

    Returns a two-element list: the per-document records (dicts with the
    keys "paper" and "chunks") and a pandas DataFrame built from those records.
    """
    # result_type may be "markdown" or "text"; markdown is used here
    pdf_parser = LlamaParse(result_type="markdown")

    reader = SimpleDirectoryReader(
        input_files=file_paths,
        file_extractor={".pdf": pdf_parser},
    )
    loaded_docs = reader.load_data()

    extracted_data = []
    for document in loaded_docs:
        # Preview the start of the parsed text
        print(document.text[:500])
        chunks = extract_paragraphs(document.text)
        record = {"paper": document.metadata["file_name"], "chunks": chunks}
        extracted_data.append(record)

    return [extracted_data, pd.DataFrame(extracted_data)]