Spaces:

schellrw
/

il-legal

Sleeping

schellrw committed on Aug 12

Commit

f5c9f39

•

1 Parent(s): 3ad9096

Create utils/process.py

Files changed (1) hide show

utils/process.py ADDED Viewed

+import pymupdf ## fitz # PyMuPDF
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+def extract_text_from_pdf(pdf_file):
+ ## doc = pymupdf.open(pdf_file)
+ # with pymupdf.open(pdf_file) as doc:
+ # with fitz.open(pdf_file) as doc:
+ text = ""
+ with pymupdf.open(stream=pdf_file.read(), filetype="pdf") as doc:
+ for page in doc:
+ text += page.get_text()
+ return text
+MARKDOWN_SEPARATORS = [
+ "\n#{1,6} ",
+ "```\n",
+ "\n\\*\\*\\*+\n",
+ "\n---+\n",
+ "\n___+\n",
+ "\n\n",
+ "\n",
+ " ",
+ "",
+]
+def chunk_text(text, chunk_size=1000, chunk_overlap=100):
+ text_splitter = RecursiveCharacterTextSplitter(
+ chunk_size=chunk_size,
+ chunk_overlap=chunk_overlap,
+ add_start_index=True,
+ strip_whitespace=True,
+ separators=MARKDOWN_SEPARATORS
+ )
+ return text_splitter.split_text(text)