schellrw committed on
Commit
f5c9f39
1 Parent(s): 3ad9096

Create utils/process.py

Browse files
Files changed (1) hide show
  1. utils/process.py +34 -0
utils/process.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pymupdf ## fitz # PyMuPDF
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+
4
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: An object whose ``read()`` method returns the raw bytes of
            a PDF document. Only ``read()`` is called on it; the object is
            not closed here.

    Returns:
        A single string made of each page's ``get_text()`` output joined
        with no separator. The string is empty when the document has no
        pages or no extractable text.
    """
    # The document is opened from an in-memory byte stream, so the file type
    # has to be given explicitly: there is no filename to infer it from.
    with pymupdf.open(stream=pdf_file.read(), filetype="pdf") as doc:
        # Join once instead of growing a string with += per page, which
        # re-copies the accumulated text on every iteration.
        return "".join(page.get_text() for page in doc)
13
+
14
# Split points for Markdown-flavoured text, listed from the strongest
# structural boundary to the weakest. Several entries use regex syntax
# (``{1,6}``, ``+``, escaped ``*``), so the whole list must be interpreted as
# regular expressions rather than literal strings.
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",  # newline followed by one to six '#' and a space (heading)
    "```\n",  # code-fence line ending
    "\n\\*\\*\\*+\n",  # a line of three or more '*'
    "\n---+\n",  # a line of three or more '-'
    "\n___+\n",  # a line of three or more '_'
    "\n\n",  # blank line
    "\n",  # single line break
    " ",  # single space
    "",  # last resort: split between any two characters
]


def chunk_text(text, chunk_size=1000, chunk_overlap=100):
    """Split ``text`` into overlapping chunks at Markdown-aware boundaries.

    Args:
        text: The string to split.
        chunk_size: Size limit handed to the splitter for each chunk.
        chunk_overlap: Amount of overlap handed to the splitter for
            consecutive chunks.

    Returns:
        The list of chunk strings produced by
        ``RecursiveCharacterTextSplitter.split_text``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # NOTE(review): kept from the original call; this option appears to
        # affect only document-producing methods, not split_text -- confirm.
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
        # The separators above are regex patterns. Without this flag the
        # splitter escapes them and looks for the literal text (for example
        # the literal characters "#{1,6}"), so headings and horizontal rules
        # would never be recognised as split points.
        is_separator_regex=True,
    )
    return text_splitter.split_text(text)