Yiming Qian
committed on
Upload pdf_parser.py
Browse files- pdf_parser.py +117 -0
pdf_parser.py
ADDED
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# %%
|
2 |
+
import pickle
from pathlib import Path

import pymupdf
import torch
from bs4 import BeautifulSoup
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
7 |
+
|
8 |
+
|
9 |
+
# Keyword arguments forwarded to AutoModelForCausalLM.from_pretrained below.
# NOTE(review): use_cache=False looks inherited from a training setup; it may
# disable the KV cache during generation and slow inference -- confirm intent.
# NOTE(review): trust_remote_code=True allows custom code shipped with the
# checkpoint to run -- only use with a checkpoint you trust.
model_kwargs = dict(
    use_cache=False,
    trust_remote_code=True,
    attn_implementation="flash_attention_2",  # load the model with flash-attention support
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    load_in_4bit=True,
)

model = AutoModelForCausalLM.from_pretrained("./model_4bit", **model_kwargs)

tokenizer = AutoTokenizer.from_pretrained("./model_4bit")
tokenizer.model_max_length = 8000
# The EOS token is reused as the padding token (the code assigns eos_token,
# not unk_token).
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'right'
|
25 |
+
|
26 |
+
|
27 |
+
# System prompt sent with every chunk: instructs the model to turn the XML
# extracted from the PDF into Markdown. The text is consumed at runtime, so it
# is kept verbatim.
SYSTEM = '''The user's input is data in XML format. Please organize it into a markdown format. Pay attention to:

1. Directly output the results. Do not make summary of the text.
2. Do not alter any text from the XML. Do not change number into words.
3. Correct format errors, such as misalignment between numbers and text, and disorder in the sequence of table cells.
4. Use markdown, but all numbers must be explicitly written out in full (e.g., 3.2.5.1).
5. Preserve the original document structure as much as possible, such as paragraphs, lists, etc.
6. Pay attention to detecting tables in the text (as the table format may have been lost due to copying from the XML). Restore the table's format and maintain its integrity. Some tables may be too long and span across pages. Pay attention to merging the same tables that span pages. Properly handle table headers to avoid repetition or omission.
7. Text from the XML may contain some garbled characters; remove any characters that are garbled.
8. Convert headings (H1, H2, H3, etc.) into their respective Markdown heading levels (e.g., 3 for # 3, 3.2 for ## 3.2, 3.2.1 for ### 3.2.1).
9. Include metadata information in the output, such as document title, section number, etc.
10. Remove the footnote and page number, it is important!!!
11. Make sure phrase connected with - will not break up.
'''
|
41 |
+
|
42 |
+
def merge_elements_up_to_max_length(elements, max_length):
    """
    Greedily concatenate consecutive elements into chunks of at most max_length.

    Elements are appended to the current chunk, in order, for as long as the
    combined length stays within max_length; otherwise a new chunk is started.
    An element is never split, so a single element that is already longer than
    max_length is returned as its own (oversized) chunk.

    Parameters:
    - elements: Iterable[str] - The string elements to merge, in order.
    - max_length: int - The maximum length (in characters) of a merged chunk.

    Returns:
    - List[str]: A new list of merged chunks; empty if `elements` is empty.
    """
    if not elements:
        return []

    merged = []
    parts = []  # pieces of the chunk currently being built
    size = 0    # total length of the pieces in `parts`

    for element in elements:
        # Close the current chunk when adding this element would overflow it.
        # The `parts` check keeps the very first element from emitting an
        # empty chunk.
        if parts and size + len(element) > max_length:
            merged.append("".join(parts))
            parts = []
            size = 0
        parts.append(element)
        size += len(element)

    if parts:
        merged.append("".join(parts))

    return merged
|
67 |
+
|
68 |
+
|
69 |
+
# Text-generation pipeline wrapping the module-level model and tokenizer.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Keyword arguments passed on every pipe(...) call.
generation_args = {
    "max_new_tokens": 2000,
    "return_full_text": False,
    "do_sample": False,
}
|
80 |
+
|
81 |
+
# %%
|
82 |
+
filename = '2023071000529.pdf'

# Build one XHTML fragment per page, each followed by an "<hr>" separator.
elements = []
with pymupdf.open(filename) as doc:
    for page in doc:
        soup = BeautifulSoup(page.get_text('xhtml'), 'html.parser')
        # Drop <img> tags so only text markup remains.
        for img in soup("img"):
            img.decompose()

        # Keep only <p> elements whose text is at least two characters long.
        paragraphs = []
        for item in soup.find_all('p'):
            if len(item.get_text()) < 2:
                item.decompose()
                continue
            paragraphs.append(str(item))
        elements.append(''.join(paragraphs))
        elements.append("<hr>")

# Chunk budget, measured in characters (len() of the XHTML string).
max_length = 7000

merged_elements = merge_elements_up_to_max_length(elements, max_length)

# Convert each chunk independently and concatenate the results in order.
markdown_parts = []
for item in merged_elements:
    messages = [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": item},
    ]
    output = pipe(messages, **generation_args)
    markdown_parts.append(output[0]['generated_text'] + '\n')
markdown_text = ''.join(markdown_parts)

# Same name as the PDF with a .md suffix; kept as str like the original.
main_file = str(Path(filename).with_suffix('.md'))
# Explicit UTF-8 so non-ASCII output does not depend on the locale encoding.
with open(main_file, "w", encoding="utf-8") as f:
    f.write(markdown_text)
|
116 |
+
|
117 |
+
|