prashant committed on
Commit
1e18f9c
1 Parent(s): 593bc97

adding haystack converter and preprocessor

Browse files
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
- farm-haystack
2
- farm-haystack[ocr]
3
  spacy==3.2.0
4
  https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz#egg=en_core_web_sm
5
  keybert==0.5.1
 
1
+ farm-haystack == 1.10
2
+ farm-haystack[ocr]==1.10.0
3
  spacy==3.2.0
4
  https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz#egg=en_core_web_sm
5
  keybert==0.5.1
udfPreprocess/paramconfig.cfg CHANGED
@@ -10,3 +10,7 @@ THRESHOLD = 0.1
10
 
11
  [sdg]
12
  THRESHOLD = 0.85
 
 
 
 
 
10
 
11
  [sdg]
12
  THRESHOLD = 0.85
13
+
14
+ [preprocessor]
15
+ SPLIT_OVERLAP_WORD = 20
16
+ SPLIT_OVERLAP_SENTENCE = 1
udfPreprocess/preprocessing.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import configparser
import logging
import re
import string
from typing import Callable, Dict, List, Optional, Text, Tuple, Union

import pandas as pd
from haystack.nodes import PDFToTextOCRConverter, PDFToTextConverter
from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
from haystack.nodes.base import BaseComponent
from haystack.schema import Document
from typing_extensions import Literal
12
# Load the shared parameter file once at import time. The path is relative,
# so it is resolved against the current working directory of the process.
config = configparser.ConfigParser()
# Use a context manager so the file handle is closed as soon as it is parsed
# (a bare open() passed to read_file() is never closed explicitly).
with open('udfPreprocess/paramconfig.cfg') as config_file:
    config.read_file(config_file)
top_k = int(config.get('lexical_search','TOP_K'))
15
+
16
def useOCR(file_path: str)-> Text:
    """
    Run OCR over an image-based PDF and return the recognised text, using
    the farm-haystack[ocr] converter.

    Params
    ----------
    file_path: path of the uploaded file, as returned by the add_upload
    function in uploadAndExample.py

    Returns the content of the first converted document as a string.
    """
    ocr_converter = PDFToTextOCRConverter(
        valid_languages=["eng"],
        remove_numeric_tables=True,
    )
    converted = ocr_converter.convert(meta=None, file_path=file_path)
    first_document = converted[0]
    return first_document.content
34
+
35
+
36
+
37
+
38
class FileConverter(BaseComponent):
    """
    Wrapper class to convert an uploaded document into text by calling the
    appropriate converter class. Internally falls back to haystack's
    PDFToTextOCRConverter for image PDFs. The FileClassifier from haystack
    cannot be used as it does not have any label/output class for images.

    1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
    2. https://docs.haystack.deepset.ai/docs/file_converters
    3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
    4. https://docs.haystack.deepset.ai/reference/file-converters-api
    """

    outgoing_edges = 1

    def run(self, file_name: str, file_path: str, encoding: Optional[str]=None,
            id_hash_keys: Optional[List[str]] = None,
            ) -> Optional[Tuple[dict, str]]:
        """ this is the required method to invoke the component in
        the pipeline implementation.

        Params
        ----------
        file_name: name of file; its extension (.pdf, .txt or .docx, matched
        case-insensitively) selects the converter
        file_path: file_path of the uploaded file, returned by the add_upload
        function in uploadAndExample.py
        encoding: passed through to the converter's convert() call
        id_hash_keys: passed through to convert() and to the Document

        See the links provided in the class docstring to see other params

        Return
        ---------
        output: dictionary, with key as identifier and value could be anything
        we need to return. In this case it is the list of haystack Documents
        under the key 'documents'.

        output_1: As there is only one outgoing edge, we pass 'output_1' string

        None is returned if the converter could not be constructed (the error
        is logged).

        Raises
        ---------
        ValueError: if the file extension is not one of .pdf, .txt, .docx
        """
        lowered_name = file_name.lower()
        is_pdf = lowered_name.endswith('.pdf')

        if is_pdf:
            converter_class = PDFToTextConverter
        elif lowered_name.endswith('.txt'):
            converter_class = TextConverter
        elif lowered_name.endswith('.docx'):
            converter_class = DocxToTextConverter
        else:
            # Without this branch the code below would fail later with an
            # opaque UnboundLocalError on the converter variable.
            raise ValueError(
                "Unsupported file type for '{}': expected .pdf, .txt or .docx"
                .format(file_name))

        try:
            converter = converter_class(remove_numeric_tables=True)
        except Exception as e:
            logging.error(e)
            return

        document = converter.convert(
            file_path=file_path, meta=None,
            encoding=encoding, id_hash_keys=id_hash_keys
        )[0]

        # if file is image pdf then it will have {'content': "\x0c\x0c\x0c\x0c"}
        # substitute this substring with '' and check if content is empty string
        text = document.content.replace('\x0c', '')

        # Only PDFs can be handed to the OCR converter; an empty txt/docx
        # file simply stays empty.
        if text == "" and is_pdf:
            logging.info("Using OCR")
            text = useOCR(file_path)

        # The Document is built after the OCR fallback so that it is created
        # from the final text rather than patched afterwards
        # (NOTE(review): presumably any content-derived id then reflects the
        # real text - confirm against the haystack Document schema).
        documents = [Document(content=text,
                              meta={"name": file_name},
                              id_hash_keys=id_hash_keys)]

        logging.info('file conversion succesful')
        output = {'documents': documents}
        return output, 'output_1'

    def run_batch(self, *args, **kwargs):
        """
        we dont have requirement to process the multiple files in one go
        therefore nothing here, however to use the custom node we need to have
        this method for the class.

        Accepts (and ignores) any arguments so that it can be called on an
        instance without raising a TypeError.
        """

        return
125
+
126
+
127
def basic(s, removePunc:bool = False):
    """
    Apply light, regex-based cleaning to a piece of text.

    Params
    ----------
    s: string to be processed
    removePunc: to remove all Punctuation including ',' and '.' or not

    Returns: processed string with URLs removed, newlines replaced by
    spaces, single quotes replaced by spaces, runs of two or more dots
    dropped and leading/trailing whitespace stripped.
    """

    # Remove URLs: first whole lines that start with a URL, then any URL
    # that remains inline.
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    s = re.sub(r"http\S+", " ", s)

    # Remove new line characters
    s = s.replace('\n', ' ')

    # Remove punctuations
    if removePunc:
        s = s.translate(str.maketrans('', '', string.punctuation))

    # Remove distracting single quotes and dotted patterns. The dots must be
    # escaped: an unescaped ".." is a regex matching ANY two characters and
    # would erase almost the entire string.
    s = s.replace("'", " ")
    s = re.sub(r"\.{2,}", "", s)

    return s.strip()
154
+
155
+
156
class UdfPreProcessor(BaseComponent):
    """
    class to preprocess the documents returned by FileConverter. It will check
    for the splitting strategy, split the document by word or sentence and
    then synthetically create the paragraphs.

    1. https://docs.haystack.deepset.ai/docs/preprocessor
    2. https://docs.haystack.deepset.ai/reference/preprocessor-api
    3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor

    """
    outgoing_edges = 1
    # configparser's plain get() returns strings; these values are handed to
    # PreProcessor(split_overlap=...) as counts, so parse them as integers.
    split_overlap_word = config.getint('preprocessor','SPLIT_OVERLAP_WORD')
    split_overlap_sentence = config.getint('preprocessor','SPLIT_OVERLAP_SENTENCE')

    def run(self, documents:List[Document], removePunc:bool,
            split_by: Literal["sentence", "word"] = 'sentence',
            split_length:int = 2):

        """ this is the required method to invoke the component in
        the pipeline implementation.

        Params
        ----------
        documents: documents from the output dictionary returned by FileConverter
        removePunc: to remove all Punctuation including ',' and '.' or not
        split_by: document splitting strategy either as word or sentence
        split_length: when synthetically creating the paragraphs from document,
        it defines the length of paragraph.

        Return
        ---------
        output: dictionary, with key as identifier and value could be anything
        we need to return. In this case the output contains 4 objects:
        'documents' (the processed haystack Documents), 'dataframe' (a pandas
        DataFrame built from those Documents), 'text' (all paragraph texts
        joined with single spaces) and 'paraList' (the paragraph texts as a
        list).

        output_1: As there is only one outgoing edge, we pass 'output_1' string

        """

        # Sentence splitting uses the sentence overlap; any other strategy
        # uses the word overlap and keeps sentence boundaries intact.
        if split_by == 'sentence':
            split_respect_sentence_boundary = False
            split_overlap = self.split_overlap_sentence
        else:
            split_respect_sentence_boundary = True
            split_overlap = self.split_overlap_word

        preprocessor = PreProcessor(
            clean_empty_lines=True,
            clean_whitespace=True,
            clean_header_footer=True,
            split_by=split_by,
            split_length=split_length,
            split_respect_sentence_boundary=split_respect_sentence_boundary,
            split_overlap=split_overlap,
            add_page_number=True
        )

        # Accumulate the splits of every input document; rebinding the result
        # on each iteration would keep only the last document's paragraphs
        # and leave the name undefined for an empty input list.
        docs_processed = []
        for document in documents:
            docs_processed.extend(preprocessor.process([document]))

        for item in docs_processed:
            item.content = basic(item.content, removePunc=removePunc)

        df = pd.DataFrame(docs_processed)
        # Read the texts from the documents rather than from df.content, which
        # does not exist when there are no processed documents.
        para_list = [item.content for item in docs_processed]
        all_text = " ".join(para_list)

        output = {'documents': docs_processed,
                  'dataframe': df,
                  'text': all_text,
                  'paraList': para_list
                  }
        return output, "output_1"

    def run_batch(self, *args, **kwargs):
        """
        we dont have requirement to process the multiple files in one go
        therefore nothing here, however to use the custom node we need to have
        this method for the class.

        Accepts (and ignores) any arguments so that it can be called on an
        instance without raising a TypeError.
        """
        return