Rauhan committed on
Commit
4a38803
1 Parent(s): 0ff268d

UPDATE: ThreadPoolExecutor

Browse files
Files changed (1) hide show
  1. functions.py +4 -4
functions.py CHANGED
@@ -292,13 +292,13 @@ def getLinks(url: str, timeout = 30):
292
  return list(set([x[:len(x) - 1] if x[-1] == "/" else x for x in uniqueLinks]))
293
 
294
 
295
- def getText(image):
296
- global reader
297
- return "\n".join([text[1] for text in reader.readtext(np.array(image.resize((500, 500))), paragraph=True)])
298
 
299
  def getTextFromImagePDF(pdfBytes):
 
 
 
300
  allImages = convert_from_bytes(pdfBytes)
301
- with ThreadPoolExecutor(max_workers = 25) as p:
302
  texts = list(p.map(getText, allImages))
303
  return "\n\n\n".join(texts)
304
 
 
292
  return list(set([x[:len(x) - 1] if x[-1] == "/" else x for x in uniqueLinks]))
293
 
294
 
 
 
 
295
 
296
  def getTextFromImagePDF(pdfBytes):
297
+ def getText(image):
298
+ global reader
299
+ return "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True)])
300
  allImages = convert_from_bytes(pdfBytes)
301
+ with ThreadPoolExecutor(max_workers = 32) as p:
302
  texts = list(p.map(getText, allImages))
303
  return "\n\n\n".join(texts)
304