not-lain committed on
Commit
f17ebdc
·
1 Parent(s): d99955f

update parameter

Browse files
Files changed (1) hide show
  1. app.py +6 -27
app.py CHANGED
@@ -269,41 +269,33 @@ def extract_text_from_pdf(file):
269
  return text
270
 
271
 
272
- def extract_text_from_docx(file):
273
  text = ""
274
- doc = Document(file.name)
275
  for paragraph in doc.paragraphs:
276
  text += paragraph.text + "\n"
277
  return text
278
 
279
 
280
- def convert_doc_to_text(doc_path):
281
  try:
282
  subprocess.run(
283
- ["unoconv", "--format", "txt", doc_path],
284
  capture_output=True,
285
  text=True,
286
  check=True,
287
  )
288
- txt_file_path = doc_path.replace(".doc", ".txt")
289
  with open(txt_file_path, "r") as f:
290
  text = f.read()
291
  text = text.lstrip("\ufeff")
292
  os.remove(txt_file_path)
293
  return text
294
  except subprocess.CalledProcessError as e:
295
- print(f"Error converting {doc_path} to text: {e}")
296
  return ""
297
 
298
 
299
- def extract_text_from_doc_or_docx(file):
300
- if file.name.endswith(".docx"):
301
- return extract_text_from_docx(file)
302
- elif file.name.endswith(".doc"):
303
- return convert_doc_to_text(file.name)
304
- else:
305
- return "Unsupported file type. Please upload a .doc or .docx file."
306
-
307
 
308
  # function that generates a random string
309
  def generate_random_string(length=23):
@@ -405,12 +397,6 @@ pdf_to_text = gr.Interface(
405
  api_name="pdf_to_text",
406
  )
407
 
408
- # doc_or_docx_to_text = gr.Interface(
409
- # extract_text_from_doc_or_docx,
410
- # gr.File(),
411
- # gr.Textbox(placeholder="Extracted text from DOC or DOCX will appear here"),
412
- # api_name="doc_or_docx_to_text",
413
- # )
414
  doc_to_text = gr.Interface(
415
  convert_doc_to_text,
416
  gr.File(),
@@ -424,13 +410,6 @@ docx_to_text = gr.Interface(
424
  api_name="docx_to_text"
425
  )
426
 
427
- # pptx_or_ppt_to_text = gr.Interface(
428
- # extract_text_from_ppt_or_pptx,
429
- # gr.File(),
430
- # gr.Textbox(placeholder="Extracted text from PPTX will appear here"),
431
- # api_name="pptx_or_ppt_to_text",
432
- # )
433
-
434
  ppt_to_text = gr.Interface(
435
  extract_text_from_ppt,
436
  gr.File(),
 
269
  return text
270
 
271
 
272
def extract_text_from_docx(file_path):
    """Return the text of a .docx document, one paragraph per line.

    Args:
        file_path: Location of the document, given either as a path
            (``str`` or ``os.PathLike``) or as an uploaded-file object
            that exposes its path through a ``.name`` attribute.

    Returns:
        The text of every paragraph, each followed by a newline, joined in
        document order. An empty string when the document has no paragraphs.
    """
    # The parameter is named like a path but was only usable as an upload
    # object (``file_path.name``); accept both so either caller style works.
    # Path-likes are checked first because ``pathlib.Path.name`` is only the
    # basename, not the full path.
    if isinstance(file_path, (str, os.PathLike)):
        source = os.fspath(file_path)
    else:
        source = file_path.name

    doc = Document(source)
    # Single join instead of repeated ``+=`` on a growing string.
    return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
278
 
279
 
280
def convert_doc_to_text(file_path):
    """Convert a document to plain text by running the ``unoconv`` command.

    The converter is invoked with ``--format txt``; this function then reads
    the ``.txt`` file expected next to the input (same name, extension
    swapped) and deletes it afterwards.

    Args:
        file_path: Location of the document, given either as a path
            (``str`` or ``os.PathLike``) or as an uploaded-file object
            that exposes its path through a ``.name`` attribute.

    Returns:
        The converted text with any leading byte-order mark removed, or an
        empty string when the conversion fails (the error is printed).
    """
    # Path-likes are checked first because ``pathlib.Path.name`` is only the
    # basename, not the full path.
    if isinstance(file_path, (str, os.PathLike)):
        source = os.fspath(file_path)
    else:
        source = file_path.name

    # Swap only the final extension. ``str.replace(".doc", ".txt")`` rewrote
    # every ".doc" in the path, turning "a.docx" into "a.txtx" and mangling
    # directory names that happen to contain ".doc".
    txt_file_path = os.path.splitext(source)[0] + ".txt"

    try:
        subprocess.run(
            ["unoconv", "--format", "txt", source],
            capture_output=True,
            text=True,
            check=True,
        )
        try:
            with open(txt_file_path, "r") as f:
                text = f.read()
        finally:
            # Remove the intermediate file even when reading it failed.
            if os.path.exists(txt_file_path):
                os.remove(txt_file_path)
        return text.lstrip("\ufeff")
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing ``unoconv`` executable and a missing or
        # unreadable output file, so those follow the same failure path as a
        # non-zero exit status instead of raising.
        print(f"Error converting {source} to text: {e}")
        return ""
297
 
298
 
 
 
 
 
 
 
 
 
299
 
300
  # function that generates a random string
301
  def generate_random_string(length=23):
 
397
  api_name="pdf_to_text",
398
  )
399
 
 
 
 
 
 
 
400
  doc_to_text = gr.Interface(
401
  convert_doc_to_text,
402
  gr.File(),
 
410
  api_name="docx_to_text"
411
  )
412
 
 
 
 
 
 
 
 
413
  ppt_to_text = gr.Interface(
414
  extract_text_from_ppt,
415
  gr.File(),