update parameter
Browse files
app.py
CHANGED
@@ -269,41 +269,33 @@ def extract_text_from_pdf(file):
|
|
269 |
return text
|
270 |
|
271 |
|
272 |
-
def extract_text_from_docx(
|
273 |
text = ""
|
274 |
-
doc = Document(
|
275 |
for paragraph in doc.paragraphs:
|
276 |
text += paragraph.text + "\n"
|
277 |
return text
|
278 |
|
279 |
|
280 |
-
def convert_doc_to_text(
|
281 |
try:
|
282 |
subprocess.run(
|
283 |
-
["unoconv", "--format", "txt",
|
284 |
capture_output=True,
|
285 |
text=True,
|
286 |
check=True,
|
287 |
)
|
288 |
-
txt_file_path =
|
289 |
with open(txt_file_path, "r") as f:
|
290 |
text = f.read()
|
291 |
text = text.lstrip("\ufeff")
|
292 |
os.remove(txt_file_path)
|
293 |
return text
|
294 |
except subprocess.CalledProcessError as e:
|
295 |
-
print(f"Error converting {
|
296 |
return ""
|
297 |
|
298 |
|
299 |
-
def extract_text_from_doc_or_docx(file):
|
300 |
-
if file.name.endswith(".docx"):
|
301 |
-
return extract_text_from_docx(file)
|
302 |
-
elif file.name.endswith(".doc"):
|
303 |
-
return convert_doc_to_text(file.name)
|
304 |
-
else:
|
305 |
-
return "Unsupported file type. Please upload a .doc or .docx file."
|
306 |
-
|
307 |
|
308 |
# function that generates a random string
|
309 |
def generate_random_string(length=23):
|
@@ -405,12 +397,6 @@ pdf_to_text = gr.Interface(
|
|
405 |
api_name="pdf_to_text",
|
406 |
)
|
407 |
|
408 |
-
# doc_or_docx_to_text = gr.Interface(
|
409 |
-
# extract_text_from_doc_or_docx,
|
410 |
-
# gr.File(),
|
411 |
-
# gr.Textbox(placeholder="Extracted text from DOC or DOCX will appear here"),
|
412 |
-
# api_name="doc_or_docx_to_text",
|
413 |
-
# )
|
414 |
doc_to_text = gr.Interface(
|
415 |
convert_doc_to_text,
|
416 |
gr.File(),
|
@@ -424,13 +410,6 @@ docx_to_text = gr.Interface(
|
|
424 |
api_name="docx_to_text"
|
425 |
)
|
426 |
|
427 |
-
# pptx_or_ppt_to_text = gr.Interface(
|
428 |
-
# extract_text_from_ppt_or_pptx,
|
429 |
-
# gr.File(),
|
430 |
-
# gr.Textbox(placeholder="Extracted text from PPTX will appear here"),
|
431 |
-
# api_name="pptx_or_ppt_to_text",
|
432 |
-
# )
|
433 |
-
|
434 |
ppt_to_text = gr.Interface(
|
435 |
extract_text_from_ppt,
|
436 |
gr.File(),
|
|
|
269 |
return text
|
270 |
|
271 |
|
272 |
+
def extract_text_from_docx(file_path):
    """Return the text of a .docx document, one paragraph per line.

    Args:
        file_path: Either a string path to the document or an object
            exposing the path as a ``name`` attribute (the form this
            function originally required; presumably an uploaded-file
            wrapper -- confirm against the caller).

    Returns:
        The text of every paragraph in document order, each followed by a
        newline. An empty document yields an empty string.
    """
    # Accept a plain string as well as the original ``.name``-bearing object,
    # so the parameter name no longer contradicts what the body expects.
    source = file_path if isinstance(file_path, str) else file_path.name
    doc = Document(source)
    # A single join avoids rebuilding the string for every paragraph.
    return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
|
278 |
|
279 |
|
280 |
+
def convert_doc_to_text(file_path):
    """Convert a .doc file to plain text by running ``unoconv``.

    ``unoconv --format txt`` is run on the document, the resulting ``.txt``
    file next to it is read, and that file is then deleted.

    Args:
        file_path: A string or ``os.PathLike`` path to the document, or an
            object exposing the path as a ``name`` attribute (the form the
            sibling ``extract_text_from_docx`` expects).

    Returns:
        The converted text with any leading byte-order mark removed, or an
        empty string when the conversion fails.
    """
    if isinstance(file_path, (str, os.PathLike)):
        source_path = os.fspath(file_path)
    else:
        source_path = file_path.name

    # Swap only the final extension. A substring replace of ".doc" would also
    # rewrite directory names containing ".doc" and turn ".docx" into ".txtx".
    txt_file_path = os.path.splitext(source_path)[0] + ".txt"

    try:
        subprocess.run(
            ["unoconv", "--format", "txt", source_path],
            capture_output=True,
            text=True,
            check=True,
        )
        try:
            # NOTE(review): the BOM stripping below implies UTF-8 output from
            # unoconv; the encoding is made explicit on that assumption.
            with open(txt_file_path, "r", encoding="utf-8") as f:
                text = f.read()
        finally:
            # Remove the intermediate file even when reading it failed.
            if os.path.exists(txt_file_path):
                os.remove(txt_file_path)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing ``unoconv`` executable and a missing
        # output file; both are conversion failures like a non-zero exit.
        print(f"Error converting {source_path} to text: {e}")
        return ""

    return text.lstrip("\ufeff")
|
297 |
|
298 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
299 |
|
300 |
# function that generates a random string
|
301 |
def generate_random_string(length=23):
|
|
|
397 |
api_name="pdf_to_text",
|
398 |
)
|
399 |
|
|
|
|
|
|
|
|
|
|
|
|
|
400 |
doc_to_text = gr.Interface(
|
401 |
convert_doc_to_text,
|
402 |
gr.File(),
|
|
|
410 |
api_name="docx_to_text"
|
411 |
)
|
412 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
413 |
ppt_to_text = gr.Interface(
|
414 |
extract_text_from_ppt,
|
415 |
gr.File(),
|