|
import run
|
|
import util
|
|
import docx
|
|
from docx.oxml.ns import qn
|
|
from docx.shared import Pt,RGBColor
|
|
import fitz
|
|
import os
|
|
from fpdf import FPDF
|
|
import run
|
|
from BERT_inference import BertClassificationModel
|
|
|
|
|
|
def text_dump_to_lines(text,topic_num,max_length):
    """Process raw text with the ``run`` pipeline and export the result.

    The text is segmented with ``util.seg``, the segments are cleaned with
    ``run.texClear``, and both are handed to ``run.textToAb`` along with
    ``topic_num`` and ``max_length`` converted to ``int``.

    Args:
        text: The input text to process.
        topic_num: Value convertible to ``int``; forwarded to ``run.textToAb``.
        max_length: Value convertible to ``int``; forwarded to ``run.textToAb``.

    Returns:
        A 5-tuple ``(keys_text, output_text, txt_path, docx_path, pdf_path)``:
        the two sequences returned by ``run.textToAb`` each joined with
        newlines, followed by the paths returned by ``dump_to_txt``,
        ``dump_to_docx`` and ``dump_to_pdf`` for the output sequence.
    """
    segments = util.seg(text)
    cleaned = run.texClear(segments)
    print(cleaned)

    # NOTE(review): presumably `keys` are keywords and `output` the summary
    # sentences -- confirm against run.textToAb.
    keys, output = run.textToAb(cleaned, segments, int(topic_num), int(max_length))
    keys_text = "\n".join(keys)
    output_text = "\n".join(output)
    print(keys,output)

    # Export in the same order as before: txt, then docx, then pdf.
    txt_path = dump_to_txt(output)
    docx_path = dump_to_docx(output)
    pdf_path = dump_to_pdf(output)
    return keys_text, output_text, txt_path, docx_path, pdf_path
|
|
|
|
def file_dump_to_lines(file,topic_num,max_length):
    """Read a txt, docx or pdf file and pass its text to ``text_dump_to_lines``.

    The format is chosen from the part of ``file.name`` after the last dot,
    compared case-insensitively:

    * ``txt``  -- read as UTF-8; lines are stripped and blank lines dropped.
    * ``docx`` -- the text of every paragraph, unmodified.
    * ``pdf``  -- the plain text of every page; lines are stripped and blank
      lines dropped.

    Any other extension yields no lines, so an empty string is passed on.

    Args:
        file: Object whose ``name`` attribute is the path of the file on disk.
        topic_num: Forwarded unchanged to ``text_dump_to_lines``.
        max_length: Forwarded unchanged to ``text_dump_to_lines``.

    Returns:
        The 5-tuple returned by ``text_dump_to_lines``:
        ``(keys_text, output_text, txt_path, docx_path, pdf_path)``.
    """
    # Lower-case the extension so names such as "REPORT.PDF" are recognised
    # instead of silently falling through to the "no lines" case.
    file_format = file.name.rsplit(".", 1)[-1].lower()

    lines = []
    if file_format == "txt":
        with open(file.name, encoding='utf-8') as f:
            content = f.read()
        lines = [x.strip() for x in content.split("\n") if x.strip() != '']
    elif file_format == "docx":
        doc = docx.Document(file.name)
        lines = [par.text for par in doc.paragraphs]
    elif file_format == "pdf":
        # The context manager closes the PyMuPDF document (and its file
        # handle) even if text extraction raises part-way through.
        with fitz.open(file.name) as pdf:
            for page in pdf:
                page_text = page.get_text("text")
                lines.extend(
                    x.strip() for x in page_text.split("\n") if x.strip() != ''
                )

    text = "\n".join(lines)
    print(text)

    keysText, outputText, txt_path, docx_path, pdf_path = text_dump_to_lines(
        text, topic_num, max_length
    )
    return keysText, outputText, txt_path, docx_path, pdf_path
|
|
|
|
def dump_to_txt(lines):
    """Write *lines* to ``temp.txt`` and return the file's absolute path.

    The lines are joined with newlines and written as UTF-8 to ``temp.txt``
    in the current working directory, replacing any previous content.

    Args:
        lines: Iterable of strings to write.

    Returns:
        Absolute path of the written ``temp.txt``.
    """
    # Join before opening so a bad element never truncates an existing file.
    joined = "\n".join(lines)
    target = 'temp.txt'
    with open(target, mode="w", encoding="utf-8") as out:
        out.write(joined)
    return os.path.abspath(target)
|
|
|
|
def dump_to_docx(lines):
    """Write *lines* to ``temp.docx`` and return the file's absolute path.

    The document's ``Normal`` style is set to 宋体, 14 pt, black. An initial
    paragraph holding one empty run (formatted as black Cambria) is added,
    followed by one paragraph per entry in *lines*. The file is saved as
    ``temp.docx`` in the current working directory.

    Args:
        lines: Iterable of strings, one per paragraph.

    Returns:
        Absolute path of the saved ``temp.docx``.
    """
    document = docx.Document()

    # Configure the default style; the w:eastAsia attribute is set directly
    # on the style's rFonts element in addition to the regular font name.
    normal_style = document.styles['Normal']
    normal_style.font.name = u'宋体'
    normal_style._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
    normal_style.font.size = Pt(14)
    normal_style.font.color.rgb = RGBColor(0,0,0)

    # Leading paragraph with a single empty run carrying the Cambria font.
    leading_run = document.add_paragraph().add_run()
    leading_run.font.name = u'Cambria'
    leading_run.font.color.rgb = RGBColor(0,0,0)
    leading_run._element.rPr.rFonts.set(qn('w:eastAsia'), u'Cambria')

    for entry in lines:
        document.add_paragraph(entry)

    document.save(r'temp.docx')
    return os.path.abspath('temp.docx')
|
|
|
|
def dump_to_pdf(lines):
    """Write *lines* to ``temp.pdf`` and return the file's absolute path.

    A single page is added and the ``FZY3JW`` font (loaded from
    ``FZY3JW.TTF``) is selected at size 12. Each line is emitted in pieces
    of at most 45 characters, one cell per piece; an empty line produces no
    cell. If writing the cells raises, the exception is printed and the PDF
    is still saved with whatever was written up to that point.

    Args:
        lines: Iterable of strings to render.

    Returns:
        Absolute path of the saved ``temp.pdf``.
    """
    pdf = FPDF()
    pdf.add_font('FZY3JW', '', 'FZY3JW.TTF', True)
    pdf.add_page()
    pdf.set_font("FZY3JW", size=12)

    chunk_size = 45
    try:
        for line in lines:
            # Slicing clamps at the end of the string, so the final piece
            # is simply whatever remains.
            for start in range(0, len(line), chunk_size):
                pdf.cell(0, 5, line[start:start + chunk_size], ln=1)
    except Exception as e:
        print(e)

    pdf.output("temp.pdf")
    return os.path.abspath('temp.pdf')
|
|
|
|
if __name__ == "__main__":
    # Manual run: feed the contents of test.txt through the pipeline with
    # topic_num=10 and max_length=50.
    with open('test.txt', 'r', encoding='utf-8') as sample_file:
        sample_text = sample_file.read()

    text_dump_to_lines(sample_text,10,50)