TSA / textInput.py
QINGCHE's picture
update file code
9fd45d8
raw
history blame
3.79 kB
import run
import util
import docx
from docx.oxml.ns import qn
from docx.shared import Pt,RGBColor
import fitz
import os
from fpdf import FPDF
import run
from BERT_inference import BertClassificationModel
def text_dump_to_lines(text,topic_num,max_length):
    """Summarise *text* and export the result as txt, docx and pdf files.

    The text is segmented with ``util.seg``, the segments are cleaned with
    ``run.texClear``, and both are handed to ``run.textToAb`` together with
    ``topic_num`` and ``max_length`` (each converted with ``int``).

    Returns a 5-tuple:
        (keys joined by newlines, output joined by newlines,
         path from dump_to_txt, path from dump_to_docx, path from dump_to_pdf)
    where the three paths are produced from the ``output`` lines.
    """
    segments = util.seg(text)
    cleaned = run.texClear(segments)
    print(cleaned)

    keys, output = run.textToAb(
        cleaned, segments, int(topic_num), int(max_length)
    )
    keys_text = "\n".join(keys)
    output_text = "\n".join(output)
    print(keys, output)

    # Export in the same order as before: txt, then docx, then pdf.
    txt_path = dump_to_txt(output)
    docx_path = dump_to_docx(output)
    pdf_path = dump_to_pdf(output)
    return keys_text, output_text, txt_path, docx_path, pdf_path
def file_dump_to_lines(file,topic_num,max_length):
    """Read the text of a txt/docx/pdf file and run it through text_dump_to_lines.

    Args:
        file: object whose ``name`` attribute is the path of the file to read.
        topic_num: forwarded unchanged to ``text_dump_to_lines``.
        max_length: forwarded unchanged to ``text_dump_to_lines``.

    Returns:
        The 5-tuple produced by ``text_dump_to_lines``:
        (keysText, outputText, txt_path, docx_path, pdf_path).

    The file type is taken from the text after the last "." in ``file.name``
    and compared case-insensitively. For any other extension no lines are
    read and an empty string is passed on.
    """
    lines = []
    # Lower-case the extension so "REPORT.TXT" / "paper.PDF" are recognised.
    file_format = file.name.rsplit(".", 1)[-1].lower()

    if file_format == "txt":
        with open(file.name, encoding='utf-8') as f:
            content = f.read()
        # Keep only non-blank lines, stripped of surrounding whitespace.
        lines = [x.strip() for x in content.split("\n") if x.strip() != '']
    elif file_format == "docx":
        # Paragraph texts are taken as-is (blank paragraphs are kept).
        lines = [par.text for par in docx.Document(file.name).paragraphs]
    elif file_format == "pdf":
        pdf = fitz.open(file.name)
        try:
            for page in pdf:
                page_text = page.get_text("text")
                lines.extend(
                    x.strip() for x in page_text.split("\n") if x.strip() != ''
                )
        finally:
            # Release the document handle even if text extraction fails.
            pdf.close()

    text = "\n".join(lines)
    print(text)
    keysText, outputText, txt_path, docx_path, pdf_path = text_dump_to_lines(
        text, topic_num, max_length
    )
    return keysText, outputText, txt_path, docx_path, pdf_path
def dump_to_txt(lines):
    """Write *lines*, joined by newlines, to ``temp.txt`` as UTF-8.

    The file is created (or overwritten) relative to the current working
    directory. Returns the absolute path of the written file.
    """
    with open('temp.txt', mode="w", encoding="utf-8") as out_file:
        out_file.write("\n".join(lines))
    return os.path.abspath('temp.txt')
def dump_to_docx(lines):
    """Write *lines* to ``temp.docx``, one paragraph per line.

    The document's ``Normal`` style is set to the 宋体 font (also for the
    East-Asian font slot), 14 pt, black. Before the content, one paragraph
    holding a single empty run is added; that run is set to Cambria, black.
    The file is saved relative to the current working directory.

    Returns the absolute path of ``temp.docx``.
    """
    document = docx.Document()

    # Configure the 'Normal' style.
    normal_style = document.styles['Normal']
    normal_style.font.name = '宋体'
    normal_style._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')
    normal_style.font.size = Pt(14)
    normal_style.font.color.rgb = RGBColor(0, 0, 0)

    # Leading paragraph with an empty run carrying the Cambria font settings.
    # (Named lead_run so it does not hide the imported ``run`` module.)
    lead_run = document.add_paragraph().add_run()
    lead_run.font.name = 'Cambria'
    lead_run.font.color.rgb = RGBColor(0, 0, 0)
    lead_run._element.rPr.rFonts.set(qn('w:eastAsia'), 'Cambria')

    for line in lines:
        document.add_paragraph(line)

    document.save('temp.docx')
    return os.path.abspath('temp.docx')
def dump_to_pdf(lines):
    """Write *lines* to ``temp.pdf`` using the FZY3JW font.

    Each line is cut into pieces of at most 45 characters and every piece is
    emitted as its own cell followed by a line break. An empty line produces
    no cell. If anything raises while the cells are being written, the
    exception is printed and whatever was written so far is still saved.

    Returns the absolute path of ``temp.pdf``.
    """
    chars_per_row = 45  # each PDF row holds at most 45 characters

    pdf = FPDF()
    # Register the font file.
    pdf.add_font('FZY3JW', '', 'FZY3JW.TTF', True)
    pdf.add_page()
    # Select the font and its size.
    pdf.set_font("FZY3JW", size=12)

    try:
        # Emit the content line by line, wrapping at chars_per_row.
        for line in lines:
            for start in range(0, len(line), chars_per_row):
                # Slicing clamps at the end, so the last piece may be shorter.
                pdf.cell(0, 5, line[start:start + chars_per_row], ln=1)
    except Exception as e:
        print(e)

    pdf.output("temp.pdf")
    return os.path.abspath('temp.pdf')
if __name__ == "__main__":
    # Manual smoke test: summarise the contents of test.txt with
    # topic_num=10 and max_length=50.
    with open('test.txt', encoding='utf-8') as sample_file:
        sample_text = sample_file.read()
    text_dump_to_lines(sample_text, 10, 50)