document_loaders / toolbox /to_markdown /excel_to_markdown.py
HoneyTian's picture
first commit
e94100d
raw
history blame
1.93 kB
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import os
from pathlib import Path
import shutil
import tempfile
import uuid
import pandas as pd
from project_settings import project_path
from toolbox.to_markdown.base_to_markdown import BaseToMarkdown
def get_args():
    """Parse the command-line options of this script.

    Returns:
        The ``argparse.Namespace`` produced by the parser; it carries a
        single ``filename`` attribute (str) that defaults to a sample
        workbook located under ``project_path``.
    """
    default_workbook = project_path / "data/files/xlsx/可对外提供资料清单说明-V1.0版本2022.1.13.xlsx"
    # Alternative sample workbook that can be used as the default instead:
    #   project_path / "data/files/xlsx/财务部流程文档编号记录.xlsx"

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--filename",
        type=str,
        default=default_workbook.as_posix(),
    )
    return arg_parser.parse_args()
@BaseToMarkdown.register("pandas")
class ExcelToMarkdown(BaseToMarkdown):
    """Render every sheet of an Excel workbook as Markdown tables.

    The workbook is read with pandas when the instance is created;
    ``self.excel`` holds the result of ``pd.read_excel(..., sheet_name=None)``,
    which is iterated as ``{sheet name: DataFrame}`` pairs.
    """

    def __init__(self, filename: str):
        """Load all sheets of the workbook.

        Args:
            filename: Path of the Excel file. It is handed to the base class
                initializer and then read back through ``self.filename``
                (presumably set by ``BaseToMarkdown`` -- confirm there).
        """
        super().__init__(filename)
        # sheet_name=None asks pandas for every sheet instead of only the first.
        self.excel = pd.read_excel(self.filename, sheet_name=None)

    def get_md_text(self) -> str:
        """Return the whole workbook as one Markdown string.

        For each sheet, in the iteration order of ``self.excel``, the output
        contains the sheet name, a blank line, the sheet rendered with
        ``DataFrame.to_markdown(index=False)`` and another blank line.
        An empty workbook mapping yields an empty string.
        """
        return "".join(
            f"{sheet_name}\n\n{df.to_markdown(index=False)}\n\n"
            for sheet_name, df in self.excel.items()
        )

    def save_to_zip(self, output_dir: str) -> str:
        """Write the Markdown to a scratch folder and zip that folder.

        A uniquely named directory is created under the system temp dir, the
        Markdown text is written there as ``<uuid>.md``, and the directory is
        archived to ``<output_dir>/<uuid>.zip`` via ``self.zip_directory``
        (not defined in this class; presumably inherited -- confirm).

        Args:
            output_dir: Directory in which the zip archive is placed.

        Returns:
            Path of the zip file, built with ``os.path.join``.
        """
        basename = str(uuid.uuid4())
        temp_dir = Path(tempfile.gettempdir()) / basename
        # exist_ok=False: a collision with an existing folder must fail loudly.
        temp_dir.mkdir(parents=True, exist_ok=False)

        output_zip_file = os.path.join(output_dir, f"{basename}.zip")
        try:
            md_file = temp_dir / f"{basename}.md"
            md_text = self.get_md_text()
            with open(md_file.as_posix(), "w", encoding="utf-8") as f:
                f.write(md_text)

            # zip
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            # Remove the scratch folder even when rendering, writing or
            # zipping raised, so failed runs do not pile up in the temp dir.
            shutil.rmtree(temp_dir)
        return output_zip_file
def main():
    """Convert the workbook named on the command line and print the zip path.

    The archive is written into the current working directory (``"."``).
    """
    cli_args = get_args()

    converter = ExcelToMarkdown(cli_args.filename)
    zip_path = converter.save_to_zip(output_dir=".")
    print(zip_path)
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()