import os
import shutil
from typing import List
import gradio as gr
import pandas as pd
from autorag.data.parse import langchain_parse
from autorag.data.parse.llamaparse import llama_parse
from autorag.data.qa.schema import Raw
from llama_index.llms.openai import OpenAI
from src.create import default_create, fast_create, advanced_create
from src.util import on_submit_openai_key, on_submit_llama_cloud_key, on_submit_upstage_key
# Anchor every working directory to the location of this file so the app does
# not depend on the current working directory it is launched from.
root_dir = os.path.dirname(os.path.realpath(__file__))

# Directory that uploaded documents are copied into before parsing.
FILE_DIR = os.path.join(root_dir, "file_cache")
# exist_ok=True makes creation idempotent and avoids the check-then-create race
# of testing os.path.exists() first.
os.makedirs(FILE_DIR, exist_ok=True)

# Directory that the generated parquet files are written to.
DATA_DIR = os.path.join(root_dir, "data")
os.makedirs(DATA_DIR, exist_ok=True)
def change_lang_choice(lang: str) -> str:
    """Translate a language label shown in the UI into its short language code.

    Args:
        lang: One of "English", "한국어" or "日本語".

    Returns:
        The matching code: "en", "ko" or "ja".

    Raises:
        KeyError: If ``lang`` is not one of the labels listed above.
    """
    code_by_label = dict(
        (
            ("English", "en"),
            ("한국어", "ko"),
            ("日本語", "ja"),
        )
    )
    return code_by_label[lang]
def change_visible_status_api_key(parse_method: str):
    """Decide which API-key row should be shown for the chosen parse method.

    Args:
        parse_method: The value selected in the parsing-method dropdown.

    Returns:
        A pair of ``gr.update`` objects: the first sets the visibility of the
        Llama Cloud key row, the second the visibility of the Upstage key row.
        Both are hidden for any method other than "llama-parse" or "upstage🇰🇷".
    """
    show_llama_row = parse_method == "llama-parse"
    show_upstage_row = parse_method == "upstage🇰🇷"
    return gr.update(visible=show_llama_row), gr.update(visible=show_upstage_row)
def run_parse(file_lists: List[str], parse_method: str, progress=gr.Progress()):
    """Copy the uploaded files into the cache directory and parse them to raw.parquet.

    Args:
        file_lists: Paths of the uploaded files. May be ``None`` or empty when
            nothing has been uploaded.
        parse_method: Name of the parser to use. The langchain-based PDF
            parsers, "llama-parse" and "upstage🇰🇷" are handled; anything else
            is reported as unsupported.
        progress: Gradio progress tracker, updated as the stages complete.

    Returns:
        A status message describing the outcome (success, missing upload,
        missing API key, or unsupported method).
    """
    # Iterating over None would raise TypeError, so report the missing upload
    # as a status message instead.
    if not file_lists:
        return "Please upload files first."

    # save an input file to a directory
    for file_path in file_lists:
        shutil.copy(file_path, FILE_DIR)
    progress(0.05)

    # Every parser reads the PDFs that were copied into the cache directory.
    pdf_glob = os.path.join(FILE_DIR, "*.pdf")

    if parse_method in ("pdfminer", "pdfplumber", "pypdfium2", "pypdf", "pymupdf"):
        raw_df: pd.DataFrame = langchain_parse(data_path_glob=pdf_glob, parse_method=parse_method)
    elif parse_method == "llama-parse":
        if os.getenv("LLAMA_CLOUD_API_KEY") is None:
            return "Please submit your Llama Cloud API key first."
        raw_df = llama_parse(data_path_glob=pdf_glob)
    elif parse_method == "upstage🇰🇷":
        if os.getenv("UPSTAGE_API_KEY") is None:
            return "Please submit your Upstage API key first."
        raw_df = langchain_parse(data_path_glob=pdf_glob, parse_method="upstagedocumentparse")
    else:
        return "Unsupported parse method."

    progress(0.8)
    raw_df.to_parquet(os.path.join(DATA_DIR, "raw.parquet"), index=False)
    return "Parsing Complete. Download at the bottom button."
def run_chunk(use_existed_raw: bool, raw_file: str, chunk_method: str, chunk_size: int, chunk_overlap: int,
              lang: str = "English", progress=gr.Progress()):
    """Chunk a raw.parquet file into corpus.parquet.

    Args:
        use_existed_raw: When true, read raw.parquet from the data directory;
            otherwise read the file given by ``raw_file``.
        raw_file: Path of an uploaded raw.parquet. May be ``None`` when nothing
            has been uploaded.
        chunk_method: One of "Token", "Sentence", "Semantic" or "Recursive".
        chunk_size: Chunk size passed to the chunker (not used by "Semantic").
        chunk_overlap: Chunk overlap passed to the chunker (not used by
            "Semantic").
        lang: Language label from the UI; converted to a language code and
            passed to the chunker as ``add_file_name``.
        progress: Gradio progress tracker.

    Returns:
        A status message describing the outcome.

    Raises:
        KeyError: If ``lang`` is not a label known to ``change_lang_choice``.
    """
    lang = change_lang_choice(lang)
    raw_df_path = os.path.join(DATA_DIR, "raw.parquet") if use_existed_raw else raw_file

    # raw_file is None when no file was uploaded; os.path.exists(None) would
    # raise TypeError, so treat None the same as a missing file.
    if raw_df_path is None or not os.path.exists(raw_df_path):
        return "Please upload raw.parquet file first. Or run the parsing stage first."

    raw_df = pd.read_parquet(raw_df_path, engine="pyarrow")
    raw_instance = Raw(raw_df)

    if chunk_method in ("Token", "Sentence"):
        corpus = raw_instance.chunk("llama_index_chunk", chunk_method=chunk_method, chunk_size=chunk_size,
                                    chunk_overlap=chunk_overlap, add_file_name=lang)
    elif chunk_method == "Semantic":
        # The keyword was previously misspelled ("percnetile") and given as a
        # fraction. LlamaIndex's semantic splitter names this parameter
        # breakpoint_percentile_threshold and expresses it as a percentile.
        # NOTE(review): confirm against the installed AutoRAG/LlamaIndex version.
        corpus = raw_instance.chunk("llama_index_chunk", chunk_method="Semantic_llama_index",
                                    embed_model="openai", breakpoint_percentile_threshold=95,
                                    add_file_name=lang)
    elif chunk_method == "Recursive":
        corpus = raw_instance.chunk("langchain_chunk", chunk_method="recursivecharacter",
                                    add_file_name=lang, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    else:
        message = "Unsupported chunk method."
        # gr.Error only has an effect when raised; gr.Warning shows the alert
        # while still letting the status message be returned.
        gr.Warning(message)
        return message

    progress(0.8)
    corpus.to_parquet(os.path.join(DATA_DIR, "corpus.parquet"))
    return "Chunking Complete. Download at the bottom button."
def run_qa(use_existed_corpus: bool, corpus_file: str, qa_method: str,
           model_name: str, qa_cnt: int, batch_size: int, lang: str = "English", progress=gr.Progress()):
    """Create a QA dataset from corpus.parquet with an OpenAI model.

    Args:
        use_existed_corpus: When true, read corpus.parquet from the data
            directory; otherwise read the file given by ``corpus_file``.
        corpus_file: Path of an uploaded corpus.parquet. May be ``None`` when
            nothing has been uploaded.
        qa_method: One of "default", "fast" or "advanced".
        model_name: OpenAI model name used to build the LLM.
        qa_cnt: Number of QA pairs, passed to the creation function as ``n``.
        batch_size: Batch size passed to the creation function.
        lang: Language label from the UI; converted to a language code.
        progress: Gradio progress tracker, forwarded to the creation function.

    Returns:
        A status message describing the outcome.

    Raises:
        KeyError: If ``lang`` is not a label known to ``change_lang_choice``.
    """
    lang = change_lang_choice(lang)
    corpus_df_path = os.path.join(DATA_DIR, "corpus.parquet") if use_existed_corpus else corpus_file

    # corpus_file is None when no file was uploaded; os.path.exists(None) would
    # raise TypeError, so treat None the same as a missing file.
    if corpus_df_path is None or not os.path.exists(corpus_df_path):
        message = "Please upload corpus.parquet file first. Or run the chunking stage first."
        # gr.Error only has an effect when raised; gr.Warning shows the alert
        # while still letting the status message be returned.
        gr.Warning(message)
        return message
    corpus_df = pd.read_parquet(corpus_df_path, engine="pyarrow")

    if os.getenv("OPENAI_API_KEY") is None:
        message = "Please submit your OpenAI API key first."
        gr.Warning(message)
        return message
    llm = OpenAI(model=model_name)

    # All three creation functions are called with the same arguments, so pick
    # the function by name instead of repeating the call per branch.
    creators = {
        "default": default_create,
        "fast": fast_create,
        "advanced": advanced_create,
    }
    create_fn = creators.get(qa_method)
    if create_fn is None:
        message = "Unsupported QA method."
        gr.Warning(message)
        return message

    qa = create_fn(corpus_df, llm=llm, n=qa_cnt, lang=lang, progress=progress, batch_size=batch_size)
    # Both output paths are handed to to_parquet: qa.parquet and corpus.parquet.
    qa.to_parquet(os.path.join(DATA_DIR, "qa.parquet"), os.path.join(DATA_DIR, "corpus.parquet"))
    return "QA Creation Complete. Download at the bottom button."
def file_reset() -> str:
    """Empty the upload cache directory by deleting and recreating it.

    Returns:
        A status message confirming the reset.
    """
    # The directory may already be gone (e.g. removed outside the app);
    # rmtree would raise FileNotFoundError in that case, so only remove it
    # when it is actually there.
    if os.path.isdir(FILE_DIR):
        shutil.rmtree(FILE_DIR)
    os.makedirs(FILE_DIR, exist_ok=True)
    return "Files reset complete."
# Gradio UI: API-key inputs on top, then three columns for the parse -> chunk ->
# QA-creation stages, followed by the event wiring and the app launch.
with gr.Blocks(theme="earneleh/paris") as demo:
    # A string literal that spans several lines has to be triple-quoted.
    gr.HTML("""
AutoRAG Data Creation 🛠️
""")

    # OpenAI key input, its status box, and the language selector.
    with gr.Row():
        openai_key_textbox = gr.Textbox(label="Please input your OpenAI API key and press Enter.", type="password",
                                        info="You can get your API key from https://platform.openai.com/account/api-keys\n\n"
                                             "AutoRAG do not store your API key.",
                                        autofocus=True)
        api_key_status_box = gr.Textbox(label="OpenAI API status", value="Not Set", interactive=False)
        lang_choice = gr.Radio(["English", "한국어", "日本語"], label="Language",
                               value="English", info="Choose Language. En, Ko, Ja are supported.",
                               interactive=True)

    # Hidden by default; shown when "llama-parse" is selected as the parsing method.
    with gr.Row(visible=False) as llama_cloud_api_key_row:
        llama_key_textbox = gr.Textbox(label="Please input your Llama Cloud API key and press Enter.", type="password",
                                       info="You can get your API key from https://docs.cloud.llamaindex.ai/llamacloud/getting_started/api_key\n\n"
                                            "AutoRAG do not store your API key.",)
        llama_key_status_box = gr.Textbox(label="Llama Cloud API status", value="Not Set", interactive=False)

    # Hidden by default; shown when "upstage🇰🇷" is selected as the parsing method.
    with gr.Row(visible=False) as upstage_api_key_row:
        upstage_key_textbox = gr.Textbox(label="Please input your Upstage API key and press Enter.", type="password",
                                         info="You can get your API key from https://upstage.ai/\n\n"
                                              "AutoRAG do not store your API key.",)
        upstage_key_status_box = gr.Textbox(label="Upstage API status", value="Not Set", interactive=False)

    with gr.Row():
        # Stage 1: parsing.
        with gr.Column(scale=1):
            gr.Markdown("## 1. Parse your PDF files\n\nUpload your pdf files and make it to raw.parquet.")
            document_file_input = gr.File(label="Upload Files", type="filepath", file_count="multiple")
            parse_choice = gr.Dropdown(
                ["pdfminer", "pdfplumber", "pypdfium2", "pypdf", "pymupdf", "llama-parse", "upstage🇰🇷"],
                label="Parsing Method", info="Choose parsing method that you want")
            parse_button = gr.Button(value="Run Parsing")
            parse_status = gr.Textbox(value="Not Started", interactive=False)
            raw_download_button = gr.Button(value="Download raw.parquet",
                                            link=f"/file={os.path.join(DATA_DIR, 'raw.parquet')}")
            file_reset_button = gr.Button(value="Reset uploaded files")

        # Stage 2: chunking.
        with gr.Column(scale=1):
            gr.Markdown(
                "## 2. Chunk your raw.parquet\n\nUse parsed raw.parquet or upload your own. It will make a corpus.parquet."
            )
            # The upload widget stays hidden while "Use previous raw.parquet" is checked.
            raw_file_input = gr.File(label="Upload raw.parquet", type="filepath", file_count="single", visible=False)
            use_previous_raw_file = gr.Checkbox(label="Use previous raw.parquet", value=True)
            chunk_choice = gr.Dropdown(
                ["Token", "Sentence", "Semantic", "Recursive"],
                label="Chunking Method", info="Choose chunking method that you want")
            chunk_size = gr.Slider(minimum=128, maximum=1024, step=128, label="Chunk Size", value=256)
            chunk_overlap = gr.Slider(minimum=16, maximum=256, step=16, label="Chunk Overlap", value=32)
            chunk_button = gr.Button(value="Run Chunking")
            chunk_status = gr.Textbox(value="Not Started", interactive=False)
            corpus_download_button = gr.Button(value="Download corpus.parquet",
                                               link=f"/file={os.path.join(DATA_DIR, 'corpus.parquet')}")

        # Stage 3: QA creation.
        with gr.Column(scale=1):
            gr.Markdown(
                "## 3. Create QA dataset from your corpus.parquet\n\nQA dataset is essential to run AutoRAG. Upload corpus.parquet & select QA method and run.")
            gr.HTML("Warning: QA Creation uses an OpenAI model, which can be costly. Start with a small batch to gauge expenses.")
            # The upload widget stays hidden while "Use previous corpus.parquet" is checked.
            corpus_file_input = gr.File(label="Upload corpus.parquet", type="filepath", file_count="single",
                                        visible=False)
            use_previous_corpus_file = gr.Checkbox(label="Use previous corpus.parquet", value=True)
            qa_choice = gr.Radio(["default", "fast", "advanced"], label="QA Method",
                                 info="Choose QA method that you want")
            model_choice = gr.Radio(["gpt-4o-mini", "gpt-4o"], label="Select model for data creation",
                                    )
            qa_cnt = gr.Slider(minimum=20, maximum=150, step=5, label="Number of QA pairs", value=80)
            batch_size = gr.Slider(minimum=1, maximum=16, step=1,
                                   label="Batch Size to OpenAI model. If there is an error, decrease this.", value=16)
            run_qa_button = gr.Button(value="Run QA Creation")
            qa_status = gr.Textbox(value="Not Started", interactive=False)
            qa_download_button = gr.Button(value="Download qa.parquet",
                                           link=f"/file={os.path.join(DATA_DIR, 'qa.parquet')}")

    #================================================================================================#
    # Logics
    # Show each upload widget only when its "use previous" checkbox is unchecked.
    use_previous_raw_file.change(lambda x: gr.update(visible=not x), inputs=[use_previous_raw_file],
                                 outputs=[raw_file_input])
    use_previous_corpus_file.change(lambda x: gr.update(visible=not x), inputs=[use_previous_corpus_file],
                                    outputs=[corpus_file_input])

    openai_key_textbox.submit(on_submit_openai_key, inputs=[openai_key_textbox], outputs=api_key_status_box)

    # Parsing
    parse_button.click(run_parse, inputs=[document_file_input, parse_choice], outputs=parse_status)
    file_reset_button.click(file_reset, outputs=parse_status)

    # Chunking
    chunk_button.click(run_chunk, inputs=[use_previous_raw_file, raw_file_input, chunk_choice, chunk_size, chunk_overlap,
                                          lang_choice],
                       outputs=chunk_status)

    # QA Creation
    run_qa_button.click(run_qa, inputs=[use_previous_corpus_file, corpus_file_input, qa_choice, model_choice, qa_cnt,
                                        batch_size, lang_choice], outputs=qa_status)

    # API Key visibility
    parse_choice.change(change_visible_status_api_key, inputs=[parse_choice],
                        outputs=[llama_cloud_api_key_row, upstage_api_key_row])
    llama_key_textbox.submit(on_submit_llama_cloud_key, inputs=[llama_key_textbox], outputs=llama_key_status_box)
    upstage_key_textbox.submit(on_submit_upstage_key, inputs=[upstage_key_textbox], outputs=upstage_key_status_box)

# allowed_paths lists the cache and data directories that the download links point into.
demo.launch(share=False, debug=True, allowed_paths=[FILE_DIR, DATA_DIR])