jeffrey committed
Commit 14d5ed1 · 1 Parent(s): 77c3530

Delete data persistence on the Hugging Face Space

Files changed (1)
  1. app.py +62 -65
app.py CHANGED
@@ -1,25 +1,24 @@
 import os
-import shutil
-from typing import List
+import tempfile
+from typing import List, Callable
 
 import gradio as gr
 import pandas as pd
 from autorag.data.parse import langchain_parse
+from autorag.data.parse.base import _add_last_modified_datetime
 from autorag.data.parse.llamaparse import llama_parse
 from autorag.data.qa.schema import Raw
+from autorag.utils import result_to_dataframe
 from llama_index.llms.openai import OpenAI
 
 from src.create import default_create, fast_create, advanced_create
 from src.util import on_submit_openai_key, on_submit_llama_cloud_key, on_submit_upstage_key
 
-root_dir = os.path.dirname(os.path.realpath(__file__))
-FILE_DIR = os.path.join(root_dir, "file_cache")
-if not os.path.exists(FILE_DIR):
-    os.makedirs(FILE_DIR)
-DATA_DIR = os.path.join(root_dir, "data")
-if not os.path.exists(DATA_DIR):
-    os.makedirs(DATA_DIR)
-
+@result_to_dataframe(["texts", "path", "page", "last_modified_datetime"])
+def original_parse(fn: Callable, **kwargs):
+    result = fn(**kwargs)
+    result = _add_last_modified_datetime(result)
+    return result
 
 def change_lang_choice(lang: str) -> str:
     lang_dict = {
@@ -39,43 +38,38 @@ def change_visible_status_api_key(parse_method: str):
 
 
 
-def run_parse(file_lists: List[str], parse_method: str, progress=gr.Progress()):
+def run_parse(file_lists: List[str], parse_method: str, original_raw_df, progress=gr.Progress()):
     # save an input file to a directory
-    for file_path in file_lists:
-        shutil.copy(file_path, FILE_DIR)
+
     progress(0.05)
+    langchain_parse_original = langchain_parse.__wrapped__
 
     if parse_method in ["pdfminer", "pdfplumber", "pypdfium2", "pypdf", "pymupdf"]:
-        raw_df: pd.DataFrame = langchain_parse(data_path_glob=os.path.join(FILE_DIR, "*.pdf"), parse_method=parse_method)
+        raw_df: pd.DataFrame = original_parse(langchain_parse_original,
+                                              data_path_list=file_lists, parse_method=parse_method)
     elif parse_method == "llama-parse":
         llama_cloud_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
         if llama_cloud_api_key is None:
-            return "Please submit your Llama Cloud API key first."
-        raw_df: pd.DataFrame = llama_parse(data_path_glob=os.path.join(FILE_DIR, "*.pdf"))
+            return "Please submit your Llama Cloud API key first.", original_raw_df
+        raw_df: pd.DataFrame = original_parse(llama_parse.__wrapped__, data_path_list=file_lists)
     elif parse_method == "upstage🇰🇷":
         upstage_api_key = os.getenv("UPSTAGE_API_KEY")
        if upstage_api_key is None:
-            return "Please submit your Upstage API key first."
-        raw_df: pd.DataFrame = langchain_parse(data_path_glob=os.path.join(FILE_DIR, "*.pdf"), parse_method="upstagedocumentparse")
+            return "Please submit your Upstage API key first.", original_raw_df
+        raw_df: pd.DataFrame = original_parse(langchain_parse_original,
+                                              data_path_list=file_lists, parse_method="upstagedocumentparse")
     else:
-        return "Unsupported parse method."
+        return "Unsupported parse method.", original_raw_df
     progress(0.8)
 
-    raw_df.to_parquet(os.path.join(DATA_DIR, "raw.parquet"), index=False)
-    return "Parsing Complete. Download at the bottom button."
+    return "Parsing Complete. Download at the bottom button.", raw_df
 
 
-def run_chunk(use_existed_raw: bool, raw_file: str, chunk_method: str, chunk_size: int, chunk_overlap: int,
-              lang: str = "English", progress=gr.Progress()):
+def run_chunk(use_existed_raw: bool, raw_df: pd.DataFrame, raw_file: str, chunk_method: str, chunk_size: int, chunk_overlap: int,
+              lang: str = "English", original_corpus_df = None, progress=gr.Progress()):
     lang = change_lang_choice(lang)
-    if use_existed_raw:
-        raw_df_path = os.path.join(DATA_DIR, "raw.parquet")
-    else:
-        raw_df_path = raw_file
-
-    if not os.path.exists(raw_df_path):
-        return "Please upload raw.parquet file first. Or run the parsing stage first."
-    raw_df = pd.read_parquet(raw_df_path, engine="pyarrow")
+    if not use_existed_raw:
+        raw_df = pd.read_parquet(raw_file, engine="pyarrow")
     raw_instance = Raw(raw_df)
 
     if chunk_method in ["Token", "Sentence"]:
@@ -90,28 +84,21 @@ def run_chunk(use_existed_raw: bool, raw_file: str, chunk_method: str, chunk_siz
                                     add_file_name=lang, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     else:
         gr.Error("Unsupported chunk method.")
-        return "Unsupported chunk method."
+        return "Unsupported chunk method.", original_corpus_df
     progress(0.8)
-    corpus.to_parquet(os.path.join(DATA_DIR, "corpus.parquet"))
-    return "Chunking Complete. Download at the bottom button."
+    return "Chunking Complete. Download at the bottom button.", corpus.data
 
 
-def run_qa(use_existed_corpus: bool, corpus_file: str, qa_method: str,
-           model_name: str, qa_cnt: int, batch_size: int, lang: str = "English", progress=gr.Progress()):
+def run_qa(use_existed_corpus: bool, corpus_df: pd.DataFrame, corpus_file: str, qa_method: str,
+           model_name: str, qa_cnt: int, batch_size: int, lang: str = "English", original_qa_df = None,
+           progress=gr.Progress()):
     lang = change_lang_choice(lang)
-    if use_existed_corpus:
-        corpus_df_path = os.path.join(DATA_DIR, "corpus.parquet")
-    else:
-        corpus_df_path = corpus_file
-
-    if not os.path.exists(corpus_df_path):
-        gr.Error("Please upload corpus.parquet file first. Or run the chunking stage first.")
-        return "Please upload corpus.parquet file first. Or run the chunking stage first."
-    corpus_df = pd.read_parquet(corpus_df_path, engine="pyarrow")
+    if not use_existed_corpus:
+        corpus_df = pd.read_parquet(corpus_file, engine="pyarrow")
 
     if os.getenv("OPENAI_API_KEY") is None:
         gr.Error("Please submit your OpenAI API key first.")
-        return "Please submit your OpenAI API key first."
+        return "Please submit your OpenAI API key first.", original_qa_df
     llm = OpenAI(model=model_name)
 
     if qa_method == "default":
@@ -122,18 +109,25 @@ def run_qa(use_existed_corpus: bool, corpus_file: str, qa_method: str,
         qa = advanced_create(corpus_df, llm=llm, n=qa_cnt, lang=lang, progress=progress, batch_size=batch_size)
     else:
         gr.Error("Unsupported QA method.")
-        return "Unsupported QA method."
+        return "Unsupported QA method.", original_qa_df
+
+    return "QA Creation Complete. Download at the bottom button.", qa.data
 
-    qa.to_parquet(os.path.join(DATA_DIR, "qa.parquet"), os.path.join(DATA_DIR, "corpus.parquet"))
-    return "QA Creation Complete. Download at the bottom button."
 
+def download_state(state: pd.DataFrame, change_name: str):
+    if state is None:
+        gr.Error("No data to download.")
+        return ""
+    with tempfile.TemporaryDirectory() as temp_dir:
+        filename = os.path.join(temp_dir, f"{change_name}.parquet")
+        state.to_parquet(filename, engine="pyarrow")
+        yield filename
 
-def file_reset() -> str:
-    shutil.rmtree(FILE_DIR)
-    os.makedirs(FILE_DIR)
-    return "Files reset complete."
 
 with gr.Blocks(theme="earneleh/paris") as demo:
+    raw_df_state = gr.State()
+    corpus_df_state = gr.State()
+    qa_df_state = gr.State()
     gr.HTML("<h1>AutoRAG Data Creation 🛠️</h1>")
     with gr.Row():
         openai_key_textbox = gr.Textbox(label="Please input your OpenAI API key and press Enter.", type="password",
@@ -166,9 +160,8 @@ with gr.Blocks(theme="earneleh/paris") as demo:
                                     label="Parsing Method", info="Choose parsing method that you want")
             parse_button = gr.Button(value="Run Parsing")
             parse_status = gr.Textbox(value="Not Started", interactive=False)
-            raw_download_button = gr.DownloadButton(value=os.path.join(DATA_DIR, 'raw.parquet'),
+            raw_download_button = gr.DownloadButton(value=download_state, inputs=[raw_df_state, gr.State("raw")],
                                                     label="Download raw.parquet")
-            file_reset_button = gr.Button(value="Reset uploaded files")
 
         with gr.Column(scale=1):
             gr.Markdown(
@@ -185,7 +178,7 @@ with gr.Blocks(theme="earneleh/paris") as demo:
             chunk_button = gr.Button(value="Run Chunking")
             chunk_status = gr.Textbox(value="Not Started", interactive=False)
             corpus_download_button = gr.DownloadButton(label="Download corpus.parquet",
-                                                       value=os.path.join(DATA_DIR, 'corpus.parquet'))
+                                                       value=download_state, inputs=[corpus_df_state, gr.State("corpus")])
 
         with gr.Column(scale=1):
             gr.Markdown(
@@ -205,7 +198,7 @@ with gr.Blocks(theme="earneleh/paris") as demo:
             run_qa_button = gr.Button(value="Run QA Creation")
             qa_status = gr.Textbox(value="Not Started", interactive=False)
             qa_download_button = gr.DownloadButton(label="Download qa.parquet",
-                                                   value=os.path.join(DATA_DIR, 'qa.parquet'))
+                                                   value=download_state, inputs=[qa_df_state, gr.State("qa")])
 
     #================================================================================================#
     # Logics
@@ -217,17 +210,19 @@ with gr.Blocks(theme="earneleh/paris") as demo:
     openai_key_textbox.submit(on_submit_openai_key, inputs=[openai_key_textbox], outputs=api_key_status_box)
 
     # Parsing
-    parse_button.click(run_parse, inputs=[document_file_input, parse_choice], outputs=parse_status)
-    file_reset_button.click(file_reset, outputs=parse_status)
+    parse_button.click(run_parse, inputs=[document_file_input, parse_choice, raw_df_state],
+                       outputs=[parse_status, raw_df_state])
 
     # Chunking
-    chunk_button.click(run_chunk, inputs=[use_previous_raw_file, raw_file_input, chunk_choice, chunk_size, chunk_overlap,
-                                          lang_choice],
-                       outputs=chunk_status)
+    chunk_button.click(run_chunk, inputs=[use_previous_raw_file, raw_df_state, raw_file_input, chunk_choice, chunk_size, chunk_overlap,
+                                          lang_choice, corpus_df_state],
+                       outputs=[chunk_status, corpus_df_state])
 
     # QA Creation
-    run_qa_button.click(run_qa, inputs=[use_previous_corpus_file, corpus_file_input, qa_choice, model_choice, qa_cnt,
-                                        batch_size, lang_choice], outputs=qa_status)
+    run_qa_button.click(run_qa, inputs=[use_previous_corpus_file, corpus_df_state, corpus_file_input, qa_choice,
+                                        model_choice, qa_cnt, batch_size, lang_choice,
+                                        qa_df_state],
+                       outputs=[qa_status, qa_df_state])
 
     # API Key visibility
     parse_choice.change(change_visible_status_api_key, inputs=[parse_choice],
@@ -236,4 +231,6 @@ with gr.Blocks(theme="earneleh/paris") as demo:
     upstage_key_textbox.submit(on_submit_upstage_key, inputs=[upstage_key_textbox], outputs=upstage_key_status_box)
 
 
-demo.launch(share=False, debug=False, allowed_paths=[FILE_DIR, DATA_DIR])
+# if __name__ == "__main__":
+#     demo.launch(share=False, debug=True)
+demo.launch(share=False, debug=False)
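
The pattern this commit settles on, in isolation: keep each intermediate DataFrame in a per-session gr.State and let gr.DownloadButton call a function that serializes it only when a download is requested, so nothing is written to the Space's shared FILE_DIR/DATA_DIR. Below is a minimal, self-contained sketch of that idea under those assumptions; the names export_state, build_button, and the demo DataFrame are illustrative and not part of this Space's code.

import os
import tempfile

import gradio as gr
import pandas as pd


def export_state(df: pd.DataFrame, name: str) -> str:
    # Serialize the in-memory DataFrame to a temp file only when a download is requested.
    if df is None:
        raise gr.Error("No data yet - run the previous step first.")
    path = os.path.join(tempfile.gettempdir(), f"{name}.parquet")
    df.to_parquet(path, engine="pyarrow")
    return path


with gr.Blocks() as sketch:
    # Per-session state: the DataFrame lives in memory instead of a shared data directory.
    raw_df_state = gr.State()

    build_button = gr.Button("Build demo DataFrame")
    # With a callable `value`, DownloadButton invokes it with `inputs` to resolve the file path
    # (the same pattern the diff above uses for raw/corpus/qa downloads).
    download_button = gr.DownloadButton(
        label="Download raw.parquet",
        value=export_state,
        inputs=[raw_df_state, gr.State("raw")],
    )

    build_button.click(lambda: pd.DataFrame({"texts": ["hello", "world"]}),
                       outputs=raw_df_state)

if __name__ == "__main__":
    sketch.launch()

Serializing lazily like this also removes the need for the hand-rolled cleanup the old code carried (the FILE_DIR/DATA_DIR setup and the file_reset button), since no per-user files accumulate between sessions.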