jeffrey committed on
Commit
cf0997e
·
1 Parent(s): 20b5695

init commit

Browse files
Files changed (8) hide show
  1. .gitignore +164 -0
  2. README.md +3 -3
  3. app.py +241 -0
  4. packages.txt +4 -0
  5. requirements.txt +2 -0
  6. src/__init__.py +0 -0
  7. src/create.py +111 -0
  8. src/util.py +62 -0
.gitignore ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # PyCharm
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ .idea/
163
+ file_cache/
164
+ data/
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
  title: AutoRAG Data Creation
3
- emoji: 🏢
4
- colorFrom: yellow
5
- colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 5.0.2
8
  app_file: app.py
 
1
  ---
2
  title: AutoRAG Data Creation
3
+ emoji: 🛠️
4
+ colorFrom: green
5
+ colorTo: blue
6
  sdk: gradio
7
  sdk_version: 5.0.2
8
  app_file: app.py
app.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ from typing import List
4
+
5
+ import gradio as gr
6
+ import pandas as pd
7
+ from autorag.data.parse import langchain_parse
8
+ from autorag.data.parse.llamaparse import llama_parse
9
+ from autorag.data.qa.schema import Raw
10
+ from llama_index.llms.openai import OpenAI
11
+
12
+ from src.create import default_create, fast_create, advanced_create
13
+ from src.util import on_submit_openai_key
14
+
15
+
16
# Working directories live next to this file so the app behaves the same
# regardless of the current working directory it is launched from.
root_dir = os.path.dirname(os.path.realpath(__file__))
# Uploaded source documents are copied here before parsing.
FILE_DIR = os.path.join(root_dir, "file_cache")
# Generated raw.parquet / corpus.parquet / qa.parquet are written here.
DATA_DIR = os.path.join(root_dir, "data")
# exist_ok=True replaces the check-then-create pair, which could fail if the
# directory appeared between the existence check and the makedirs call.
os.makedirs(FILE_DIR, exist_ok=True)
os.makedirs(DATA_DIR, exist_ok=True)
23
+
24
+
25
def change_lang_choice(lang: str) -> str:
    """Translate a language label from the UI radio into its short code.

    Args:
        lang: One of "English", "한국어" or "日本語".

    Returns:
        The matching code: "en", "ko" or "ja".

    Raises:
        KeyError: If ``lang`` is not one of the supported labels. The message
            lists the supported labels to make the failure easy to diagnose.
    """
    lang_dict = {
        "English": "en",
        "한국어": "ko",
        "日本語": "ja",
    }
    try:
        return lang_dict[lang]
    except KeyError:
        raise KeyError(
            f"Unsupported language {lang!r}; expected one of {list(lang_dict)}"
        ) from None
32
+
33
def change_visible_status_api_key(parse_method: str):
    """Decide which API-key row is shown for the chosen parsing method.

    Returns a pair of ``gr.update`` objects: the first controls the Llama
    Cloud key row, the second the Upstage key row. Each row is visible only
    when its own parsing method is selected; any other method hides both.
    """
    show_llama_row = parse_method == "llama-parse"
    show_upstage_row = parse_method == "upstage🇰🇷"
    return gr.update(visible=show_llama_row), gr.update(visible=show_upstage_row)
40
+
41
+
42
+
43
def run_parse(file_lists: List[str], parse_method: str, progress=gr.Progress()):
    """Copy the uploaded files into FILE_DIR, parse the PDFs and save raw.parquet.

    Args:
        file_lists: Paths of the uploaded files. May be ``None`` or empty when
            nothing is currently selected in the upload widget.
        parse_method: Parsing method chosen in the UI dropdown.
        progress: Gradio progress tracker.

    Returns:
        A status message for the parsing status textbox.
    """
    from glob import glob

    # The upload widget yields None when no file is selected; iterating it
    # directly would raise TypeError.
    for file_path in file_lists or []:
        shutil.copy(file_path, FILE_DIR)
    progress(0.05)

    pdf_glob = os.path.join(FILE_DIR, "*.pdf")
    # Files from earlier uploads stay in FILE_DIR until reset, so check the
    # directory itself rather than only the current upload.
    if not glob(pdf_glob):
        return "Please upload at least one PDF file first."

    if parse_method in ["pdfminer", "pdfplumber", "pypdfium2", "pypdf", "pymupdf"]:
        raw_df: pd.DataFrame = langchain_parse(data_path_glob=pdf_glob, parse_method=parse_method)
    elif parse_method == "llama-parse":
        if os.getenv("LLAMA_CLOUD_API_KEY") is None:
            return "Please submit your Llama Cloud API key first."
        raw_df = llama_parse(data_path_glob=pdf_glob)
    elif parse_method == "upstage🇰🇷":
        if os.getenv("UPSTAGE_API_KEY") is None:
            return "Please submit your Upstage API key first."
        raw_df = langchain_parse(data_path_glob=pdf_glob, parse_method="upstagedocumentparse")
    else:
        return "Unsupported parse method."
    progress(0.8)

    raw_df.to_parquet(os.path.join(DATA_DIR, "raw.parquet"), index=False)
    return "Parsing Complete. Download at the bottom button."
67
+
68
+
69
def run_chunk(use_existed_raw: bool, raw_file: str, chunk_method: str, chunk_size: int, chunk_overlap: int,
              lang: str = "English", progress=gr.Progress()):
    """Chunk a raw.parquet file into corpus.parquet with the chosen method.

    Args:
        use_existed_raw: When true, read DATA_DIR/raw.parquet produced by the
            parsing stage; otherwise read ``raw_file``.
        raw_file: Path of an uploaded raw.parquet. May be ``None`` when
            nothing was uploaded.
        chunk_method: Chunking method chosen in the UI dropdown.
        chunk_size: Chunk size forwarded to the chunker.
        chunk_overlap: Chunk overlap forwarded to the chunker.
        lang: Language label from the UI; converted to a short code and
            passed as ``add_file_name``.
        progress: Gradio progress tracker.

    Returns:
        A status message for the chunking status textbox.
    """
    lang = change_lang_choice(lang)
    raw_df_path = os.path.join(DATA_DIR, "raw.parquet") if use_existed_raw else raw_file

    # raw_file is None when no file was uploaded; os.path.exists(None) raises
    # TypeError, so test for a missing path first.
    if not raw_df_path or not os.path.exists(raw_df_path):
        return "Please upload raw.parquet file first. Or run the parsing stage first."
    raw_df = pd.read_parquet(raw_df_path, engine="pyarrow")
    raw_instance = Raw(raw_df)

    if chunk_method in ["Token", "Sentence"]:
        corpus = raw_instance.chunk("llama_index_chunk", chunk_method=chunk_method, chunk_size=chunk_size,
                                    chunk_overlap=chunk_overlap, add_file_name=lang)
    elif chunk_method in ["Semantic"]:
        # The keyword was misspelled ("percnetile") and given a fraction; the
        # LlamaIndex semantic splitter documents this option as a percentile
        # with a default of 95.
        corpus = raw_instance.chunk("llama_index_chunk", chunk_method="Semantic_llama_index",
                                    embed_model="openai", breakpoint_percentile_threshold=95,
                                    add_file_name=lang)
    elif chunk_method == "Recursive":
        corpus = raw_instance.chunk("langchain_chunk", chunk_method="recursivecharacter",
                                    add_file_name=lang, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    elif chunk_method == "Konlpy🇰🇷":
        corpus = raw_instance.chunk("langchain_chunk", chunk_method="konlpy", add_file_name=lang,
                                    chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    else:
        # gr.Error only has an effect when raised; gr.Warning shows a toast
        # while still letting the status textbox receive the message.
        gr.Warning("Unsupported chunk method.")
        return "Unsupported chunk method."
    progress(0.8)
    corpus.to_parquet(os.path.join(DATA_DIR, "corpus.parquet"))
    return "Chunking Complete. Download at the bottom button."
101
+
102
+
103
def run_qa(use_existed_corpus: bool, corpus_file: str, qa_method: str,
           model_name: str, qa_cnt: int, batch_size: int, lang: str = "English", progress=gr.Progress()):
    """Create a QA dataset from corpus.parquet and save it as qa.parquet.

    Args:
        use_existed_corpus: When true, read DATA_DIR/corpus.parquet produced
            by the chunking stage; otherwise read ``corpus_file``.
        corpus_file: Path of an uploaded corpus.parquet. May be ``None`` when
            nothing was uploaded.
        qa_method: "default", "fast" or "advanced".
        model_name: OpenAI model name used for generation.
        qa_cnt: Number of QA pairs requested.
        batch_size: Batch size forwarded to the creation pipeline.
        lang: Language label from the UI; converted to a short code.
        progress: Gradio progress tracker.

    Returns:
        A status message for the QA status textbox.
    """
    lang = change_lang_choice(lang)
    corpus_df_path = os.path.join(DATA_DIR, "corpus.parquet") if use_existed_corpus else corpus_file

    # corpus_file is None when no file was uploaded; os.path.exists(None)
    # raises TypeError, so test for a missing path first.
    if not corpus_df_path or not os.path.exists(corpus_df_path):
        # gr.Error only has an effect when raised; gr.Warning shows a toast
        # while still letting the status textbox receive the message.
        gr.Warning("Please upload corpus.parquet file first. Or run the chunking stage first.")
        return "Please upload corpus.parquet file first. Or run the chunking stage first."
    corpus_df = pd.read_parquet(corpus_df_path, engine="pyarrow")

    if os.getenv("OPENAI_API_KEY") is None:
        gr.Warning("Please submit your OpenAI API key first.")
        return "Please submit your OpenAI API key first."
    llm = OpenAI(model=model_name)

    if qa_method == "default":
        qa = default_create(corpus_df, llm=llm, n=qa_cnt, lang=lang, progress=progress, batch_size=batch_size)
    elif qa_method == "fast":
        qa = fast_create(corpus_df, llm=llm, n=qa_cnt, lang=lang, progress=progress, batch_size=batch_size)
    elif qa_method == "advanced":
        qa = advanced_create(corpus_df, llm=llm, n=qa_cnt, lang=lang, progress=progress, batch_size=batch_size)
    else:
        gr.Warning("Unsupported QA method.")
        return "Unsupported QA method."

    # Writes the QA file and its linked corpus file side by side.
    qa.to_parquet(os.path.join(DATA_DIR, "qa.parquet"), os.path.join(DATA_DIR, "corpus.parquet"))
    return "QA Creation Complete. Download at the bottom button."
133
+
134
+
135
def file_reset() -> str:
    """Delete every cached upload by recreating FILE_DIR empty.

    Returns:
        A status message for the parsing status textbox.
    """
    # ignore_errors / exist_ok keep the reset working even if the directory
    # was already removed.
    shutil.rmtree(FILE_DIR, ignore_errors=True)
    os.makedirs(FILE_DIR, exist_ok=True)
    return "Files reset complete."
139
+
140
# Imported here so the Llama Cloud / Upstage key textboxes can be wired to
# their submit handlers below; previously nothing handled their submit event,
# so those keys could never be set from the UI.
from src.util import on_submit_llama_cloud_key, on_submit_upstage_key

# NOTE(review): the original indentation was not available; the nesting below
# is reconstructed from the order of the statements - confirm against the
# deployed layout.
with gr.Blocks(theme="earneleh/paris") as demo:
    gr.HTML("<h1>AutoRAG Data Creation 🛠️</h1>")
    with gr.Row():
        openai_key_textbox = gr.Textbox(label="Please input your OpenAI API key and press Enter.", type="password",
                                        info="You can get your API key from https://platform.openai.com/account/api-keys\n\n"
                                             "AutoRAG do not store your API key.",
                                        autofocus=True)
        api_key_status_box = gr.Textbox(label="OpenAI API status", value="Not Set", interactive=False)
        lang_choice = gr.Radio(["English", "한국어", "日本語"], label="Language",
                               value="English", info="Choose Langauge. En, Ko, Ja are supported.",
                               interactive=True)

    # Hidden until "llama-parse" is selected as the parsing method.
    with gr.Row(visible=False) as llama_cloud_api_key_row:
        llama_key_textbox = gr.Textbox(label="Please input your Llama Cloud API key and press Enter.", type="password",
                                       info="You can get your API key from https://docs.cloud.llamaindex.ai/llamacloud/getting_started/api_key\n\n"
                                            "AutoRAG do not store your API key.",)
        llama_key_status_box = gr.Textbox(label="Llama Cloud API status", value="Not Set", interactive=False)

    # Hidden until "upstage🇰🇷" is selected as the parsing method.
    with gr.Row(visible=False) as upstage_api_key_row:
        upstage_key_textbox = gr.Textbox(label="Please input your Upstage API key and press Enter.", type="password",
                                         info="You can get your API key from https://upstage.ai/\n\n"
                                              "AutoRAG do not store your API key.",)
        upstage_key_status_box = gr.Textbox(label="Upstage API status", value="Not Set", interactive=False)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## 1. Parse your PDF files\n\nUpload your pdf files and make it to raw.parquet.")
            document_file_input = gr.File(label="Upload Files", type="filepath", file_count="multiple")
            parse_choice = gr.Dropdown(
                ["pdfminer", "pdfplumber", "pypdfium2", "pypdf", "pymupdf", "llama-parse", "upstage🇰🇷"],
                label="Parsing Method", info="Choose parsing method that you want")
            parse_button = gr.Button(value="Run Parsing")
            parse_status = gr.Textbox(value="Not Started", interactive=False)
            raw_download_button = gr.Button(value="Download raw.parquet",
                                            link=f"/file={os.path.join(DATA_DIR, 'raw.parquet')}")
            file_reset_button = gr.Button(value="Reset uploaded files")

        with gr.Column(scale=1):
            gr.Markdown(
                "## 2. Chunk your raw.parquet\n\nUse parsed raw.parquet or upload your own. It will make a corpus.parquet."
            )
            raw_file_input = gr.File(label="Upload raw.parquet", type="filepath", file_count="single", visible=False)
            use_previous_raw_file = gr.Checkbox(label="Use previous raw.parquet", value=True)

            chunk_choice = gr.Dropdown(
                ["Token", "Sentence", "Semantic", "Recursive", "Konlpy🇰🇷"],
                label="Chunking Method", info="Choose chunking method that you want")
            chunk_size = gr.Slider(minimum=128, maximum=1024, step=128, label="Chunk Size", value=256)
            chunk_overlap = gr.Slider(minimum=16, maximum=256, step=16, label="Chunk Overlap", value=32)
            chunk_button = gr.Button(value="Run Chunking")
            chunk_status = gr.Textbox(value="Not Started", interactive=False)
            corpus_download_button = gr.Button(value="Download corpus.parquet",
                                               link=f"/file={os.path.join(DATA_DIR, 'corpus.parquet')}")

        with gr.Column(scale=1):
            gr.Markdown(
                "## 3. Create QA dataset from your corpus.parquet\n\nQA dataset is essential to run AutoRAG. Upload corpus.parquet & select QA method and run.")
            gr.HTML("<b style='color: red; background-color: black; font-weight: bold;'>Warning: QA Creation uses an OpenAI model, which can be costly. Start with a small batch to gauge expenses.</b>")
            corpus_file_input = gr.File(label="Upload corpus.parquet", type="filepath", file_count="single",
                                        visible=False)
            use_previous_corpus_file = gr.Checkbox(label="Use previous corpus.parquet", value=True)

            qa_choice = gr.Radio(["default", "fast", "advanced"], label="QA Method",
                                 info="Choose QA method that you want")
            model_choice = gr.Radio(["gpt-4o-mini", "gpt-4o"], label="Select model for data creation",
                                    )
            qa_cnt = gr.Slider(minimum=20, maximum=150, step=5, label="Number of QA pairs", value=80)
            batch_size = gr.Slider(minimum=1, maximum=16, step=1,
                                   label="Batch Size to OpenAI model. If there is an error, decrease this.", value=16)
            run_qa_button = gr.Button(value="Run QA Creation")
            qa_status = gr.Textbox(value="Not Started", interactive=False)
            qa_download_button = gr.Button(value="Download qa.parquet",
                                           link=f"/file={os.path.join(DATA_DIR, 'qa.parquet')}")

    #================================================================================================#
    # Logics

    # The upload widgets are only shown when the "use previous" box is unchecked.
    use_previous_raw_file.change(lambda x: gr.update(visible=not x), inputs=[use_previous_raw_file],
                                 outputs=[raw_file_input])
    use_previous_corpus_file.change(lambda x: gr.update(visible=not x), inputs=[use_previous_corpus_file],
                                    outputs=[corpus_file_input])

    # API key submission
    openai_key_textbox.submit(on_submit_openai_key, inputs=[openai_key_textbox], outputs=api_key_status_box)
    llama_key_textbox.submit(on_submit_llama_cloud_key, inputs=[llama_key_textbox],
                             outputs=llama_key_status_box)
    upstage_key_textbox.submit(on_submit_upstage_key, inputs=[upstage_key_textbox],
                               outputs=upstage_key_status_box)

    # Parsing
    parse_button.click(run_parse, inputs=[document_file_input, parse_choice], outputs=parse_status)
    file_reset_button.click(file_reset, outputs=parse_status)

    # Chunking
    chunk_button.click(run_chunk, inputs=[use_previous_raw_file, raw_file_input, chunk_choice, chunk_size, chunk_overlap,
                                          lang_choice],
                       outputs=chunk_status)

    # QA Creation
    run_qa_button.click(run_qa, inputs=[use_previous_corpus_file, corpus_file_input, qa_choice, model_choice, qa_cnt,
                                        batch_size, lang_choice], outputs=qa_status)

    # API Key visibility
    parse_choice.change(change_visible_status_api_key, inputs=[parse_choice],
                        outputs=[llama_cloud_api_key_row, upstage_api_key_row])


# allowed_paths lets the download buttons serve files from these directories.
demo.launch(share=False, debug=True, allowed_paths=[FILE_DIR, DATA_DIR])
packages.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gcc
2
+ poppler-utils
3
+ tesseract
4
+ pyOpenSSL
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ AutoRAG[parse,ko,ja]>=0.3.4
2
+ llama-index-llms-upstage
src/__init__.py ADDED
File without changes
src/create.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import gradio as gr
3
+ from autorag.data.qa.filter.passage_dependency import passage_dependency_filter_llama_index
4
+ from autorag.data.qa.query.llama_gen_query import factoid_query_gen
5
+ from autorag.data.qa.sample import random_single_hop
6
+ from autorag.data.qa.schema import Corpus, QA
7
+ from autorag.data.qa.generation_gt.llama_index_gen_gt import (
8
+ make_basic_gen_gt,
9
+ make_concise_gen_gt,
10
+ )
11
+ from autorag.data.qa.filter.dontknow import dontknow_filter_rule_based
12
+ from llama_index.core.base.llms.base import BaseLLM
13
+ from autorag.data.qa.evolve.llama_index_query_evolve import reasoning_evolve_ragas
14
+ from autorag.data.qa.evolve.llama_index_query_evolve import compress_ragas
15
+
16
+
17
def default_create(corpus_df, llm: BaseLLM, n: int = 100, lang: str = "en",
                   batch_size: int = 32,
                   progress=gr.Progress()) -> QA:
    """Build a QA dataset: sample passages, generate queries and answers, then filter.

    Args:
        corpus_df: Corpus dataframe wrapped in a ``Corpus`` instance.
        llm: LLM used for query/answer generation and for the
            passage-dependency filter.
        n: Number of passages to sample; capped at the corpus size.
        lang: Language code forwarded to every generation and filter step.
        batch_size: Batch size for the batched LLM steps.
        progress: Gradio progress tracker.

    Returns:
        The filtered QA instance.
    """
    corpus_instance = Corpus(corpus_df)
    # Never request more samples than the corpus holds.
    if len(corpus_instance.data) < n:
        n = len(corpus_instance.data)
    sampled_corpus = corpus_instance.sample(random_single_hop, n=n)
    mapped_corpus = sampled_corpus.map(lambda df: df.reset_index(drop=True))
    retrieval_gt_contents = mapped_corpus.make_retrieval_gt_contents()
    progress(0.05)
    query_generated = retrieval_gt_contents.batch_apply(factoid_query_gen, llm=llm, lang=lang, batch_size=batch_size)
    progress(0.2)
    basic_answers = query_generated.batch_apply(make_basic_gen_gt, llm=llm, lang=lang, batch_size=batch_size)
    progress(0.4)
    concise_answers = basic_answers.batch_apply(make_concise_gen_gt, llm=llm, lang=lang, batch_size=batch_size)
    progress(0.6)
    filtered_answers = concise_answers.filter(dontknow_filter_rule_based, lang=lang)
    progress(0.8)
    # The passage-dependency filter is LLM-based, so it needs the llm like the
    # other batched steps; it was previously called without one.
    initial_qa = filtered_answers.batch_filter(passage_dependency_filter_llama_index, llm=llm, lang=lang,
                                               batch_size=batch_size)
    progress(0.96)
    return initial_qa
38
+
39
+
40
def fast_create(corpus_df, llm: BaseLLM, n: int = 100, lang: str = "en",
                batch_size: int = 32,
                progress=gr.Progress()) -> QA:
    """Build a QA dataset quickly: sample, generate queries and answers, no filtering.

    Args:
        corpus_df: Corpus dataframe wrapped in a ``Corpus`` instance.
        llm: LLM used for query and answer generation.
        n: Number of passages to sample; capped at the corpus size.
        lang: Language code forwarded to every generation step.
        batch_size: Batch size for the batched LLM steps.
        progress: Gradio progress tracker.

    Returns:
        The QA instance after the concise-answer generation step.
    """
    corpus = Corpus(corpus_df)
    progress(0.05)
    # Never request more samples than the corpus holds.
    n = min(n, len(corpus.data))

    sampled = corpus.sample(random_single_hop, n=n)
    sampled = sampled.map(lambda df: df.reset_index(drop=True))
    progress(0.1)

    qa = sampled.make_retrieval_gt_contents()
    progress(0.2)

    # Each generation step runs in order and reports its own progress mark.
    generation_steps = (
        (factoid_query_gen, 0.3),
        (make_basic_gen_gt, 0.5),
        (make_concise_gen_gt, 0.75),
    )
    for step_fn, done_fraction in generation_steps:
        qa = qa.batch_apply(step_fn, llm=llm, lang=lang, batch_size=batch_size)
        progress(done_fraction)

    progress(0.9)
    return qa
68
+
69
+
70
def advanced_create(corpus_df, llm: BaseLLM, n: int = 100, lang: str = "en",
                    batch_size: int = 32,
                    progress=gr.Progress()) -> QA:
    """Build a QA dataset that mixes hard and easy questions.

    Runs the same generate-and-filter pipeline as ``default_create``, then
    evolves the first half of the surviving rows with ``reasoning_evolve_ragas``
    and the second half with ``compress_ragas`` before concatenating them.

    Args:
        corpus_df: Corpus dataframe wrapped in a ``Corpus`` instance.
        llm: LLM used for generation, filtering and query evolution.
        n: Number of passages to sample; capped at the corpus size.
        lang: Language code forwarded to every step.
        batch_size: Batch size for the batched LLM steps.
        progress: Gradio progress tracker.

    Returns:
        A QA instance linked to the full corpus.
    """
    corpus_instance = Corpus(corpus_df)
    # Never request more samples than the corpus holds.
    if len(corpus_instance.data) < n:
        n = len(corpus_instance.data)
    sampled_corpus = corpus_instance.sample(random_single_hop, n=n)
    mapped_corpus = sampled_corpus.map(lambda df: df.reset_index(drop=True))
    retrieval_gt_contents = mapped_corpus.make_retrieval_gt_contents()
    progress(0.05)
    query_generated = retrieval_gt_contents.batch_apply(factoid_query_gen, llm=llm, lang=lang, batch_size=batch_size)
    progress(0.15)
    basic_answers = query_generated.batch_apply(make_basic_gen_gt, llm=llm, lang=lang, batch_size=batch_size)
    progress(0.25)
    concise_answers = basic_answers.batch_apply(make_concise_gen_gt, llm=llm, lang=lang, batch_size=batch_size)
    progress(0.35)
    filtered_answers = concise_answers.filter(dontknow_filter_rule_based, lang=lang)
    progress(0.45)
    # The passage-dependency filter is LLM-based, so it needs the llm like the
    # other batched steps; it was previously called without one.
    initial_qa = filtered_answers.batch_filter(passage_dependency_filter_llama_index, llm=llm, lang=lang,
                                               batch_size=batch_size)
    progress(0.55)
    # Split on the number of rows that survived filtering, not on the
    # requested n; otherwise the filters shrink only the second half and the
    # hard/easy mix becomes lopsided.
    cut_idx = len(initial_qa.data) // 2
    reasoning_qa = initial_qa.map(lambda df: df.iloc[:cut_idx].reset_index(drop=True)).batch_apply(
        reasoning_evolve_ragas,
        llm=llm,
        lang=lang,
        batch_size=batch_size,
    )
    progress(0.75)
    compressed_qa = initial_qa.map(lambda df: df.iloc[cut_idx:].reset_index(drop=True)).batch_apply(
        compress_ragas,
        llm=llm,
        lang=lang,
        batch_size=batch_size,
    )
    progress(0.95)
    final_qa = QA(pd.concat([reasoning_qa.data, compressed_qa.data], ignore_index=True),
                  linked_corpus=corpus_instance)

    return final_qa
src/util.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ import os
3
+ import gradio as gr
4
+ from llama_index.core.base.llms.types import ChatResponse
5
+
6
+
7
def on_submit_openai_key(openai_key):
    """Validate an OpenAI API key with a tiny request and store it on success.

    The key is written to the ``OPENAI_API_KEY`` environment variable only
    after the test request succeeds. On any failure the variable is removed,
    so the "Not Set" status matches what later stages see.

    Args:
        openai_key: Key text submitted from the UI textbox.

    Returns:
        "Setting complete." when the key works, otherwise "Not Set".
    """
    openai_key = (openai_key or "").strip()

    def _reject(message):
        # gr.Error only has an effect when raised; gr.Warning shows a toast
        # while still letting the status textbox receive "Not Set".
        os.environ.pop("OPENAI_API_KEY", None)
        gr.Warning(message, duration=3)
        return "Not Set"

    if not openai_key:
        return _reject("OpenAI API key is invalid.")

    try:
        # Pass the key explicitly so nothing is stored before it is verified.
        client = openai.OpenAI(api_key=openai_key)
        response = client.chat.completions.create(
            messages=[
                {"role": "user", "content": "What is the capital of France?"},
            ],
            model="gpt-4o-mini",
            max_tokens=3,
        )
    except openai.AuthenticationError:
        return _reject("OpenAI API key is invalid.")
    except openai.OpenAIError:
        # Connection, rate-limit and other API failures previously escaped
        # the handler as unhandled exceptions.
        return _reject("OpenAI server is not working properly.")

    # Explicit check instead of assert, which is stripped under `python -O`.
    if not response.choices or not isinstance(response.choices[0].message.content, str):
        return _reject("OpenAI server is not working properly.")

    os.environ["OPENAI_API_KEY"] = openai_key
    gr.Info("OpenAI API key submitted.", duration=3)
    return "Setting complete."
28
+
29
def on_submit_llama_cloud_key(llama_cloud_key):
    """Store a Llama Cloud API key if a ``LlamaParse`` client can be built with it.

    Args:
        llama_cloud_key: Key text submitted from the UI textbox.

    Returns:
        "Setting complete." on success, otherwise "Not Set".
    """
    from llama_parse import LlamaParse

    llama_cloud_key = (llama_cloud_key or "").strip()
    if not llama_cloud_key:
        os.environ.pop("LLAMA_CLOUD_API_KEY", None)
        gr.Warning("LLAMA Cloud API key is invalid.", duration=3)
        return "Not Set"

    # The key is exported first because the client below is created without
    # an explicit key argument.
    os.environ["LLAMA_CLOUD_API_KEY"] = llama_cloud_key
    # Test llama cloud key
    # NOTE(review): this only constructs the client; presumably no request is
    # sent, so a malformed key may still pass - confirm.
    try:
        LlamaParse(
            result_type="markdown"  # "markdown" and "text" are available
        )
    except Exception:
        # Do not leave a rejected key in the environment; gr.Error only has
        # an effect when raised, so use gr.Warning for the toast.
        os.environ.pop("LLAMA_CLOUD_API_KEY", None)
        gr.Warning("LLAMA Cloud API key is invalid.", duration=3)
        return "Not Set"
    return "Setting complete."
41
+
42
+
43
def on_submit_upstage_key(upstage_key):
    """Validate an Upstage API key with a short chat request and store it.

    Args:
        upstage_key: Key text submitted from the UI textbox.

    Returns:
        "Setting complete." when the test chat returns non-empty text,
        otherwise "Not Set".
    """
    upstage_key = (upstage_key or "").strip()

    def _reject():
        # Do not leave a rejected key in the environment; gr.Error only has
        # an effect when raised, so use gr.Warning for the toast.
        os.environ.pop("UPSTAGE_API_KEY", None)
        gr.Warning("Upstage API key is invalid.", duration=3)
        return "Not Set"

    if not upstage_key:
        return _reject()

    # The key is exported first because the client below is created without
    # an explicit key argument.
    os.environ["UPSTAGE_API_KEY"] = upstage_key
    # Test upstage key
    try:
        from llama_index.llms.upstage import Upstage
        from llama_index.core.llms import ChatMessage

        llm = Upstage()

        # "max_tokens" was previously misspelled as "max_token".
        response: ChatResponse = llm.chat(messages=[
            ChatMessage(role="system", content="You are a helpful assistant."),
            ChatMessage(role="user", content="Hi, how are you?")
        ], max_tokens=3)
    except Exception:
        return _reject()

    # response.message is a ChatMessage, not a str, so the old isinstance
    # check on it always failed; the text lives in its .content attribute.
    content = response.message.content
    if not isinstance(content, str) or not content:
        return _reject()
    return "Setting complete."