Spaces:
Running
Running
import pandas as pd | |
import gradio as gr | |
from autorag.data.qa.filter.passage_dependency import passage_dependency_filter_llama_index | |
from autorag.data.qa.query.llama_gen_query import factoid_query_gen | |
from autorag.data.qa.sample import random_single_hop | |
from autorag.data.qa.schema import Corpus, QA | |
from autorag.data.qa.generation_gt.llama_index_gen_gt import ( | |
make_basic_gen_gt, | |
make_concise_gen_gt, | |
) | |
from autorag.data.qa.filter.dontknow import dontknow_filter_rule_based | |
from llama_index.core.base.llms.base import BaseLLM | |
from autorag.data.qa.evolve.llama_index_query_evolve import reasoning_evolve_ragas | |
from autorag.data.qa.evolve.llama_index_query_evolve import compress_ragas | |
def default_create(corpus_df, llm: BaseLLM, n: int = 100, lang: str = "en",
                   batch_size: int = 32,
                   progress=gr.Progress()) -> QA:
    """
    Run the standard QA-creation pipeline over a corpus.

    Steps, in order: sample passages with ``random_single_hop``, build the
    retrieval ground-truth contents, generate factoid queries, generate basic
    and then concise answers, apply the rule-based "don't know" filter, and
    finally apply the LLM-based passage-dependency filter.

    :param corpus_df: Corpus data handed directly to ``Corpus(...)``.
    :param llm: LLM forwarded to every LLM-backed generation/filter step.
    :param n: Number of passages to sample; capped at the corpus size.
    :param lang: Language code forwarded to every generation/filter step.
    :param batch_size: Batch size forwarded to ``batch_apply``/``batch_filter``.
    :param progress: Callable invoked with a float progress fraction after
        each stage.
    :return: The result of the final ``batch_filter`` call.
    """

    def _drop_index(df):
        return df.reset_index(drop=True)

    # Keyword arguments shared by every LLM-backed step of the pipeline.
    llm_opts = dict(llm=llm, lang=lang, batch_size=batch_size)

    corpus = Corpus(corpus_df)
    # Never ask the sampler for more rows than the corpus holds.
    n = min(n, len(corpus.data))
    seeds = (
        corpus.sample(random_single_hop, n=n)
        .map(_drop_index)
        .make_retrieval_gt_contents()
    )
    progress(0.05)

    with_queries = seeds.batch_apply(factoid_query_gen, **llm_opts)
    progress(0.2)

    with_basic_gt = with_queries.batch_apply(make_basic_gen_gt, **llm_opts)
    progress(0.4)

    with_concise_gt = with_basic_gt.batch_apply(make_concise_gen_gt, **llm_opts)
    progress(0.6)

    answerable = with_concise_gt.filter(dontknow_filter_rule_based, lang=lang)
    progress(0.8)

    result = answerable.batch_filter(passage_dependency_filter_llama_index, **llm_opts)
    progress(0.96)
    return result
def fast_create(corpus_df, llm: BaseLLM, n: int = 100, lang: str = "en",
                batch_size: int = 32,
                progress=gr.Progress()) -> QA:
    """
    Run the QA-creation pipeline without any filtering stage.

    Steps, in order: sample passages with ``random_single_hop``, build the
    retrieval ground-truth contents, generate factoid queries, then generate
    basic and concise answers. No filter is applied to the result.

    :param corpus_df: Corpus data handed directly to ``Corpus(...)``.
    :param llm: LLM forwarded to every generation step.
    :param n: Number of passages to sample; capped at the corpus size.
    :param lang: Language code forwarded to every generation step.
    :param batch_size: Batch size forwarded to ``batch_apply``.
    :param progress: Callable invoked with a float progress fraction after
        each stage.
    :return: The output of the concise-answer generation step.
    """

    def _drop_index(df):
        return df.reset_index(drop=True)

    # Keyword arguments shared by every LLM-backed step of the pipeline.
    llm_opts = dict(llm=llm, lang=lang, batch_size=batch_size)

    corpus = Corpus(corpus_df)
    progress(0.05)

    # Never ask the sampler for more rows than the corpus holds.
    n = min(n, len(corpus.data))
    sampled = corpus.sample(random_single_hop, n=n).map(_drop_index)
    progress(0.1)

    seeds = sampled.make_retrieval_gt_contents()
    progress(0.2)

    with_queries = seeds.batch_apply(factoid_query_gen, **llm_opts)
    progress(0.3)

    with_basic_gt = with_queries.batch_apply(make_basic_gen_gt, **llm_opts)
    progress(0.5)

    with_concise_gt = with_basic_gt.batch_apply(make_concise_gen_gt, **llm_opts)
    progress(0.75)

    # No filtering here: the concise answers are returned as-is.
    progress(0.9)
    return with_concise_gt
def advanced_create(corpus_df, llm: BaseLLM, n: int = 100, lang: str = "en",
                    batch_size: int = 32,
                    progress=gr.Progress()) -> QA:
    """
    Mix hard and easy questions.

    Runs the same stages as the default pipeline (sampling, retrieval
    ground truth, factoid query generation, basic and concise answers, the
    rule-based "don't know" filter and the passage-dependency filter), then
    splits the surviving rows in two: the first half is passed through
    ``reasoning_evolve_ragas`` and the second half through ``compress_ragas``.
    The two halves are concatenated into a single ``QA`` linked to the
    original corpus.

    :param corpus_df: Corpus data handed directly to ``Corpus(...)``.
    :param llm: LLM forwarded to every LLM-backed generation/filter step.
    :param n: Number of passages to sample; capped at the corpus size.
    :param lang: Language code forwarded to every generation/filter step.
    :param batch_size: Batch size forwarded to ``batch_apply``/``batch_filter``.
    :param progress: Callable invoked with a float progress fraction after
        each stage.
    :return: ``QA`` holding the evolved first half followed by the
        compressed second half, linked to the input corpus.
    """
    corpus_instance = Corpus(corpus_df)
    # Never ask the sampler for more rows than the corpus holds.
    n = min(n, len(corpus_instance.data))
    sampled_corpus = corpus_instance.sample(random_single_hop, n=n)
    mapped_corpus = sampled_corpus.map(lambda df: df.reset_index(drop=True))
    retrieval_gt_contents = mapped_corpus.make_retrieval_gt_contents()
    progress(0.05)
    query_generated = retrieval_gt_contents.batch_apply(
        factoid_query_gen, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.15)
    basic_answers = query_generated.batch_apply(
        make_basic_gen_gt, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.25)
    concise_answers = basic_answers.batch_apply(
        make_concise_gen_gt, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.35)
    filtered_answers = concise_answers.filter(dontknow_filter_rule_based, lang=lang)
    progress(0.45)
    initial_qa = filtered_answers.batch_filter(
        passage_dependency_filter_llama_index, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.55)

    # Split on the rows that actually survived the two filters. Using the
    # requested n here would skew the mix (or leave the second half empty)
    # whenever the filters removed rows.
    cut_idx = len(initial_qa.data) // 2

    # Both halves get a fresh 0-based index before being handed to batch_apply.
    reasoning_qa = (
        initial_qa.map(lambda df: df.iloc[:cut_idx])
        .map(lambda df: df.reset_index(drop=True))
        .batch_apply(
            reasoning_evolve_ragas,
            llm=llm,
            lang=lang,
            batch_size=batch_size,
        )
    )
    progress(0.75)
    compressed_qa = (
        initial_qa.map(lambda df: df.iloc[cut_idx:])
        .map(lambda df: df.reset_index(drop=True))
        .batch_apply(
            compress_ragas,
            llm=llm,
            lang=lang,
            batch_size=batch_size,
        )
    )
    progress(0.95)

    final_qa = QA(
        pd.concat([reasoning_qa.data, compressed_qa.data], ignore_index=True),
        linked_corpus=corpus_instance,
    )
    return final_qa