import pandas as pd
import gradio as gr
from autorag.data.qa.filter.passage_dependency import passage_dependency_filter_llama_index
from autorag.data.qa.query.llama_gen_query import factoid_query_gen
from autorag.data.qa.sample import random_single_hop
from autorag.data.qa.schema import Corpus, QA
from autorag.data.qa.generation_gt.llama_index_gen_gt import (
    make_basic_gen_gt,
    make_concise_gen_gt,
)
from autorag.data.qa.filter.dontknow import dontknow_filter_rule_based
from llama_index.core.base.llms.base import BaseLLM
from autorag.data.qa.evolve.llama_index_query_evolve import reasoning_evolve_ragas
from autorag.data.qa.evolve.llama_index_query_evolve import compress_ragas


def default_create(
    corpus_df,
    llm: BaseLLM,
    n: int = 100,
    lang: str = "en",
    batch_size: int = 32,
    # A ``gr.Progress()`` default is how Gradio injects its progress tracker;
    # it is intentionally an instance, not ``None``.
    progress=gr.Progress(),
) -> QA:
    """Create a QA dataset with query/answer generation followed by filtering.

    Pipeline: sample up to ``n`` passages with ``random_single_hop``, build the
    retrieval ground-truth contents, generate queries with
    ``factoid_query_gen``, generate answers with ``make_basic_gen_gt`` and
    ``make_concise_gen_gt``, then filter with ``dontknow_filter_rule_based``
    and ``passage_dependency_filter_llama_index``.

    Args:
        corpus_df: Corpus data handed directly to ``Corpus(...)`` (presumably a
            pandas DataFrame in AutoRAG's corpus schema -- confirm with caller).
        llm: LLM forwarded to every generation / LLM-based filter step.
        n: Requested number of passages to sample; clamped to the corpus size.
        lang: Language code forwarded to every step.
        batch_size: Batch size forwarded to the batched steps.
        progress: Callable invoked with a fraction after each stage.

    Returns:
        The filtered ``QA`` instance. It may hold fewer than ``n`` rows because
        the two filters can drop rows.
    """
    corpus_instance = Corpus(corpus_df)
    # Never ask the sampler for more rows than the corpus contains.
    if len(corpus_instance.data) < n:
        n = len(corpus_instance.data)
    sampled_corpus = corpus_instance.sample(random_single_hop, n=n)
    mapped_corpus = sampled_corpus.map(lambda df: df.reset_index(drop=True))
    retrieval_gt_contents = mapped_corpus.make_retrieval_gt_contents()
    progress(0.05)

    query_generated = retrieval_gt_contents.batch_apply(
        factoid_query_gen, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.2)

    basic_answers = query_generated.batch_apply(
        make_basic_gen_gt, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.4)

    concise_answers = basic_answers.batch_apply(
        make_concise_gen_gt, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.6)

    filtered_answers = concise_answers.filter(dontknow_filter_rule_based, lang=lang)
    progress(0.8)

    initial_qa = filtered_answers.batch_filter(
        passage_dependency_filter_llama_index,
        llm=llm,
        lang=lang,
        batch_size=batch_size,
    )
    progress(0.96)
    return initial_qa


def fast_create(
    corpus_df,
    llm: BaseLLM,
    n: int = 100,
    lang: str = "en",
    batch_size: int = 32,
    # Gradio convention: the default must be a ``gr.Progress()`` instance.
    progress=gr.Progress(),
) -> QA:
    """Create a QA dataset with generation only, skipping both filter steps.

    Runs the same sampling, query generation and answer generation steps as
    ``default_create`` but applies no filtering afterwards.

    Args:
        corpus_df: Corpus data handed directly to ``Corpus(...)`` (presumably a
            pandas DataFrame in AutoRAG's corpus schema -- confirm with caller).
        llm: LLM forwarded to every generation step.
        n: Requested number of passages to sample; clamped to the corpus size.
        lang: Language code forwarded to every step.
        batch_size: Batch size forwarded to the batched steps.
        progress: Callable invoked with a fraction after each stage.

    Returns:
        The unfiltered ``QA`` instance produced by ``make_concise_gen_gt``.
    """
    corpus_instance = Corpus(corpus_df)
    progress(0.05)

    # Never ask the sampler for more rows than the corpus contains.
    if len(corpus_instance.data) < n:
        n = len(corpus_instance.data)
    sampled_corpus = corpus_instance.sample(random_single_hop, n=n)
    mapped_corpus = sampled_corpus.map(lambda df: df.reset_index(drop=True))
    progress(0.1)

    retrieval_gt_contents = mapped_corpus.make_retrieval_gt_contents()
    progress(0.2)

    query_generated = retrieval_gt_contents.batch_apply(
        factoid_query_gen, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.3)

    basic_answers = query_generated.batch_apply(
        make_basic_gen_gt, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.5)

    concise_answers = basic_answers.batch_apply(
        make_concise_gen_gt, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.75)

    progress(0.9)
    return concise_answers


def advanced_create(
    corpus_df,
    llm: BaseLLM,
    n: int = 100,
    lang: str = "en",
    batch_size: int = 32,
    # Gradio convention: the default must be a ``gr.Progress()`` instance.
    progress=gr.Progress(),
) -> QA:
    """Create a QA dataset that mixes hard and easy questions.

    Builds the same filtered QA set as ``default_create``, then splits it in
    two: the first half is passed through ``reasoning_evolve_ragas`` and the
    second half through ``compress_ragas``. The two halves are concatenated
    into a new ``QA`` linked to the original corpus.

    Args:
        corpus_df: Corpus data handed directly to ``Corpus(...)`` (presumably a
            pandas DataFrame in AutoRAG's corpus schema -- confirm with caller).
        llm: LLM forwarded to every generation / filter / evolve step.
        n: Requested number of passages to sample; clamped to the corpus size.
        lang: Language code forwarded to every step.
        batch_size: Batch size forwarded to the batched steps.
        progress: Callable invoked with a fraction after each stage.

    Returns:
        A ``QA`` whose data is the reasoning-evolved rows followed by the
        compressed rows, with a fresh 0-based index.
    """
    corpus_instance = Corpus(corpus_df)
    # Never ask the sampler for more rows than the corpus contains.
    if len(corpus_instance.data) < n:
        n = len(corpus_instance.data)
    sampled_corpus = corpus_instance.sample(random_single_hop, n=n)
    mapped_corpus = sampled_corpus.map(lambda df: df.reset_index(drop=True))
    retrieval_gt_contents = mapped_corpus.make_retrieval_gt_contents()
    progress(0.05)

    query_generated = retrieval_gt_contents.batch_apply(
        factoid_query_gen, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.15)

    basic_answers = query_generated.batch_apply(
        make_basic_gen_gt, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.25)

    concise_answers = basic_answers.batch_apply(
        make_concise_gen_gt, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.35)

    filtered_answers = concise_answers.filter(dontknow_filter_rule_based, lang=lang)
    progress(0.45)

    initial_qa = filtered_answers.batch_filter(
        passage_dependency_filter_llama_index,
        llm=llm,
        lang=lang,
        batch_size=batch_size,
    )
    progress(0.55)

    # Split on the number of rows that actually survived the two filters, not
    # on the requested ``n``: the filters can drop rows, and cutting at
    # ``n // 2`` would then leave the compressed half undersized or empty.
    cut_idx = len(initial_qa.data) // 2

    reasoning_qa = initial_qa.map(lambda df: df.iloc[:cut_idx]).batch_apply(
        reasoning_evolve_ragas,
        llm=llm,
        lang=lang,
        batch_size=batch_size,
    )
    progress(0.75)

    compressed_qa = (
        initial_qa.map(lambda df: df.iloc[cut_idx:])
        .map(lambda df: df.reset_index(drop=True))
        .batch_apply(
            compress_ragas,
            llm=llm,
            lang=lang,
            batch_size=batch_size,
        )
    )
    progress(0.95)

    final_qa = QA(
        pd.concat([reasoning_qa.data, compressed_qa.data], ignore_index=True),
        linked_corpus=corpus_instance,
    )
    return final_qa