# File-viewer header captured with the source (not part of the module):
# last commit 77c3530 "Fix data creation error" by jeffrey; file size 4.4 kB.
import pandas as pd
import gradio as gr
from autorag.data.qa.filter.passage_dependency import passage_dependency_filter_llama_index
from autorag.data.qa.query.llama_gen_query import factoid_query_gen
from autorag.data.qa.sample import random_single_hop
from autorag.data.qa.schema import Corpus, QA
from autorag.data.qa.generation_gt.llama_index_gen_gt import (
make_basic_gen_gt,
make_concise_gen_gt,
)
from autorag.data.qa.filter.dontknow import dontknow_filter_rule_based
from llama_index.core.base.llms.base import BaseLLM
from autorag.data.qa.evolve.llama_index_query_evolve import reasoning_evolve_ragas
from autorag.data.qa.evolve.llama_index_query_evolve import compress_ragas
def default_create(corpus_df, llm: BaseLLM, n: int = 100, lang: str = "en",
                   batch_size: int = 32,
                   progress=gr.Progress()) -> QA:
    """Create a QA dataset from a corpus with the default pipeline.

    Steps, in order: sample with ``random_single_hop``, reset the index,
    build retrieval ground-truth contents, apply ``factoid_query_gen``,
    ``make_basic_gen_gt`` and ``make_concise_gen_gt``, then filter with
    ``dontknow_filter_rule_based`` and ``passage_dependency_filter_llama_index``.

    Args:
        corpus_df: Data handed to ``Corpus`` to build the corpus instance.
        llm: LLM forwarded to every LLM-backed step.
        n: Number of rows to sample; clamped to the corpus size.
        lang: Language code forwarded to every step.
        batch_size: Batch size forwarded to the batched steps.
        progress: Callable invoked with a completion fraction after each stage.

    Returns:
        The QA object produced by the final filtering step.
    """
    corpus = Corpus(corpus_df)
    # Never request more samples than the corpus holds.
    n = min(n, len(corpus.data))

    def _reset_index(df):
        return df.reset_index(drop=True)

    seeds = (
        corpus.sample(random_single_hop, n=n)
        .map(_reset_index)
        .make_retrieval_gt_contents()
    )
    progress(0.05)

    # Keyword arguments shared by every LLM-backed batched step.
    llm_kwargs = {"llm": llm, "lang": lang, "batch_size": batch_size}

    with_queries = seeds.batch_apply(factoid_query_gen, **llm_kwargs)
    progress(0.2)

    with_basic_gt = with_queries.batch_apply(make_basic_gen_gt, **llm_kwargs)
    progress(0.4)

    with_concise_gt = with_basic_gt.batch_apply(make_concise_gen_gt, **llm_kwargs)
    progress(0.6)

    answerable = with_concise_gt.filter(dontknow_filter_rule_based, lang=lang)
    progress(0.8)

    result = answerable.batch_filter(
        passage_dependency_filter_llama_index, **llm_kwargs
    )
    progress(0.96)
    return result
def fast_create(corpus_df, llm: BaseLLM, n: int = 100, lang: str = "en",
                batch_size: int = 32,
                progress=gr.Progress()) -> QA:
    """Create a QA dataset from a corpus, skipping the filtering steps.

    Steps, in order: sample with ``random_single_hop``, reset the index,
    build retrieval ground-truth contents, then apply ``factoid_query_gen``,
    ``make_basic_gen_gt`` and ``make_concise_gen_gt``. No filter is applied.

    Args:
        corpus_df: Data handed to ``Corpus`` to build the corpus instance.
        llm: LLM forwarded to every LLM-backed step.
        n: Number of rows to sample; clamped to the corpus size.
        lang: Language code forwarded to every step.
        batch_size: Batch size forwarded to the batched steps.
        progress: Callable invoked with a completion fraction after each stage.

    Returns:
        The QA object produced by the ``make_concise_gen_gt`` step.
    """
    corpus = Corpus(corpus_df)
    progress(0.05)

    # Never request more samples than the corpus holds.
    n = min(n, len(corpus.data))

    def _reset_index(df):
        return df.reset_index(drop=True)

    sampled = corpus.sample(random_single_hop, n=n).map(_reset_index)
    progress(0.1)

    seeds = sampled.make_retrieval_gt_contents()
    progress(0.2)

    # Keyword arguments shared by every LLM-backed batched step.
    llm_kwargs = {"llm": llm, "lang": lang, "batch_size": batch_size}

    with_queries = seeds.batch_apply(factoid_query_gen, **llm_kwargs)
    progress(0.3)

    with_basic_gt = with_queries.batch_apply(make_basic_gen_gt, **llm_kwargs)
    progress(0.5)

    with_concise_gt = with_basic_gt.batch_apply(make_concise_gen_gt, **llm_kwargs)
    progress(0.75)

    # No filtering stage here; the concise-answer output is the final result.
    progress(0.9)
    return with_concise_gt
def advanced_create(corpus_df, llm: BaseLLM, n: int = 100, lang: str = "en",
                    batch_size: int = 32,
                    progress=gr.Progress()) -> QA:
    """Create a QA dataset that mixes hard and easy questions.

    Runs the same steps as the default pipeline (sample, query generation,
    basic and concise answer generation, two filters). The surviving rows are
    then split in half: the first half goes through ``reasoning_evolve_ragas``
    and the second half through ``compress_ragas``. Both halves are
    concatenated into one QA object.

    Args:
        corpus_df: Data handed to ``Corpus`` to build the corpus instance.
        llm: LLM forwarded to every LLM-backed step.
        n: Number of rows to sample; clamped to the corpus size.
        lang: Language code forwarded to every step.
        batch_size: Batch size forwarded to the batched steps.
        progress: Callable invoked with a completion fraction after each stage.

    Returns:
        A QA object holding the evolved rows followed by the compressed rows,
        linked to the corpus built from ``corpus_df``.
    """
    corpus_instance = Corpus(corpus_df)
    # Never request more samples than the corpus holds.
    if len(corpus_instance.data) < n:
        n = len(corpus_instance.data)

    sampled_corpus = corpus_instance.sample(random_single_hop, n=n)
    mapped_corpus = sampled_corpus.map(lambda df: df.reset_index(drop=True))
    retrieval_gt_contents = mapped_corpus.make_retrieval_gt_contents()
    progress(0.05)

    query_generated = retrieval_gt_contents.batch_apply(
        factoid_query_gen, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.15)

    basic_answers = query_generated.batch_apply(
        make_basic_gen_gt, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.25)

    concise_answers = basic_answers.batch_apply(
        make_concise_gen_gt, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.35)

    filtered_answers = concise_answers.filter(dontknow_filter_rule_based, lang=lang)
    progress(0.45)

    initial_qa = filtered_answers.batch_filter(
        passage_dependency_filter_llama_index,
        llm=llm,
        lang=lang,
        batch_size=batch_size,
    )
    progress(0.55)

    # Split on the number of rows that actually survived the filters, not on
    # the requested ``n``. The filters can drop rows, and cutting at n // 2
    # would then leave the second half undersized or empty.
    cut_idx = len(initial_qa.data) // 2

    reasoning_qa = initial_qa.map(lambda df: df.iloc[:cut_idx]).batch_apply(
        reasoning_evolve_ragas,
        llm=llm,
        lang=lang,
        batch_size=batch_size,
    )
    progress(0.75)

    compressed_qa = (
        initial_qa.map(lambda df: df.iloc[cut_idx:])
        .map(lambda df: df.reset_index(drop=True))
        .batch_apply(
            compress_ragas,
            llm=llm,
            lang=lang,
            batch_size=batch_size,
        )
    )
    progress(0.95)

    final_qa = QA(
        pd.concat([reasoning_qa.data, compressed_qa.data], ignore_index=True),
        linked_corpus=corpus_instance,
    )
    return final_qa