File size: 4,397 Bytes
cf0997e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77c3530
cf0997e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77c3530
cf0997e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import pandas as pd
import gradio as gr
from autorag.data.qa.filter.passage_dependency import passage_dependency_filter_llama_index
from autorag.data.qa.query.llama_gen_query import factoid_query_gen
from autorag.data.qa.sample import random_single_hop
from autorag.data.qa.schema import Corpus, QA
from autorag.data.qa.generation_gt.llama_index_gen_gt import (
    make_basic_gen_gt,
    make_concise_gen_gt,
)
from autorag.data.qa.filter.dontknow import dontknow_filter_rule_based
from llama_index.core.base.llms.base import BaseLLM
from autorag.data.qa.evolve.llama_index_query_evolve import reasoning_evolve_ragas
from autorag.data.qa.evolve.llama_index_query_evolve import compress_ragas


def default_create(corpus_df, llm: BaseLLM, n: int = 100, lang: str = "en",
				   batch_size: int = 32,
				   progress=gr.Progress()) -> QA:
	"""
	Build a QA set from a corpus and filter the generated answers.

	Steps, in order: sample with ``random_single_hop``, attach retrieval
	ground-truth contents, generate queries with ``factoid_query_gen``,
	generate answers with ``make_basic_gen_gt`` and ``make_concise_gen_gt``,
	then filter with ``dontknow_filter_rule_based`` and
	``passage_dependency_filter_llama_index``.

	:param corpus_df: Data handed to ``Corpus(...)``.
		Presumably a pandas DataFrame in the AutoRAG corpus schema - confirm with caller.
	:param llm: LLM instance forwarded to every LLM-backed step.
	:param n: Number of samples to request. Clamped to ``len(Corpus(corpus_df).data)``.
	:param lang: Language code forwarded to the generation and filter steps.
	:param batch_size: Batch size forwarded to ``batch_apply`` / ``batch_filter``.
	:param progress: Callable invoked with a completion fraction after each stage.
		The last value reported here is 0.96.
	:return: The QA object produced by the final filter step.
	"""
	corpus = Corpus(corpus_df)
	# Never request more samples than the corpus holds.
	n = min(n, len(corpus.data))
	# Keyword arguments shared by every LLM-backed step.
	llm_kwargs = dict(llm=llm, lang=lang, batch_size=batch_size)

	sampled = corpus.sample(random_single_hop, n=n)
	reindexed = sampled.map(lambda frame: frame.reset_index(drop=True))
	with_contents = reindexed.make_retrieval_gt_contents()
	progress(0.05)

	with_queries = with_contents.batch_apply(factoid_query_gen, **llm_kwargs)
	progress(0.2)

	with_basic = with_queries.batch_apply(make_basic_gen_gt, **llm_kwargs)
	progress(0.4)

	with_concise = with_basic.batch_apply(make_concise_gen_gt, **llm_kwargs)
	progress(0.6)

	answerable = with_concise.filter(dontknow_filter_rule_based, lang=lang)
	progress(0.8)

	passage_independent = answerable.batch_filter(passage_dependency_filter_llama_index, **llm_kwargs)
	progress(0.96)
	return passage_independent


def fast_create(corpus_df, llm: BaseLLM, n: int = 100, lang: str = "en",
				batch_size: int = 32,
				progress=gr.Progress()) -> QA:
	"""
	Build a QA set from a corpus without any filtering step.

	Steps, in order: sample with ``random_single_hop``, attach retrieval
	ground-truth contents, generate queries with ``factoid_query_gen``,
	then generate answers with ``make_basic_gen_gt`` and
	``make_concise_gen_gt``. The result of the concise-answer step is
	returned as-is.

	:param corpus_df: Data handed to ``Corpus(...)``.
		Presumably a pandas DataFrame in the AutoRAG corpus schema - confirm with caller.
	:param llm: LLM instance forwarded to every generation step.
	:param n: Number of samples to request. Clamped to ``len(Corpus(corpus_df).data)``.
	:param lang: Language code forwarded to the generation steps.
	:param batch_size: Batch size forwarded to ``batch_apply``.
	:param progress: Callable invoked with a completion fraction after each stage.
		The last value reported here is 0.9.
	:return: The QA object produced by the concise-answer generation step.
	"""
	corpus = Corpus(corpus_df)
	progress(0.05)

	# Never request more samples than the corpus holds.
	n = min(n, len(corpus.data))
	# Keyword arguments shared by every generation step.
	gen_kwargs = dict(llm=llm, lang=lang, batch_size=batch_size)

	reindexed = corpus.sample(random_single_hop, n=n).map(
		lambda frame: frame.reset_index(drop=True)
	)
	progress(0.1)

	with_contents = reindexed.make_retrieval_gt_contents()
	progress(0.2)

	with_queries = with_contents.batch_apply(factoid_query_gen, **gen_kwargs)
	progress(0.3)

	with_basic = with_queries.batch_apply(make_basic_gen_gt, **gen_kwargs)
	progress(0.5)

	with_concise = with_basic.batch_apply(make_concise_gen_gt, **gen_kwargs)
	progress(0.75)

	# No filtering in the fast path: the concise answers are the final result.
	progress(0.9)
	return with_concise


def advanced_create(corpus_df, llm: BaseLLM, n: int = 100, lang: str = "en",
					batch_size: int = 32,
					progress=gr.Progress()) -> QA:
	"""
	Build a filtered QA set and evolve its queries into a mix of hard and easy questions.

	The first stages match the default pipeline: sample with
	``random_single_hop``, attach retrieval ground-truth contents, generate
	queries and basic/concise answers, then filter with
	``dontknow_filter_rule_based`` and
	``passage_dependency_filter_llama_index``. The surviving rows are then
	split in two: the first half is passed through ``reasoning_evolve_ragas``
	and the remainder through ``compress_ragas``. Both parts are concatenated
	into the returned QA.

	:param corpus_df: Data handed to ``Corpus(...)``.
		Presumably a pandas DataFrame in the AutoRAG corpus schema - confirm with caller.
	:param llm: LLM instance forwarded to every LLM-backed step.
	:param n: Number of samples to request. Clamped to ``len(Corpus(corpus_df).data)``.
	:param lang: Language code forwarded to the generation, filter and evolve steps.
	:param batch_size: Batch size forwarded to ``batch_apply`` / ``batch_filter``.
	:param progress: Callable invoked with a completion fraction after each stage.
		The last value reported here is 0.95.
	:return: A new QA holding the reasoning-evolved rows followed by the
		compressed rows, linked to the full corpus built from ``corpus_df``.
	"""
	corpus_instance = Corpus(corpus_df)
	# Never request more samples than the corpus holds.
	n = min(n, len(corpus_instance.data))
	# Keyword arguments shared by every LLM-backed step.
	llm_kwargs = dict(llm=llm, lang=lang, batch_size=batch_size)

	sampled_corpus = corpus_instance.sample(random_single_hop, n=n)
	mapped_corpus = sampled_corpus.map(lambda df: df.reset_index(drop=True))
	retrieval_gt_contents = mapped_corpus.make_retrieval_gt_contents()
	progress(0.05)
	query_generated = retrieval_gt_contents.batch_apply(factoid_query_gen, **llm_kwargs)
	progress(0.15)
	basic_answers = query_generated.batch_apply(make_basic_gen_gt, **llm_kwargs)
	progress(0.25)
	concise_answers = basic_answers.batch_apply(make_concise_gen_gt, **llm_kwargs)
	progress(0.35)
	filtered_answers = concise_answers.filter(dontknow_filter_rule_based, lang=lang)
	progress(0.45)
	initial_qa = filtered_answers.batch_filter(passage_dependency_filter_llama_index, **llm_kwargs)
	progress(0.55)

	# Split on the number of rows that survived filtering, not on the requested
	# n: the filters may drop rows, and splitting on n would skew the mix or
	# leave the second part empty when n // 2 or fewer rows remain.
	cut_idx = len(initial_qa.data) // 2

	reasoning_qa = initial_qa.map(lambda df: df.iloc[:cut_idx]).batch_apply(
		reasoning_evolve_ragas,
		**llm_kwargs,
	)
	progress(0.75)
	compressed_qa = (
		initial_qa.map(lambda df: df.iloc[cut_idx:])
		.map(lambda df: df.reset_index(drop=True))
		.batch_apply(
			compress_ragas,
			**llm_kwargs,
		)
	)
	progress(0.95)

	# Rebuild a single QA from both parts with a fresh 0..N-1 index.
	final_qa = QA(pd.concat([reasoning_qa.data, compressed_qa.data], ignore_index=True),
				  linked_corpus=corpus_instance)

	return final_qa