cjber committed
Commit d5b7cf9 · Parent: d98a335

feat: add eval comparison

planning_ai/eval/compare_summaries.py ADDED
@@ -0,0 +1,63 @@
+ import polars as pl
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.prompts import ChatPromptTemplate
+ from pydantic import BaseModel, Field
+
+ from planning_ai.common.utils import Paths
+ from planning_ai.llms.llm import GPT4o
+
+ # Prompt templates: one for judging summary pairs, one for writing new summaries.
+ with open("./planning_ai/eval/eval.txt", "r") as f:
+     compare_template = f.read()
+
+ with open("./planning_ai/eval/summary.txt", "r") as f:
+     summary_template = f.read()
+
+
+ class SummaryEvaluator(BaseModel):
+     score: int = Field(..., description="The number of the best summary.")
+
+
+ # Structured output forces the judge to return a machine-readable integer score.
+ SLLM = GPT4o.with_structured_output(SummaryEvaluator, strict=True)
+
+ compare_prompt = ChatPromptTemplate([("system", compare_template)])
+ compare_chain = compare_prompt | SLLM
+
+ summary_prompt = ChatPromptTemplate([("system", summary_template)])
+ summary_chain = summary_prompt | GPT4o | StrOutputParser()
+
+
+ # Keep only responses without attachments, so "text" holds the full source document.
+ original = pl.read_parquet(Paths.STAGING / "gcpt3.parquet").filter(
+     pl.col("attachments_id").is_null()
+ )
+ summaries1 = original[["text", "representations_summary"]].unique().head(20)
+
+ # Produce a second, independent summary of each document with the summary chain.
+ summaries2 = summaries1[["text"]]
+ summaries2 = summaries2.with_columns(
+     pl.col("text")
+     .map_elements(
+         lambda x: summary_chain.invoke({"content": x}), return_dtype=pl.String
+     )
+     .alias("summary")
+ )
+
+ # Score each pair of summaries against its source document (see eval.txt).
+ summaries = summaries1.join(summaries2, on="text")
+ summaries = summaries.with_columns(
+     pl.struct(["text", "representations_summary", "summary"])
+     .map_elements(
+         lambda x: compare_chain.invoke(
+             {
+                 "document": x["text"],
+                 "summary_1": x["representations_summary"],
+                 "summary_2": x["summary"],
+             }
+         ).score,
+         return_dtype=pl.Int8,
+     )
+     .alias("score")
+ )
+ print(summaries["score"].value_counts())
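One schema note: eval.txt (added below) defines a 0-3 scale, but `score` is typed as a plain `int`, so any integer would pass validation. A minimal sketch of a tighter model, assuming a `typing.Literal` is acceptable to the structured-output backend (a suggestion, not part of this commit):

from typing import Literal

from pydantic import BaseModel, Field


class SummaryEvaluator(BaseModel):
    # Literal narrows the JSON schema to the four verdicts defined in
    # eval.txt, so the judge cannot return an out-of-range score.
    score: Literal[0, 1, 2, 3] = Field(..., description="The number of the best summary.")

Pairwise judges can also favour one position; scoring each pair a second time with `summary_1` and `summary_2` swapped, and keeping only consistent verdicts, is a cheap guard worth considering.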
planning_ai/eval/eval.txt ADDED
@@ -0,0 +1,32 @@
+ **Task:**
+ You are grading **two** text summaries of a source document to determine which summary more comprehensively captures the key points within the source document.
+
+ ### **Evaluation Criteria:**
+ A good summary should:
+ 1. **Be accurate** – It should not include information that is not present in the source document.
+ 2. **Be comprehensive** – It should reflect all key points in the source document without omitting important details.
+ 3. **Be well-grounded** – It should be based entirely on the source document without adding interpretations, opinions, or external information.
+
+ ### **Scoring System:**
+ - **Score 0:** Neither summary sufficiently captures the key points.
+ - **Score 1:** The first summary better reflects the content of the source document.
+ - **Score 2:** The second summary better reflects the content of the source document.
+ - **Score 3:** Both summaries are sufficiently accurate and comprehensive.
+
+ ### **Evaluation Process:**
+ 1. **Compare each summary to the source document.** Identify whether each summary includes all key points, omits critical details, or introduces extraneous information.
+ 2. **Assess which summary better aligns with the source document.** Determine whether one summary is significantly more accurate and comprehensive.
+
+ ---
+
+ **Source Document:**
+ {document}
+
+ **Summary 1:**
+ {summary_1}
+
+ **Summary 2:**
+ {summary_2}
+
+ **Final Score (0-3):**
+
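The three placeholders above are filled by `compare_chain.invoke` in `compare_summaries.py`. A quick smoke test, with invented inputs rather than rows from the dataset:

# Hypothetical inputs; not drawn from the gcpt3 parquet.
result = compare_chain.invoke(
    {
        "document": "The application proposes 40 homes and a new cycle path on Mill Road.",
        "summary_1": "Forty homes and a cycle path on Mill Road are proposed.",
        "summary_2": "The application concerns a retail development.",
    }
)
print(result.score)  # expected verdict: 1 (only summary_1 reflects the document)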
planning_ai/eval/summary.txt ADDED
@@ -0,0 +1,5 @@
+ Please analyze the response to the planning application provided below. Provide a concise summary of the response, highlighting the main points and any significant details.
+
+ Response:
+
+ {content}
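This template has a single `{content}` placeholder and is consumed by `summary_chain` in the script above. A one-off invocation, again with a hypothetical response text:

# Hypothetical planning response; any free-text representation works here.
text = "I object to this application because it will increase traffic on Mill Road."
print(summary_chain.invoke({"content": text}))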