feat: add eval comparison
planning_ai/eval/compare_summaries.py
ADDED
@@ -0,0 +1,58 @@
import polars as pl
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field

from planning_ai.common.utils import Paths
from planning_ai.llms.llm import GPT4o

# Prompt templates: one for judging a pair of summaries, one for
# generating a fresh summary to compare against.
with open("./planning_ai/eval/eval.txt", "r") as f:
    compare_template = f.read()

with open("./planning_ai/eval/summary.txt", "r") as f:
    summary_template = f.read()


class SummaryEvaluator(BaseModel):
    score: int = Field(..., description="The number of the best summary.")


# Judge LLM constrained to the structured SummaryEvaluator schema.
SLLM = GPT4o.with_structured_output(SummaryEvaluator, strict=True)

compare_prompt = ChatPromptTemplate([("system", compare_template)])
compare_chain = compare_prompt | SLLM

summary_prompt = ChatPromptTemplate([("system", summary_template)])
summary_chain = summary_prompt | GPT4o | StrOutputParser()


# Sample 20 unique responses (excluding attachments) along with their
# existing summaries.
original = pl.read_parquet(Paths.STAGING / "gcpt3.parquet").filter(
    pl.col("attachments_id").is_null()
)
summaries1 = original[["text", "representations_summary"]].unique().head(20)

# Generate a second summary for each response using the summary chain.
summaries2 = summaries1[["text"]]
summaries2 = summaries2.with_columns(
    pl.col("text")
    .map_elements(
        lambda x: summary_chain.invoke({"content": x}), return_dtype=pl.String
    )
    .alias("summary")
)

# Ask the judge to score each (document, summary 1, summary 2) triple.
summaries = summaries1.join(summaries2, on="text")
summaries = summaries.with_columns(
    pl.struct(["text", "representations_summary", "summary"])
    .map_elements(
        lambda x: compare_chain.invoke(
            {
                "document": x["text"],
                "summary_1": x["representations_summary"],
                "summary_2": x["summary"],
            }
        ).score,
        return_dtype=pl.Int8,
    )
    .alias("score")
)
print(summaries["score"].value_counts())
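The final `value_counts()` only reports raw counts per score. A small follow-up sketch (assuming the script above has already run, and reading scores against the 0-3 rubric defined in eval.txt below) could convert those counts into preference rates:

# Sketch only: interpret the judge's scores as preference rates.
# Assumes `summaries` and `pl` from the script above.
n = summaries.height
rates = {
    "neither sufficient (0)": summaries.filter(pl.col("score") == 0).height / n,
    "existing summary preferred (1)": summaries.filter(pl.col("score") == 1).height / n,
    "new summary preferred (2)": summaries.filter(pl.col("score") == 2).height / n,
    "both sufficient (3)": summaries.filter(pl.col("score") == 3).height / n,
}
for label, rate in rates.items():
    print(f"{label}: {rate:.0%}")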
planning_ai/eval/eval.txt
ADDED
@@ -0,0 +1,32 @@
**Task:**
You are grading **two** text summaries of a source document to determine which more comprehensively captures its key points.

### **Evaluation Criteria:**
A good summary should:
1. **Be accurate** – It should not include information that is not present in the source document.
2. **Be comprehensive** – It should reflect all key points in the source document without omitting important details.
3. **Be well-grounded** – It should be based entirely on the source document without adding interpretations, opinions, or external information.

### **Scoring System:**
- **Score 0:** Neither summary sufficiently captures the key points.
- **Score 1:** The first summary better reflects the content of the source document.
- **Score 2:** The second summary better reflects the content of the source document.
- **Score 3:** Both summaries are sufficiently accurate and comprehensive.

### **Evaluation Process:**
1. **Compare each summary to the source document.** Identify whether each summary includes all key points, omits critical details, or introduces extraneous information.
2. **Assess which summary better aligns with the source document.** Determine whether one summary is significantly more accurate and comprehensive.

---

**Source Document:**
{document}

**Summary 1:**
{summary_1}

**Summary 2:**
{summary_2}

**Final Score (0-3):**
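The rubric pins the score to the range 0-3, but the `SummaryEvaluator` model in compare_summaries.py accepts any integer. A minimal sketch of a tighter schema (an assumption, not part of this commit; `Field` supports `ge`/`le` bounds in both pydantic v1 and v2):

from pydantic import BaseModel, Field


class SummaryEvaluator(BaseModel):
    # Mirror the prompt's rubric: 0 = neither sufficient, 1 = summary 1
    # wins, 2 = summary 2 wins, 3 = both sufficient.
    score: int = Field(..., ge=0, le=3, description="The number of the best summary.")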
planning_ai/eval/summary.txt
ADDED
@@ -0,0 +1,5 @@
Please analyze the response to the planning application provided below. Provide a concise summary of the response, highlighting the main points and any significant details.

Response:

{content}
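As a quick smoke test, this template can be exercised on its own (a sketch reusing `summary_chain` from compare_summaries.py; the sample response text is invented):

# Sketch: exercise the summary prompt on a made-up planning response.
sample = (
    "We object to the proposed development due to the loss of open green "
    "space and the additional traffic it would bring to Mill Road."
)
print(summary_chain.invoke({"content": sample}))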