Spaces:

cjber
/

planning-ai

Sleeping

App Files Files Community

cjber committed on Jan 17

Commit

1af4802

1 Parent(s): 134bc43

fix: simplify structured outputs which improves accuracy

Browse files

Files changed (9) hide show

planning_ai/chains/map_chain.py +8 -4
planning_ai/chains/policy_chain.py +12 -2
planning_ai/chains/prompts/map.txt +1 -1
planning_ai/chains/prompts/policy.txt +1 -3
planning_ai/graph.py +10 -26
planning_ai/nodes/hallucination_node.py +33 -70
planning_ai/nodes/map_node.py +29 -29
planning_ai/nodes/reduce_node.py +72 -97
planning_ai/states.py +8 -4

planning_ai/chains/map_chain.py CHANGED Viewed

@@ -2,7 +2,7 @@ from enum import Enum, auto
 from typing import Optional, Set, Type
 from langchain_core.prompts import ChatPromptTemplate
-from pydantic import BaseModel, create_model
 from planning_ai.common.utils import Paths
 from planning_ai.llms.llm import LLM
@@ -39,9 +39,11 @@ def create_brief_summary_model(policy_enum: Enum) -> Type[BaseModel]:
         Type[BaseModel]: A dynamically generated Pydantic model for BriefSummary.
     """
     DynamicPolicy = create_model(
         "DynamicPolicy",
-        policy=(policy_enum, ...),
         note=(str, ...),
         __config__={"extra": "forbid"},
     )
@@ -49,7 +51,9 @@ def create_brief_summary_model(policy_enum: Enum) -> Type[BaseModel]:
     return create_model(
         "DynamicBriefSummary",
         summary=(str, ...),
-        policies=(Optional[list[DynamicPolicy]], ...),
         __module__=__name__,
         __config__={"extra": "forbid"},
     )
@@ -82,7 +86,7 @@ if __name__ == "__main__":
     the major settlement of Cambourne has been created - now over the projected 3,000 homes and
     Papworth Everard has grown beyond recognition. This in itself is a matter of concern.
     """
-    test_themes = {"Great Places", "Homes"}
     dynamic_map_chain = create_dynamic_map_chain(test_themes, prompt=map_template)
     result = dynamic_map_chain.invoke({"context": test_document, "themes": test_themes})

 from typing import Optional, Set, Type
 from langchain_core.prompts import ChatPromptTemplate
+from pydantic import BaseModel, Field, create_model
 from planning_ai.common.utils import Paths
 from planning_ai.llms.llm import LLM
         Type[BaseModel]: A dynamically generated Pydantic model for BriefSummary.
     """
+    # NOTE: For some reason GPT4o goes mental if we use too much structure
     DynamicPolicy = create_model(
         "DynamicPolicy",
+        # policy=(policy_enum, ...),
+        policy=(str, ...),
         note=(str, ...),
         __config__={"extra": "forbid"},
     )
     return create_model(
         "DynamicBriefSummary",
         summary=(str, ...),
+        # policies=(Optional[list[DynamicPolicy]], ...),
+        policies=(Optional[list[str]], ...),
+        notes=(Optional[list[str]], ...),
         __module__=__name__,
         __config__={"extra": "forbid"},
     )
     the major settlement of Cambourne has been created - now over the projected 3,000 homes and
     Papworth Everard has grown beyond recognition. This in itself is a matter of concern.
     """
+    test_themes = {"Homes", "Great Places"}
     dynamic_map_chain = create_dynamic_map_chain(test_themes, prompt=map_template)
     result = dynamic_map_chain.invoke({"context": test_document, "themes": test_themes})

planning_ai/chains/policy_chain.py CHANGED Viewed

@@ -1,5 +1,5 @@
-from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from planning_ai.common.utils import Paths
 from planning_ai.llms.llm import LLM
@@ -8,8 +8,18 @@ with open(Paths.PROMPTS / "policy.txt", "r") as f:
     policy_template = f.read()
 policy_prompt = ChatPromptTemplate([("system", policy_template)])
-policy_chain = policy_prompt | LLM | StrOutputParser()
 if __name__ == "__main__":

 from langchain_core.prompts import ChatPromptTemplate
+from pydantic import BaseModel, Field
 from planning_ai.common.utils import Paths
 from planning_ai.llms.llm import LLM
     policy_template = f.read()
class PolicyMerger(BaseModel):
    """Return condensed details and their associated doc_ids"""

    # NOTE(review): the docstring above is kept verbatim on purpose.
    # Structured-output tooling commonly forwards a model's docstring to the
    # LLM as the schema description, so rewording it could change model
    # behaviour -- confirm before editing.

    # Condensed policy details, one entry per retained point.
    details: list[str] = Field(
        description="Condensed details, one complete sentence per entry."
    )
    # Parallel to `details`: doc_id[i] holds the source document IDs that
    # back details[i] (assumed from the docstring and the nested-list shape).
    doc_id: list[list[int]] = Field(
        description=(
            "For each entry in `details`, at the same position, the list of "
            "original document IDs that the detail was derived from."
        )
    )
+SLLM = LLM.with_structured_output(PolicyMerger, strict=True)
 policy_prompt = ChatPromptTemplate([("system", policy_template)])
+policy_chain = policy_prompt | SLLM
 if __name__ == "__main__":

planning_ai/chains/prompts/map.txt CHANGED Viewed

@@ -2,7 +2,7 @@ Please analyze the response to the planning application provided below. Your tas
 1. **Summary**: Provide a concise summary of the response, highlighting the main points and any significant details.
-2. **Policy Identification**: Thoroughly review the response and identify all relevant policies from the provided list. Focus on capturing policies that are explicitly mentioned or strongly implied. Prioritize general policies over specific ones when both are relevant. Avoid inferring new policies beyond those stated. Select **all** relevant policies, even if they seem minor.
 3. **Policy Notes**: For each identified policy, extract and list at least one verbatim section from the response that directly relates to it. Ensure the **full** context is retained so the section can be understood independently. Policy notes may overlap. If a note does not have a clear link to the policy, omit both the policy and the note.

 1. **Summary**: Provide a concise summary of the response, highlighting the main points and any significant details.
+2. **Policy Identification**: Carefully review the response and identify all relevant policies from the provided list. Focus on capturing policies that are explicitly mentioned or strongly implied. Avoid inferring new policies beyond those stated. Select **all** relevant policies, even if they seem minor.
 3. **Policy Notes**: For each identified policy, extract and list at least one verbatim section from the response that directly relates to it. Ensure the **full** context is retained so the section can be understood independently. Policy notes may overlap. If a note does not have a clear link to the policy, omit both the policy and the note.

planning_ai/chains/prompts/policy.txt CHANGED Viewed

@@ -6,9 +6,7 @@ You are tasked with refining a list of details related to a specific planning po
 4. Exclude any details that do not pertain to the policy.
 5. Disregard generic details that merely restate the policy.
-The remaining bullet points **must** be followed by **up to 5** references to the original document IDs. Each bullet point should include inline citations corresponding to all the numerical IDs associated with the original details.
-For example '- Impact of increased housing density on the character of Cambridge [1][2][11].'.
 Theme: {theme}

 4. Exclude any details that do not pertain to the policy.
 5. Disregard generic details that merely restate the policy.
+Ensure that all returned details use proper sentence structure.
 Theme: {theme}

planning_ai/graph.py CHANGED Viewed

@@ -4,13 +4,13 @@ from langgraph.graph import END, StateGraph
 from planning_ai.nodes.hallucination_node import (
     check_hallucination,
     fix_hallucination,
-    map_fix_hallucinations,
-    map_hallucinations,
 )
 from planning_ai.nodes.map_node import (
     add_entities,
     generate_summary,
-    map_summaries,
     retrieve_themes,
 )
 from planning_ai.nodes.reduce_node import generate_final_report
@@ -24,31 +24,15 @@ def create_graph():
     graph.add_node("generate_summary", generate_summary)
     graph.add_node("check_hallucination", check_hallucination)
     graph.add_node("fix_hallucination", fix_hallucination)
-    graph.add_node("generate_final_summary", generate_final_report)
     graph.add_edge(START, "add_entities")
-    graph.add_conditional_edges(
-        "add_entities",
-        map_summaries,
-        ["generate_summary"],
-    )
-    graph.add_conditional_edges(
-        "generate_summary",
-        map_hallucinations,
-        ["check_hallucination"],
-    )
-    graph.add_conditional_edges(
-        "check_hallucination",
-        map_fix_hallucinations,
-        ["fix_hallucination"],
-    )
-    graph.add_conditional_edges(
-        "fix_hallucination",
-        map_hallucinations,
-        ["check_hallucination"],
-    )
-    graph.add_edge("check_hallucination", "generate_final_summary")
     graph.add_edge("generate_final_summary", END)
     return graph.compile()

 from planning_ai.nodes.hallucination_node import (
     check_hallucination,
     fix_hallucination,
+    map_check,
+    map_fix,
 )
 from planning_ai.nodes.map_node import (
     add_entities,
     generate_summary,
+    map_documents,
     retrieve_themes,
 )
 from planning_ai.nodes.reduce_node import generate_final_report
     graph.add_node("generate_summary", generate_summary)
     graph.add_node("check_hallucination", check_hallucination)
     graph.add_node("fix_hallucination", fix_hallucination)
+    graph.add_node("generate_final_report", generate_final_report)
     graph.add_edge(START, "add_entities")
+    graph.add_conditional_edges("add_entities", map_documents, ["generate_summary"])
+    graph.add_conditional_edges("generate_summary", map_check, ["check_hallucination"])
+    graph.add_conditional_edges("check_hallucination", map_fix, ["fix_hallucination"])
+    graph.add_conditional_edges("fix_hallucination", map_check, ["check_hallucination"])
+    graph.add_edge("check_hallucination", "generate_final_report")
     graph.add_edge("generate_final_summary", END)
     return graph.compile()

planning_ai/nodes/hallucination_node.py CHANGED Viewed

@@ -2,14 +2,12 @@ import json
 import logging
 from langchain_core.exceptions import OutputParserException
 from langgraph.types import Send
 from pydantic import BaseModel
 from planning_ai.chains.fix_chain import fix_template
-from planning_ai.chains.hallucination_chain import (
-    HallucinationChecker,
-    hallucination_chain,
-)
 from planning_ai.chains.map_chain import create_dynamic_map_chain
 from planning_ai.states import DocumentState, OverallState
@@ -19,12 +17,7 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
-class BasicSummaryBroken(BaseModel):
-    summary: str
-    policies: None
-ITERATIONS = 2
 def check_hallucination(state: DocumentState):
@@ -43,47 +36,35 @@ def check_hallucination(state: DocumentState):
         that need to be addressed.
     """
     logger.warning(f"Checking hallucinations for document {state['filename']}")
-    # Stop trying after 2 iterations
-    if state["iteration"] > ITERATIONS:
-        state["iteration"] = 99
-        state["hallucination"].score = 1
-        return {"documents": [state]}
     try:
         response = hallucination_chain.invoke(
             {"document": state["document"], "summary": state["summary"].summary}
         )
     except (OutputParserException, json.JSONDecodeError) as e:
         logger.error(f"Failed to decode JSON: {e}.")
-        state["iteration"] = 99
-        state["hallucination"] = HallucinationChecker(score=1, explanation="INVALID")
-        state["summary"] = BasicSummaryBroken(summary="INVALID", policies=None)
-        return {"documents": [state]}
-    if response.score == 1:
-        return {"documents": [{**state, "hallucination": response}]}
-    return {
-        "documents": [
-            {**state, "hallucination": response, "iteration": state["iteration"] + 1}
-        ]
-    }
-def map_hallucinations(state: OverallState):
-    """Maps summaries to the `check_hallucination` function.
-    This function prepares a list of summaries to be checked for hallucinations by
-    sending them to the `check_hallucination` function. Allows summaries to be checked
-    in parrallel.
-    Args:
-        state (OverallState): The overall state containing all summaries.
-    Returns:
-        list: A list of Send objects directing each summary to the check_hallucination
-        function.
-    """
-    return [Send("check_hallucination", document) for document in state["documents"]]
 def fix_hallucination(state: DocumentState):
@@ -112,35 +93,17 @@ def fix_hallucination(state: DocumentState):
         )
     except (OutputParserException, json.JSONDecodeError) as e:
         logger.error(f"Failed to decode JSON: {e}.")
-        state["iteration"] = 99
-        state["hallucination"] = HallucinationChecker(score=1, explanation="INVALID")
-        state["summary"] = BasicSummaryBroken(summary="INVALID", policies=None)
-        return {"documents": [state]}
-    state["summary"] = response  # type: ignore
-    return {"documents": [state]}
-def map_fix_hallucinations(state: OverallState):
-    """Maps hallucinations to the `fix_hallucination` function.
-    This function filters out hallucinations that need fixing and prepares them to be
-    sent to the `fix_hallucination` function. Allows hallucinations to be fixed in
-    parrallel.
-    Args:
-        state (OverallState): The overall state containing all hallucinations.
-    Returns:
-        list: A list of Send objects directing each hallucination to the
-        fix_hallucination function.
-    """
-    hallucinations = []
-    if "documents" in state:
-        hallucinations = [
-            document
-            for document in state["documents"]
-            if document["hallucination"].score != 1
-        ]
     return [
-        Send("fix_hallucination", hallucination) for hallucination in hallucinations
     ]

 import logging
 from langchain_core.exceptions import OutputParserException
+from langgraph.constants import END
 from langgraph.types import Send
 from pydantic import BaseModel
 from planning_ai.chains.fix_chain import fix_template
+from planning_ai.chains.hallucination_chain import hallucination_chain
 from planning_ai.chains.map_chain import create_dynamic_map_chain
 from planning_ai.states import DocumentState, OverallState
 logger = logging.getLogger(__name__)
+MAX_ATTEMPTS = 3
 def check_hallucination(state: DocumentState):
         that need to be addressed.
     """
     logger.warning(f"Checking hallucinations for document {state['filename']}")
+    if (state["refinement_attempts"] >= MAX_ATTEMPTS) or state["processed"]:
+        logger.warning(f"Max attempts exceeded for document: {state['filename']}")
+        return {"documents": [{**state, "failed": True, "processed": True}]}
+    elif not state["is_hallucinated"]:
+        logger.warning(f"Finished processing document: {state['filename']}")
+        return {"documents": [{**state, "processed": True}]}
     try:
         response = hallucination_chain.invoke(
             {"document": state["document"], "summary": state["summary"].summary}
         )
+        is_hallucinated = response.score == 0
+        refinement_attempts = state["refinement_attempts"] + 1
+        out = {
+            **state,
+            "hallucination": response,
+            "refinement_attempts": refinement_attempts,
+            "is_hallucinated": is_hallucinated,
+        }
+        logger.warning(f"Hallucination: {is_hallucinated}")
+        return (
+            {"documents": [{**out, "processed": False}]}
+            if is_hallucinated
+            else {"documents": [{**out, "processed": True}]}
+        )
     except (OutputParserException, json.JSONDecodeError) as e:
         logger.error(f"Failed to decode JSON: {e}.")
+        return {"documents": [{**state, "failed": True, "processed": True}]}
 def fix_hallucination(state: DocumentState):
         )
     except (OutputParserException, json.JSONDecodeError) as e:
         logger.error(f"Failed to decode JSON: {e}.")
+        return {"documents": [{**state, "failed": True, "processed": True}]}
+    return {"documents": [{**state, "summary": response}]}
def map_check(state: OverallState):
    """Fan every document out to the `check_hallucination` node.

    Args:
        state (OverallState): Overall state; each entry of
            `state["documents"]` becomes the payload of one `Send`.

    Returns:
        list: One `Send("check_hallucination", ...)` per document, in the
        order the documents appear in the state.
    """
    sends = []
    for document in state["documents"]:
        sends.append(Send("check_hallucination", document))
    return sends
def map_fix(state: OverallState):
    """Dispatch still-hallucinated documents to the `fix_hallucination` node.

    A document is sent only when it is flagged `is_hallucinated` and is not
    yet `processed`. The flags are read with `.get` so that a document whose
    state never received them (for example one returned by an error path
    that only sets `failed` and `processed`) is skipped instead of raising
    `KeyError`.

    Args:
        state (OverallState): Overall state holding the per-document states
            under `"documents"`.

    Returns:
        list: One `Send("fix_hallucination", ...)` per document that still
        needs fixing; empty when nothing is left to fix.
    """
    return [
        Send("fix_hallucination", doc)
        for doc in state["documents"]
        if doc.get("is_hallucinated") and not doc.get("processed")
    ]

planning_ai/nodes/map_node.py CHANGED Viewed

@@ -3,6 +3,7 @@ import logging
 import spacy
 from langchain_core.exceptions import OutputParserException
 from langgraph.types import Send
 from presidio_analyzer import AnalyzerEngine
 from presidio_anonymizer import AnonymizerEngine
@@ -19,26 +20,14 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
-class BasicSummaryBroken(BaseModel):
-    summary: str
-    policies: None
 analyzer = AnalyzerEngine()
 anonymizer = AnonymizerEngine()
 nlp = spacy.load("en_core_web_lg")
-def _return_summary_error(state):
-    state["iteration"] = 99
-    state["hallucination"] = HallucinationChecker(score=1, explanation="INVALID")
-    state["summary"] = BasicSummaryBroken(summary="INVALID", policies=None)
-    return {"documents": [state]}
 def retrieve_themes(state: DocumentState) -> DocumentState:
-    result = themes_chain.invoke({"document": state["document"]})
     if not result.themes:
         state["themes"] = set()
         return state
@@ -102,29 +91,40 @@ def generate_summary(state: DocumentState) -> dict:
     state = retrieve_themes(state)
     if not state["themes"]:
-        return _return_summary_error(state)
     map_chain = create_dynamic_map_chain(themes=state["themes"], prompt=map_template)
     try:
         response = map_chain.invoke({"context": state["document"].page_content})
     except (OutputParserException, json.JSONDecodeError) as e:
         logger.error(f"Failed to decode JSON: {e}.")
-        return _return_summary_error(state)
     logger.warning(f"Summary generation completed for document: {state['filename']}")
-    return {"documents": [{**state, "summary": response, "iteration": 1}]}
-def map_summaries(state: OverallState) -> list[Send]:
-    """Maps documents to the `generate_summary` function for processing.
-    This function prepares a list of documents to be summarized by sending them to the
-    `generate_summary` function. It allows for parallel processing of document summaries.
-    Args:
-        state (OverallState): The overall state containing all documents and their filenames.
-    Returns:
-        list: A list of Send objects directing each document to the `generate_summary`
-        function.
-    """
     logger.warning("Mapping documents to generate summaries.")
     return [Send("generate_summary", document) for document in state["documents"]]

 import spacy
 from langchain_core.exceptions import OutputParserException
+from langgraph.constants import END
 from langgraph.types import Send
 from presidio_analyzer import AnalyzerEngine
 from presidio_anonymizer import AnonymizerEngine
 logger = logging.getLogger(__name__)
 analyzer = AnalyzerEngine()
 anonymizer = AnonymizerEngine()
 nlp = spacy.load("en_core_web_lg")
 def retrieve_themes(state: DocumentState) -> DocumentState:
+    result = themes_chain.invoke({"document": state["document"].page_content})
     if not result.themes:
         state["themes"] = set()
         return state
     state = retrieve_themes(state)
     if not state["themes"]:
+        logger.error("No themes found.")
+        return {
+            "documents": [
+                {
+                    **state,
+                    "summary": "",
+                    "processed": True,
+                    "is_hallucinated": True,
+                    "failed": True,
+                    "refinement_attempts": 0,
+                }
+            ]
+        }
     map_chain = create_dynamic_map_chain(themes=state["themes"], prompt=map_template)
     try:
         response = map_chain.invoke({"context": state["document"].page_content})
     except (OutputParserException, json.JSONDecodeError) as e:
         logger.error(f"Failed to decode JSON: {e}.")
+        return {"documents": [{**state, "failed": True, "processed": True}]}
     logger.warning(f"Summary generation completed for document: {state['filename']}")
+    return {
+        "documents": [
+            {
+                **state,
+                "summary": response,
+                "refinement_attempts": 0,
+                "is_hallucinated": True,  # start true to ensure cycle begins
+                "failed": False,
+                "processed": False,
+            }
+        ]
+    }
def map_documents(state: OverallState) -> list[Send]:
    """Fan every document out to the `generate_summary` node.

    Args:
        state (OverallState): Overall state; each entry of
            `state["documents"]` becomes the payload of one `Send`.

    Returns:
        list[Send]: One `Send("generate_summary", ...)` per document.
    """
    logger.warning("Mapping documents to generate summaries.")
    sends = []
    for doc in state["documents"]:
        sends.append(Send("generate_summary", doc))
    return sends

planning_ai/nodes/reduce_node.py CHANGED Viewed

@@ -15,46 +15,6 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
-def extract_policies_from_docs(docs):
-    policies = {"themes": [], "policies": [], "details": [], "stance": []}
-    for doc in docs:
-        if not doc["summary"].policies:
-            continue
-        for policy in doc["summary"].policies:
-            for theme, p in THEMES_AND_POLICIES.items():
-                if policy.policy.name in p:
-                    policies["themes"].append(theme)
-                    policies["policies"].append(policy.policy.name)
-                    policies["details"].append(
-                        f"{policy.note} [{doc['document'].metadata['index']}]"
-                    )
-                    policies["stance"].append(
-                        doc["document"].metadata["representations_support/object"]
-                    )
-    df = pl.DataFrame(policies)
-    grouped = df.group_by(["themes", "policies", "stance"]).agg(pl.col("details"))
-    return grouped
-def filter_final_documents(state: OverallState):
-    return [doc for doc in state["documents"] if doc["hallucination"].score == 1]
-def filter_docs(final_docs):
-    out_docs = []
-    for doc in final_docs:
-        if (
-            (doc["summary"].summary != "INVALID")
-            and (doc["themes"] != set())
-            and (doc["iteration"] != 99)
-        ):
-            doc["summary"].summary = (
-                f"Document ID: [{doc['document'].metadata['index']}]\n\n{doc['summary'].summary}"
-            )
-            out_docs.append(doc)
-    return out_docs
 def save_summaries_to_json(docs):
     """Saves summaries to JSON files.
@@ -69,12 +29,12 @@ def save_summaries_to_json(docs):
             "entities": doc["entities"],
             "themes": list(doc["themes"]),
             "summary": doc["summary"].model_dump()["summary"],
-            "policies": [
-                {"policy": policy["policy"].name, "note": policy["note"]}
-                for policy in (doc["summary"].model_dump().get("policies", []) or [])
-            ],
-            "iteration": doc["iteration"],
             "hallucination": doc["hallucination"].model_dump(),
         }
         for doc in docs
     ]
@@ -84,6 +44,41 @@ def save_summaries_to_json(docs):
             json.dump(doc, f)
 def batch_generate_executive_summaries(summaries):
     """Processes summaries to generate final responses.
@@ -101,7 +96,7 @@ def batch_generate_executive_summaries(summaries):
     batch_size = 50
     for i in range(0, len(summaries_text), batch_size):
         logger.warning(
-            f"Processing batches... {i/50}/{len(summaries_text)//batch_size}"
         )
         batch = summaries_text[i : i + batch_size]
         response = reduce_chain.invoke({"context": batch})
@@ -110,62 +105,42 @@ def batch_generate_executive_summaries(summaries):
 def generate_policy_output(policy_groups):
-    policies_support = []
-    policies_object = []
-    for _, policy in policy_groups.group_by(["themes", "policies"]):
-        logger.warning("Processing policies.")
-        bullets = "* " + "* \n".join(policy["details"][0])
-        pchain_out = policy_chain.invoke(
-            {"policy": policy["policies"][0], "bullet_points": bullets}
         )
-        if policy["stance"][0] == "Support":
-            policies_support.append(
-                {
-                    "theme": policy["themes"][0],
-                    "policy": policy["policies"][0],
-                    "points": pchain_out,
-                }
-            )
-        else:
-            policies_object.append(
-                {
-                    "theme": policy["themes"][0],
-                    "policy": policy["policies"][0],
-                    "points": pchain_out,
-                }
-            )
-    return policies_support, policies_object
-def format_themes(policies):
-    themes = ""
-    for theme, policies in pl.DataFrame(policies).group_by("theme"):
-        themes += f"### {theme[0]}\n\n"
-        for row in policies.iter_rows(named=True):
-            themes += f"\n#### {row['policy']}\n\n"
-            themes += f"{row['points']}\n"
-        themes += "\n"
-    return themes
 def generate_final_report(state: OverallState):
-    logger.warning("Generating final summary")
-    final_docs = filter_final_documents(state)
-    logger.warning(f"Number of final docs: {len(final_docs)}")
     if len(final_docs) == state["n_docs"]:
-        docs = filter_docs(final_docs)
-        save_summaries_to_json(docs)
-        policy_groups = extract_policies_from_docs(docs)
-        policies_support, policies_object = generate_policy_output(policy_groups)
-        batch_executive = batch_generate_executive_summaries(docs)
-        executive = reduce_chain.invoke({"context": "\n\n".join(batch_executive)})
-        return {
-            "executive": executive,
-            "documents": final_docs,
-            "policies_support": format_themes(policies_support),
-            "policies_object": format_themes(policies_object),
-        }

 logger = logging.getLogger(__name__)
 def save_summaries_to_json(docs):
     """Saves summaries to JSON files.
             "entities": doc["entities"],
             "themes": list(doc["themes"]),
             "summary": doc["summary"].model_dump()["summary"],
+            "policies": doc["policies"],
+            "notes": doc["notes"],
+            "refinement_attempts": doc["refinement_attempts"],
             "hallucination": doc["hallucination"].model_dump(),
+            "is_hallucinated": doc["is_hallucinated"],
+            "failed": doc["failed"],
         }
         for doc in docs
     ]
             json.dump(doc, f)
def extract_policies_from_docs(docs):
    """Flatten per-document policy/note pairs into one polars DataFrame.

    For every document, `summary.policies` and `summary.notes` are walked in
    lockstep. Each (policy, note) pair yields one row for every theme in
    `THEMES_AND_POLICIES` whose policy collection contains the policy; pairs
    whose policy matches no theme are dropped.

    Documents with no policies or no notes are skipped. Documents whose two
    lists differ in length are also skipped, with a warning, because the
    policy/note pairing cannot be trusted. (Previously this case hit a
    leftover `ipdb` breakpoint and then raised `AssertionError`.)

    Args:
        docs: Iterable of document states. Each must provide
            `doc["summary"].policies`, `doc["summary"].notes` and
            `doc["document"].metadata` with the keys `"index"` and
            `"representations_support/object"`.

    Returns:
        pl.DataFrame: Columns "doc_id", "themes", "policies", "details" and
        "stance", one row per matched (policy, theme) pair.
    """
    policies = {"doc_id": [], "themes": [], "policies": [], "details": [], "stance": []}
    for doc in docs:
        summary = doc["summary"]
        if not summary.policies or not summary.notes:
            continue
        if len(summary.policies) != len(summary.notes):
            # The model returned misaligned lists; zipping them would attach
            # notes to the wrong policies, so leave this document out.
            logger.warning(
                "Skipping policies for document %s: %d policies but %d notes.",
                doc["document"].metadata.get("index"),
                len(summary.policies),
                len(summary.notes),
            )
            continue
        for policy, note in zip(summary.policies, summary.notes):
            for theme, theme_policies in THEMES_AND_POLICIES.items():
                if policy not in theme_policies:
                    continue
                # Metadata is read only on a match, as before, so documents
                # without any matching policy never need these keys.
                metadata = doc["document"].metadata
                policies["doc_id"].append(metadata["index"])
                policies["themes"].append(theme)
                policies["policies"].append(policy)
                policies["details"].append(note)
                policies["stance"].append(metadata["representations_support/object"])
    return pl.DataFrame(policies)
def add_doc_id(final_docs):
    """Prefix each document's summary text with its document ID.

    The summary object is updated in place so later stages see the ID
    header. The prefix is added only when it is not already present, so
    running this twice over the same documents does not stack headers.

    Args:
        final_docs: Iterable of document states providing
            `doc["summary"].summary` (text) and
            `doc["document"].metadata["index"]` (the document ID).

    Returns:
        list: A new list containing the same document objects, in order.
    """
    out_docs = []
    for doc in final_docs:
        summary = doc["summary"]
        prefix = f"Document ID: [{doc['document'].metadata['index']}]\n\n"
        if not summary.summary.startswith(prefix):
            summary.summary = f"{prefix}{summary.summary}"
        out_docs.append(doc)
    return out_docs
 def batch_generate_executive_summaries(summaries):
     """Processes summaries to generate final responses.
     batch_size = 50
     for i in range(0, len(summaries_text), batch_size):
         logger.warning(
+            f"Processing batches... {int(i/50)+1}/{(len(summaries_text)//batch_size)+1}"
         )
         batch = summaries_text[i : i + batch_size]
         response = reduce_chain.invoke({"context": batch})
 def generate_policy_output(policy_groups):
+    out = []
+    for policy in (
+        policy_groups.group_by(["themes", "policies", "stance"])
+        .agg(pl.col("details"), pl.col("doc_id"))
+        .rows(named=True)
+    ):
+        logger.warning(f"Processing policies: {policy['policies']}...")
+        reduced = policy_chain.invoke(
+            {
+                "theme": policy["themes"],
+                "policy": policy["policies"],
+                "stance": policy["stance"],
+                "details": policy["details"],
+                "doc_id": policy["doc_id"],
+            }
         )
+        out.append(policy | reduced.dict())
+    return pl.DataFrame(out)
 def generate_final_report(state: OverallState):
+    final_docs = [doc for doc in state["documents"] if doc["processed"]]
     if len(final_docs) == state["n_docs"]:
+        logging.warning(f"Generating final report... ({len(final_docs)} documents)")
+        return final_output(final_docs)
def final_output(final_docs):
    """Assemble the report payload from the processed documents.

    Documents flagged `failed` are dropped, the remainder get their ID
    header via `add_doc_id`, and then the policy table and the executive
    summary are produced from them.

    Args:
        final_docs: Processed document states, each with a `"failed"` flag.

    Returns:
        dict: `"executive"` (output of `reduce_chain`), `"documents"` (the
        non-failed documents) and `"policies"` (output of
        `generate_policy_output`).
    """
    usable = add_doc_id([doc for doc in final_docs if not doc["failed"]])
    policies = generate_policy_output(extract_policies_from_docs(usable))
    partial_summaries = batch_generate_executive_summaries(usable)
    combined = "\n\n".join(partial_summaries)
    executive = reduce_chain.invoke({"context": combined})
    return {"executive": executive, "documents": usable, "policies": policies}

planning_ai/states.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from pathlib import Path
 from typing import Annotated, TypedDict
 from langchain_core.documents import Document
 from pydantic import BaseModel
@@ -14,16 +15,19 @@ class DocumentState(TypedDict):
     entities: list[dict]
     themes: set[str]
     summary: BaseModel
     hallucination: HallucinationChecker
-    iteration: int
 class OverallState(TypedDict):
     executive: str
-    documents: Annotated[list[DocumentState], filename_reducer]
-    policies_support: str
-    policies_object: str
     n_docs: int

 from pathlib import Path
 from typing import Annotated, TypedDict
+import polars as pl
 from langchain_core.documents import Document
 from pydantic import BaseModel
     entities: list[dict]
     themes: set[str]
     summary: BaseModel
     hallucination: HallucinationChecker
+    is_hallucinated: bool
+    refinement_attempts: int
+    failed: bool
+    processed: bool
 class OverallState(TypedDict):
+    documents: Annotated[list, filename_reducer]
     executive: str
+    policies: pl.DataFrame
     n_docs: int