Spaces:

cjber
/

planning-ai

Sleeping

cjber committed on Feb 18

Commit

a3397bd

1 Parent(s): 891c408

docs: add docstrings

Former-commit-id: 911e225c96e47b8d034b22d01b39c349fec66480 [formerly bb5a07ee701317833fadcd936f5a3ca7dd5dc00a]
Former-commit-id: 25e8de686c9a6ffc9d589923e1e53e26e5ee1dbb

Files changed (13) hide show

planning_ai/chains/map_chain.py +2 -2
planning_ai/chains/policy_chain.py +1 -1
planning_ai/common/utils.py +18 -3
planning_ai/documents/document.py +41 -3
planning_ai/eval/compare_summaries.py +30 -1
planning_ai/graph.py +1 -1
planning_ai/nodes/hallucination_node.py +18 -0
planning_ai/nodes/map_node.py +35 -2
planning_ai/nodes/reduce_node.py +1 -1
planning_ai/preprocessing/gcpt3.py +0 -1
planning_ai/preprocessing/prompts/ocr.txt +0 -10
planning_ai/retrievers/theme_retriever.py +0 -84
planning_ai/states.py +0 -2

planning_ai/chains/map_chain.py CHANGED Viewed

@@ -1,8 +1,8 @@
 from enum import Enum, auto
-from typing import Optional, Set, Type
 from langchain_core.prompts import ChatPromptTemplate
-from pydantic import BaseModel, Field, create_model
 from planning_ai.common.utils import Paths
 from planning_ai.llms.llm import GPT4o

 from enum import Enum, auto
+from typing import Type
 from langchain_core.prompts import ChatPromptTemplate
+from pydantic import BaseModel, create_model
 from planning_ai.common.utils import Paths
 from planning_ai.llms.llm import GPT4o

planning_ai/chains/policy_chain.py CHANGED Viewed

@@ -36,6 +36,6 @@ if __name__ == "__main__":
     test_docids = [1, 13, 21]
     result = policy_chain.invoke(
-        {"theme": "Climate Change", "policy": test_policy, "details": zipped}
     )
     print(result)

     test_docids = [1, 13, 21]
     result = policy_chain.invoke(
+        {"theme": "Climate Change", "policy": test_policy, "details": test_bullet}
     )
     print(result)

planning_ai/common/utils.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import shutil
 from pathlib import Path
 import polars as pl
@@ -13,6 +12,17 @@ pl.Config(
 def filename_reducer(docs_a, docs_b):
     if docs_a == []:
         return docs_b
     b_dict = {d["filename"]: d for d in docs_b}
@@ -25,6 +35,10 @@ def filename_reducer(docs_a, docs_b):
 class Paths:
     DATA = Path("data")
     RAW = DATA / "raw"
@@ -40,8 +54,9 @@ class Paths:
     @classmethod
     def ensure_directories_exist(cls):
-        # for path in [cls.STAGING, cls.OUT]:
-        #     shutil.rmtree(path, ignore_errors=True)
         for path in [
             cls.DATA,
             cls.RAW,

 from pathlib import Path
 import polars as pl
 def filename_reducer(docs_a, docs_b):
+    """
+    Reduces two lists of document dictionaries by updating docs_a with entries from docs_b
+    based on matching filenames.
+    Args:
+        docs_a (list): A list of dictionaries, each containing a "filename" key.
+        docs_b (list): A list of dictionaries, each containing a "filename" key.
+    Returns:
+        list: The updated list of dictionaries from docs_a with entries from docs_b.
+    """
     if docs_a == []:
         return docs_b
     b_dict = {d["filename"]: d for d in docs_b}
 class Paths:
+    """
+    A utility class for managing directory paths used in the project.
+    """
     DATA = Path("data")
     RAW = DATA / "raw"
     @classmethod
     def ensure_directories_exist(cls):
+        """
+        Ensures that all necessary directories exist, creating them if necessary.
+        """
         for path in [
             cls.DATA,
             cls.RAW,

planning_ai/documents/document.py CHANGED Viewed

@@ -2,7 +2,6 @@ import logging
 import re
 from collections import Counter
 import geopandas as gpd
 import matplotlib as mpl
 import matplotlib.pyplot as plt
@@ -36,6 +35,14 @@ WARDS = [
 def _process_postcodes(final):
     documents = final["documents"]
     postcodes = [doc["document"].metadata["respondentpostcode"] for doc in documents]
     postcodes = (
@@ -52,9 +59,18 @@ def _process_postcodes(final):
 def _process_policies(final):
     def process_policy_group(policy_group, theme, stance):
         details = "".join(
-            f'\n### {row["policies"]}\n\n'
             + "".join(
                 f"- {detail} {doc_id}\n"
                 for detail, doc_id in zip(row["detail"], row["doc_id"])
@@ -83,6 +99,14 @@ def _process_policies(final):
 def _process_stances(final):
     documents = final["documents"]
     stances = [
         doc["document"].metadata["representations_support/object"] for doc in documents
@@ -105,6 +129,14 @@ def _process_stances(final):
 def _process_themes(final):
     documents = final["documents"]
     themes = Counter(
         [theme["theme"].value for doc in documents for theme in doc["themes"]]
@@ -121,6 +153,11 @@ def _process_themes(final):
 def fig_oa(postcodes):
     oa_lookup = pl.read_csv(
         Paths.RAW
         / "Output_Area_to_Local_Authority_District_(April_2023)_Lookup_in_England_and_Wales.csv"
@@ -247,7 +284,6 @@ def fig_wards(postcodes):
         ax=ax,
         column="count",
         legend=True,
-        vmax=20,
         legend_kwds={"label": "Number of Representations"},
     )
     ward_boundaries.plot(ax=ax, color="none", edgecolor="gray")
@@ -389,6 +425,8 @@ def build_final_report(out, rep):
         "## Unused Documents\n\n"
         "Please note that the following documents were not used to produce this report:\n\n"
         f"{str(unused_documents)}"
     )
     out_path = Paths.SUMMARY / f"Summary_of_Submitted_Responses-{rep}.md"

 import re
 from collections import Counter
 import geopandas as gpd
 import matplotlib as mpl
 import matplotlib.pyplot as plt
 def _process_postcodes(final):
+    """Processes postcodes from the final document data.
+    Args:
+        final (dict): A dictionary containing document data.
+    Returns:
+        pl.DataFrame: A DataFrame with postcodes and their counts joined with ONSPD data.
+    """
     documents = final["documents"]
     postcodes = [doc["document"].metadata["respondentpostcode"] for doc in documents]
     postcodes = (
 def _process_policies(final):
+    """Processes policies from the final document data.
+    Args:
+        final (dict): A dictionary containing document data.
+    Returns:
+        tuple: A tuple containing strings of support, object, and other policies.
+    """
     def process_policy_group(policy_group, theme, stance):
         details = "".join(
+            f"\n### {row['policies']}\n\n"
             + "".join(
                 f"- {detail} {doc_id}\n"
                 for detail, doc_id in zip(row["detail"], row["doc_id"])
 def _process_stances(final):
+    """Processes stances from the final document data.
+    Args:
+        final (dict): A dictionary containing document data.
+    Returns:
+        str: A formatted string of stances with their percentages and counts.
+    """
     documents = final["documents"]
     stances = [
         doc["document"].metadata["representations_support/object"] for doc in documents
 def _process_themes(final):
+    """Processes themes from the final document data.
+    Args:
+        final (dict): A dictionary containing document data.
+    Returns:
+        str: A markdown table of themes with their counts and percentages.
+    """
     documents = final["documents"]
     themes = Counter(
         [theme["theme"].value for doc in documents for theme in doc["themes"]]
 def fig_oa(postcodes):
+    """Generates a figure for Output Area (OA) classifications.
+    Args:
+        postcodes (pl.DataFrame): A DataFrame containing postcode data.
+    """
     oa_lookup = pl.read_csv(
         Paths.RAW
         / "Output_Area_to_Local_Authority_District_(April_2023)_Lookup_in_England_and_Wales.csv"
         ax=ax,
         column="count",
         legend=True,
         legend_kwds={"label": "Number of Representations"},
     )
     ward_boundaries.plot(ax=ax, color="none", edgecolor="gray")
         "## Unused Documents\n\n"
         "Please note that the following documents were not used to produce this report:\n\n"
         f"{str(unused_documents)}"
+        "Documents are excluded if they provide no relevant information. These documents "
+        "are typically very short, and contain information that provides no relation to policies or themes."
     )
     out_path = Paths.SUMMARY / f"Summary_of_Submitted_Responses-{rep}.md"

planning_ai/eval/compare_summaries.py CHANGED Viewed

@@ -8,10 +8,21 @@ from planning_ai.llms.llm import GPT4o
 class SummaryEvaluator(BaseModel):
-    score: int = Field(..., description="The number of the best summary.")
 def load_templates():
     with open("./planning_ai/eval/eval.txt", "r") as f:
         compare_template = f.read()
     with open("./planning_ai/eval/summary.txt", "r") as f:
@@ -20,6 +31,15 @@ def load_templates():
 def initialize_chains(compare_template, summary_template):
     SLLM = GPT4o.with_structured_output(SummaryEvaluator, strict=True)
     compare_prompt = ChatPromptTemplate([("system", compare_template)])
     compare_chain = compare_prompt | SLLM
@@ -31,6 +51,15 @@ def initialize_chains(compare_template, summary_template):
 def process_summaries(compare_chain, summary_chain):
     original = pl.read_parquet(Paths.STAGING / "gcpt3.parquet").filter(
         pl.col("attachments_id").is_null()
     )

 class SummaryEvaluator(BaseModel):
+    """Model for evaluating summaries.
+    Attributes:
+        score (int): The number of the best summary.
+    """
+    score: int = Field(...)
 def load_templates():
+    """Loads the comparison and summary templates from files.
+    Returns:
+        tuple: A tuple containing the compare template and summary template as strings.
+    """
     with open("./planning_ai/eval/eval.txt", "r") as f:
         compare_template = f.read()
     with open("./planning_ai/eval/summary.txt", "r") as f:
 def initialize_chains(compare_template, summary_template):
+    """Initializes the comparison and summary chains.
+    Args:
+        compare_template (str): The template for comparison.
+        summary_template (str): The template for summary.
+    Returns:
+        tuple: A tuple containing the compare chain and summary chain.
+    """
     SLLM = GPT4o.with_structured_output(SummaryEvaluator, strict=True)
     compare_prompt = ChatPromptTemplate([("system", compare_template)])
     compare_chain = compare_prompt | SLLM
 def process_summaries(compare_chain, summary_chain):
+    """Processes summaries by comparing and scoring them.
+    Args:
+        compare_chain: The chain used for comparing summaries.
+        summary_chain: The chain used for generating summaries.
+    Returns:
+        polars.DataFrame: A DataFrame containing the original text, summaries, and scores.
+    """
     original = pl.read_parquet(Paths.STAGING / "gcpt3.parquet").filter(
         pl.col("attachments_id").is_null()
     )

planning_ai/graph.py CHANGED Viewed

@@ -7,7 +7,7 @@ from planning_ai.nodes.hallucination_node import (
     map_check,
     map_fix,
 )
-from planning_ai.nodes.map_node import add_entities, generate_summary, map_documents
 from planning_ai.nodes.reduce_node import generate_final_report
 from planning_ai.states import OverallState

     map_check,
     map_fix,
 )
+from planning_ai.nodes.map_node import generate_summary, map_documents
 from planning_ai.nodes.reduce_node import generate_final_report
 from planning_ai.states import OverallState

planning_ai/nodes/hallucination_node.py CHANGED Viewed

@@ -111,10 +111,28 @@ def fix_hallucination(state: DocumentState):
 def map_check(state: OverallState):
     return [Send("check_hallucination", doc) for doc in state["documents"]]
 def map_fix(state: OverallState):
     return [
         Send("fix_hallucination", doc)
         for doc in state["documents"]

 def map_check(state: OverallState):
+    """Maps the check_hallucination function to each document in the overall state.
+    Args:
+        state (OverallState): The overall state containing multiple documents.
+    Returns:
+        list: A list of Send objects, each representing a request to check for hallucinations
+        in a document.
+    """
     return [Send("check_hallucination", doc) for doc in state["documents"]]
 def map_fix(state: OverallState):
+    """Maps the fix_hallucination function to each hallucinated document that is not processed.
+    Args:
+        state (OverallState): The overall state containing multiple documents.
+    Returns:
+        list: A list of Send objects, each representing a request to fix hallucinations
+        in a document that is hallucinated and not yet processed.
+    """
     return [
         Send("fix_hallucination", doc)
         for doc in state["documents"]

planning_ai/nodes/map_node.py CHANGED Viewed

@@ -16,6 +16,17 @@ nlp = spacy.load("en_core_web_lg")
 def retrieve_themes(state: DocumentState) -> DocumentState:
     try:
         result = themes_chain.invoke({"document": state["document"].page_content})
         if not result.themes:
@@ -34,6 +45,17 @@ def retrieve_themes(state: DocumentState) -> DocumentState:
 def add_entities(state: OverallState) -> OverallState:
     logger.info("Adding entities to all documents.")
     for idx, document in enumerate(
         nlp.pipe(
@@ -47,7 +69,7 @@ def add_entities(state: OverallState) -> OverallState:
 def remove_pii(document: str) -> str:
-    """Removes personally identifiable information (PII) from a document.
     This function uses the Presidio Analyzer and Anonymizer to detect and anonymize
     PII such as names, phone numbers, and email addresses in the given document.
@@ -67,7 +89,7 @@ def remove_pii(document: str) -> str:
 def generate_summary(state: DocumentState) -> dict:
-    """Generates a summary for a document after removing PII.
     This function first anonymizes the document to remove PII, then generates a summary
     using the `map_chain`. The summary is added to the document state.
@@ -136,5 +158,16 @@ def generate_summary(state: DocumentState) -> dict:
 def map_documents(state: OverallState) -> list[Send]:
     logger.info("Mapping documents to generate summaries.")
     return [Send("generate_summary", document) for document in state["documents"]]

 def retrieve_themes(state: DocumentState) -> DocumentState:
+    """Retrieve themes from a document's content.
+    This function uses the `themes_chain` to extract themes from the document's
+    page content. It updates the document state with the themes and their scores.
+    Args:
+        state (DocumentState): The current state of the document, including its content.
+    Returns:
+        DocumentState: The updated document state with themes and scores.
+    """
     try:
         result = themes_chain.invoke({"document": state["document"].page_content})
         if not result.themes:
 def add_entities(state: OverallState) -> OverallState:
+    """Add named entities to all documents in the state.
+    This function processes each document using a spaCy NLP pipeline to extract
+    named entities and adds them to the document state.
+    Args:
+        state (OverallState): The overall state containing multiple documents.
+    Returns:
+        OverallState: The updated state with entities added to each document.
+    """
     logger.info("Adding entities to all documents.")
     for idx, document in enumerate(
         nlp.pipe(
 def remove_pii(document: str) -> str:
+    """Remove personally identifiable information (PII) from a document.
     This function uses the Presidio Analyzer and Anonymizer to detect and anonymize
     PII such as names, phone numbers, and email addresses in the given document.
 def generate_summary(state: DocumentState) -> dict:
+    """Generate a summary for a document after removing PII.
     This function first anonymizes the document to remove PII, then generates a summary
     using the `map_chain`. The summary is added to the document state.
 def map_documents(state: OverallState) -> list[Send]:
+    """Map documents to generate summaries.
+    This function prepares a list of `Send` objects to trigger the summary generation
+    process for each document in the state.
+    Args:
+        state (OverallState): The overall state containing multiple documents.
+    Returns:
+        list[Send]: A list of `Send` objects for summary generation.
+    """
     logger.info("Mapping documents to generate summaries.")
     return [Send("generate_summary", document) for document in state["documents"]]

planning_ai/nodes/reduce_node.py CHANGED Viewed

@@ -82,7 +82,7 @@ def batch_generate_executive_summaries(summaries):
     batch_size = 50
     for i in range(0, len(summaries_text), batch_size):
         logger.info(
-            f"Processing batches... {int(i/50)+1}/{(len(summaries_text)//batch_size)+1}"
         )
         batch = summaries_text[i : i + batch_size]
         response = reduce_chain.invoke({"context": batch})

     batch_size = 50
     for i in range(0, len(summaries_text), batch_size):
         logger.info(
+            f"Processing batches... {int(i / 50) + 1}/{(len(summaries_text) // batch_size) + 1}"
         )
         batch = summaries_text[i : i + batch_size]
         response = reduce_chain.invoke({"context": batch})

planning_ai/preprocessing/gcpt3.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import logging
-import textwrap
 from io import BytesIO
 from pathlib import Path
 from typing import Any

 import logging
 from io import BytesIO
 from pathlib import Path
 from typing import Any

planning_ai/preprocessing/prompts/ocr.txt DELETED Viewed

@@ -1,10 +0,0 @@
-The images provided are from a planning response form filled out by a member of the public, containing free-form responses related to a planning application. These responses may be handwritten or typed.
-Please follow these instructions to process the images:
-1. **Extract Free-Form Information Only**: Focus on extracting and outputting the free-form written content from the images. Do not include single-word answers, brief responses, or any extra content that is not part of the detailed responses.
-2. **Verbatim Output**: Ensure that the extracted information is output exactly as it appears in the images. Add a heading before each section of free-form text if it helps with organisation, but ensure the heading is not added by the model itself. Ignore blank sections entirely—do not generate or include any additional thoughts or content.
-3. **Sequential Processing**: The images are sequentially ordered. A response might continue from one image to the next, so capture the full context across multiple images if necessary.
-4. **Ignore Non-Relevant Content**: Exclude any content that does not fit the criteria of free-form, detailed responses.
-Thank you for your attention to these details.

planning_ai/retrievers/theme_retriever.py DELETED Viewed

@@ -1,84 +0,0 @@
-import logging
-from pathlib import Path
-from chromadb import PersistentClient
-from langchain_community.document_loaders import PyPDFLoader
-from langchain_community.vectorstores import Chroma
-from langchain_core.prompts import PromptTemplate
-from langchain_openai import OpenAIEmbeddings
-from pydantic import BaseModel, Field
-from planning_ai.llms.llm import GPT4o
-# See: https://consultations.greatercambridgeplanning.org/greater-cambridge-local-plan-preferred-options/supporting-documents
-PDFS = {
-    "Biodiversity and Green Spaces": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPBiodiversityandGreenSpacesAug21v2Nov21_0.pdf",
-    "Climate Change": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPClimateChangeAug21v2Nov21_0.pdf",
-    "Great Places": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPGreatPlacesAug21v1Aug21.pdf",
-    "Homes": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPHomesAug21v2Nov21.pdf",
-    "Infrastructure": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPInfrastructureAug21v2Nov21.pdf",
-    "Jobs": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPJobsAug21v2Nov21.pdf",
-    # "Strategy topic paper": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPStrategyAug21v3Nov21_0.pdf",
-    "Wellbeing and Social Inclusion": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPWellbeingAug21v2Nov21.pdf",
-}
-class Grade(BaseModel):
-    """Binary score for relevance check."""
-    binary_score: str = Field(description="Relevance score 'yes' or 'no'")
-def create_db():
-    chroma_dir = Path("./chroma_themesdb")
-    if chroma_dir.exists():
-        persistent_client = PersistentClient(path="./chroma_themesdb")
-        vectorstore = Chroma(
-            client=persistent_client,
-            collection_name="themes-chroma",
-            embedding_function=OpenAIEmbeddings(),
-        )
-    else:
-        docs = []
-        for name, pdf in PDFS.items():
-            doc = PyPDFLoader(pdf).load()[5:]
-            for d in doc:
-                d.metadata["theme"] = name
-            docs.extend(doc)
-        logging.warning(f"Building ChromaDB...")
-        vectorstore = Chroma.from_documents(
-            documents=docs,
-            collection_name="themes-chroma",
-            embedding=OpenAIEmbeddings(),
-            persist_directory="./chroma_themesdb",
-        )
-    return vectorstore
-grade_template = PromptTemplate(
-    template="""You are a grader assessing relevance of a retrieved document to a user question. \n
-        Here is the retrieved document: \n\n {context} \n\n
-        Here is the original document: {document} \n
-        If the retrieved document contains keyword(s) or semantic meaning related to the original, grade it as relevant. \n
-        Give a binary score 'yes' or 'no' score to indicate whether the retrieved document is relevant to the original.""",
-    input_variables=["context", "document"],
-)
-SLLM = GPT4o.with_structured_output(Grade, strict=True)
-grade_chain = grade_template | SLLM
-vectorstore = create_db()
-theme_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
-logging.warning(f"Finished building ChromaDB...")
-if __name__ == "__main__":
-    test_content = """
-    We would certainly support this and would emphasise the importance of trying
-    to solve the severance problems created by the M11 and A14.
-    """
-    len(theme_retriever.invoke(input=test_content))

planning_ai/states.py CHANGED Viewed

@@ -1,4 +1,3 @@
-from pathlib import Path
 from typing import Annotated, TypedDict
 import polars as pl
@@ -6,7 +5,6 @@ from langchain_core.documents import Document
 from pydantic import BaseModel
 from planning_ai.chains.hallucination_chain import HallucinationChecker
-from planning_ai.chains.themes_chain import ThemeScore
 from planning_ai.common.utils import filename_reducer

 from typing import Annotated, TypedDict
 import polars as pl
 from pydantic import BaseModel
 from planning_ai.chains.hallucination_chain import HallucinationChecker
 from planning_ai.common.utils import filename_reducer