refactor to modular structure
- .gitignore +2 -0
- planning_ai/chains/__init__.py +0 -0
- planning_ai/chains/map_chain.py +23 -0
- planning_ai/chains/prompts/map.txt +4 -0
- planning_ai/chains/prompts/reduce.txt +31 -0
- planning_ai/chains/reduce_chain.py +25 -0
- planning_ai/common/utils.py +31 -0
- planning_ai/graph.py +26 -0
- planning_ai/llms/__init__.py +0 -0
- planning_ai/llms/llm.py +11 -0
- planning_ai/main.py +28 -0
- planning_ai/nodes/__init__.py +0 -0
- planning_ai/nodes/map_node.py +38 -0
- planning_ai/nodes/reduce_node.py +24 -0
- planning_ai/preprocessing/gclp.py +16 -0
- src/gpt4o_structured.py → planning_ai/preprocessing/process_pdfs.py +15 -25
- planning_ai/preprocessing/prompts/ocr.txt +10 -0
- planning_ai/preprocessing/web_comments.py +14 -0
- planning_ai/states.py +16 -0
- pyproject.toml +9 -7
- src/planning_ai/__init__.py +0 -2
- src/planning_ai/loaders.py +0 -4
- src/planning_ai/phi.py +0 -91
- uv.lock +0 -0
.gitignore
CHANGED
@@ -1,4 +1,5 @@
 data/
+.old/
 
 .envrc
 
@@ -164,3 +165,4 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+.aider*

planning_ai/chains/__init__.py
ADDED
File without changes

planning_ai/chains/map_chain.py
ADDED
@@ -0,0 +1,23 @@
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+
+from planning_ai.llms.llm import LLM
+
+with open("./planning_ai/chains/prompts/map.txt", "r") as f:
+    map_template = f.read()
+
+map_prompt = ChatPromptTemplate.from_messages([("system", map_template)])
+map_chain = map_prompt | LLM | StrOutputParser()
+
+if __name__ == "__main__":
+    test_document = """
+    The Local Plan proposes a mass development north-west of Cambridge despite marked growth
+    in the last twenty years or so following the previous New Settlement Study. In this period,
+    the major settlement of Cambourne has been created - now over the projected 3,000 homes and
+    Papworth Everard has grown beyond recognition. This in itself is a matter of concern.
+    """
+
+    result = map_chain.invoke({"context": test_document})
+
+    print("Generated Summary:")
+    print(result)

planning_ai/chains/prompts/map.txt
ADDED
@@ -0,0 +1,4 @@
+Summarise the following response to a planning application concisely. Ensure the summary accurately reflects the key points of the response. After the summary, provide one word that represents the author's overall stance: either 'SUPPORT' or 'OPPOSE'.
+
+Response:
+{context}

planning_ai/chains/prompts/reduce.txt
ADDED
@@ -0,0 +1,31 @@
+The following are summaries of responses to a local government planning application:
+
+{context}
+
+Create a final, consolidated summary of the main themes, and follow the specified format precisely. For each key point add inline citations which relate to the sources of the information. Use '[SOURCE_NUMBER]' for the citation (e.g. 'The Space Needle is in Seattle [1][2]'). Each summary will have OPPOSE or SUPPORT appended, which indicates which grouping its main argument belongs to. Bear in mind that a summary may contain both supporting and opposing key points.
+
+Format:
+
+# Summary
+
+<Concise summary of the overall themes>
+
+# Key points raised in support
+
+Support: <Total number of responses supporting the application>
+
+* <Key point 1>
+* <Key point 2>
+* ...
+
+# Key points raised in opposition
+
+Opposed: <Total number of responses opposing the application>
+
+* <Key point 1>
+* <Key point 2>
+* ...
+
+# Thematic breakdown
+
+<Provide a breakdown of the key themes identified (e.g. environmental concerns, economic growth), along with the percentage of responses addressing each theme.>

planning_ai/chains/reduce_chain.py
ADDED
@@ -0,0 +1,25 @@
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+
+from planning_ai.llms.llm import LLM
+
+with open("./planning_ai/chains/prompts/reduce.txt", "r") as f:
+    reduce_template = f.read()
+
+reduce_prompt = ChatPromptTemplate([("human", reduce_template)])
+reduce_chain = reduce_prompt | LLM | StrOutputParser()
+
+if __name__ == "__main__":
+    test_document = """
+    The response expresses concern over the proposed mass development north-west of Cambridge,
+    highlighting significant growth in the area over the past twenty years, particularly with
+    the establishment of Cambourne and the expansion of Papworth Everard. The author is worried
+    about the implications of further development given the existing growth.
+
+    OPPOSE
+    """
+
+    result = reduce_chain.invoke({"context": test_document})
+
+    print("Generated Report:")
+    print(result)

planning_ai/common/utils.py
ADDED
@@ -0,0 +1,31 @@
+from pathlib import Path
+from typing import List
+
+import polars as pl
+from langchain_core.documents import Document
+
+from planning_ai.llms.llm import LLM
+
+pl.Config(
+    fmt_str_lengths=9,
+    set_tbl_rows=5,
+    set_tbl_hide_dtype_separator=True,
+    set_tbl_dataframe_shape_below=True,
+    set_tbl_formatting="UTF8_FULL_CONDENSED",
+)
+
+
+class Paths:
+    DATA = Path("data")
+    RAW = DATA / "raw"
+    STAGING = DATA / "staging"
+    OUT = DATA / "out"
+
+
+class Consts:
+    TOKEN_MAX = 10_000
+
+
+def length_function(documents: List[Document]) -> int:
+    """Get number of tokens for input contents."""
+    return sum(LLM.get_num_tokens(doc.page_content) for doc in documents)

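Note: length_function drives the collapse decision in the graph nodes below. A minimal sketch of how it pairs with Consts.TOKEN_MAX (the documents are placeholders, and it assumes OPENAI_API_KEY is available via .env so the shared LLM can initialise):

from langchain_core.documents import Document

from planning_ai.common.utils import Consts, length_function

# Placeholder documents purely for illustration.
docs = [Document(page_content="First summary."), Document(page_content="Second summary.")]

# Mirrors the check in should_collapse: keep collapsing while over the token budget.
if length_function(docs) > Consts.TOKEN_MAX:
    print("Over budget: summaries would be collapsed again.")
else:
    print("Within budget: ready for the final reduce step.")
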
planning_ai/graph.py
ADDED
@@ -0,0 +1,26 @@
+from langgraph.graph import END, START, StateGraph
+
+from planning_ai.nodes.map_node import (
+    collect_summaries,
+    generate_summary,
+    map_summaries,
+    should_collapse,
+)
+from planning_ai.nodes.reduce_node import collapse_summaries, generate_final_summary
+from planning_ai.states import OverallState
+
+
+def create_graph():
+    graph = StateGraph(OverallState)
+    graph.add_node("generate_summary", generate_summary)
+    graph.add_node("collect_summaries", collect_summaries)
+    graph.add_node("collapse_summaries", collapse_summaries)
+    graph.add_node("generate_final_summary", generate_final_summary)
+
+    graph.add_conditional_edges(START, map_summaries, ["generate_summary"])
+    graph.add_edge("generate_summary", "collect_summaries")
+    graph.add_conditional_edges("collect_summaries", should_collapse)
+    graph.add_conditional_edges("collapse_summaries", should_collapse)
+    graph.add_edge("generate_final_summary", END)
+
+    return graph.compile()

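Note: the graph follows LangGraph's map-reduce summarisation pattern: map_summaries fans documents out to generate_summary, collect_summaries gathers the results, and should_collapse keeps routing through collapse_summaries until the summaries fit under Consts.TOKEN_MAX. A minimal sketch of invoking the compiled graph directly (the inputs are placeholders and an OpenAI key is assumed):

from planning_ai.graph import create_graph

app = create_graph()

# Placeholder inputs; real runs feed the staged documents loaded in main.py.
state = app.invoke({"contents": ["Response one ...", "Response two ..."]})
print(state["final_summary"])
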
planning_ai/llms/__init__.py
ADDED
File without changes

planning_ai/llms/llm.py
ADDED
@@ -0,0 +1,11 @@
+from dotenv import load_dotenv
+from langchain_core.rate_limiters import InMemoryRateLimiter
+from langchain_openai import ChatOpenAI
+
+load_dotenv()
+
+rate_limiter = InMemoryRateLimiter(
+    requests_per_second=5,
+    check_every_n_seconds=0.1,
+)
+LLM = ChatOpenAI(temperature=0, model="gpt-4o-mini", rate_limiter=rate_limiter)

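Note: both chains import this single LLM instance, so the InMemoryRateLimiter caps the whole map step at roughly five requests per second. A quick sanity-check sketch (assumes OPENAI_API_KEY is set via the .env file):

from planning_ai.llms.llm import LLM

# The rate limiter is attached to the model, so bursts of parallel calls are throttled client-side.
print(LLM.invoke("Reply with the single word OK.").content)
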
planning_ai/main.py
ADDED
@@ -0,0 +1,28 @@
+from langchain_community.document_loaders import DirectoryLoader, TextLoader
+from langchain_text_splitters import CharacterTextSplitter
+
+from planning_ai.common.utils import Paths
+from planning_ai.graph import create_graph
+
+loader = DirectoryLoader(
+    path=str(Paths.STAGING),
+    show_progress=True,
+    use_multithreading=True,
+    loader_cls=TextLoader,
+    recursive=True,
+)
+docs = [doc for doc in loader.load() if doc.page_content]
+text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
+    chunk_size=1000, chunk_overlap=0
+)
+split_docs = text_splitter.split_documents(docs)
+
+app = create_graph()
+
+for step in app.stream(
+    {"contents": [doc.page_content for doc in split_docs]},
+    {"recursion_limit": 10},
+):
+    print(list(step.keys()))
+
+print(step["generate_final_summary"]["final_summary"])

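Note: the final print relies on the last streamed update coming from generate_final_summary. A slightly more defensive sketch, under the same assumptions as main.py, captures the summary inside the loop instead:

final_summary = None
for step in app.stream(
    {"contents": [doc.page_content for doc in split_docs]},
    {"recursion_limit": 10},
):
    if "generate_final_summary" in step:
        final_summary = step["generate_final_summary"]["final_summary"]

print(final_summary)
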
planning_ai/nodes/__init__.py
ADDED
File without changes

planning_ai/nodes/map_node.py
ADDED
@@ -0,0 +1,38 @@
+from typing import Literal
+
+from langchain_core.documents import Document
+from langgraph.constants import Send
+
+from planning_ai.chains.map_chain import map_chain
+from planning_ai.common.utils import Consts, length_function
+from planning_ai.states import OverallState, SummaryState
+
+
+def generate_summary(state: SummaryState):
+    response = map_chain.invoke({"context": state["content"]})
+    return {"summaries": [response]}
+
+
+def map_summaries(state: OverallState):
+    return [
+        Send("generate_summary", {"content": content}) for content in state["contents"]
+    ]
+
+
+def collect_summaries(state: OverallState):
+    return {
+        "collapsed_summaries": [
+            Document(f"[{idx}]\n\n{summary}")
+            for idx, summary in enumerate(state["summaries"], start=1)
+        ]
+    }
+
+
+def should_collapse(
+    state: OverallState,
+) -> Literal["collapse_summaries", "generate_final_summary"]:
+    num_tokens = length_function(state["collapsed_summaries"])
+    if num_tokens > Consts.TOKEN_MAX:
+        return "collapse_summaries"
+    else:
+        return "generate_final_summary"

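Note: map_summaries uses LangGraph's Send API to fan a single OverallState out into one generate_summary invocation per document, each receiving its own SummaryState. A small illustration of what the conditional edge returns (the contents are placeholders):

from planning_ai.nodes.map_node import map_summaries

sends = map_summaries({"contents": ["Doc A", "Doc B"]})
for send in sends:
    # Each Send targets the generate_summary node with a {"content": ...} payload.
    print(send.node, send.arg)
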
planning_ai/nodes/reduce_node.py
ADDED
@@ -0,0 +1,24 @@
+from langchain.chains.combine_documents.reduce import collapse_docs, split_list_of_docs
+
+from planning_ai.chains.reduce_chain import reduce_chain
+from planning_ai.common.utils import Consts, length_function
+from planning_ai.states import OverallState
+
+
+def collapse_summaries(state: OverallState):
+    doc_lists = split_list_of_docs(
+        state["collapsed_summaries"], length_function, Consts.TOKEN_MAX
+    )
+    results = []
+    for doc_list in doc_lists:
+        results.append(collapse_docs(doc_list, reduce_chain.invoke))
+
+    return {"collapsed_summaries": results}
+
+
+def generate_final_summary(state: OverallState):
+    response = reduce_chain.invoke({"context": state["collapsed_summaries"]})
+    return {
+        "final_summary": response,
+        "collapsed_summaries": state["collapsed_summaries"],
+    }

planning_ai/preprocessing/gclp.py
ADDED
@@ -0,0 +1,16 @@
+import polars as pl
+
+from planning_ai.common.utils import Paths
+
+df = pl.read_excel(
+    Paths.RAW / "gclp-first-proposals-questionnaire-responses-redacted.xlsx"
+)
+
+free_cols = [df.columns[0]] + df.columns[6:13] + [df.columns[33]]
+df = df[free_cols]
+
+for row in df.rows(named=True):
+    user = row.pop("UserNo")
+    content = "\n\n".join([f"**{k}**\n\n{v}" for k, v in row.items() if v != "-"])
+    with open(Paths.STAGING / "gclp" / f"{user}.txt", "w") as f:
+        f.write(content)

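Note: gclp.py and the other preprocessing scripts write into subfolders of data/staging that the commit itself does not create. A hedged sketch of preparing those folders beforehand (this mkdir step is an addition, not part of the diff):

from planning_ai.common.utils import Paths

# Ensure the staging subdirectories exist before the preprocessing scripts write to them.
for sub in ("gclp", "pdfs", "web"):
    (Paths.STAGING / sub).mkdir(parents=True, exist_ok=True)
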
src/gpt4o_structured.py → planning_ai/preprocessing/process_pdfs.py
RENAMED
@@ -1,19 +1,16 @@
-import ast
 import base64
 import os
 from io import BytesIO
 from pathlib import Path
 
-import polars as pl
 import requests
 from dotenv import load_dotenv
 from pdf2image import convert_from_path
 
-
-
+from planning_ai.common.utils import Paths
+
+load_dotenv()
 
-def convert_pdf_to_images(file_path):
-    return convert_from_path(file_path)
 
 def encode_images_to_base64(images):
     image_b64 = []
@@ -29,6 +26,7 @@ def encode_images_to_base64(images):
         )
     return image_b64
 
+
 def send_request_to_api(messages):
     api_key = os.getenv("OPENAI_API_KEY")
     headers = {
@@ -41,37 +39,29 @@ def send_request_to_api(messages):
     )
     return response.json()
 
-def main():
-    load_environment()
-
-    prompt = """
-    The following images are from a planning response form completed by a member of the public. They contain free-form responses related to a planning application, which may be either handwritten or typed.
 
-
-    """
+def main():
+    pdfs = (Paths.RAW / "pdfs").glob("*.pdf")
+    with open("planning_ai/preprocessing/prompts/ocr.txt", "r") as f:
+        ocr_prompt = f.read()
 
-
-    for file in path.glob("*.pdf"):
+    for file in pdfs:
         if file.stem:
-            images = convert_pdf_to_images(file)
+            images = convert_from_path(file)
             image_b64 = encode_images_to_base64(images)
 
             messages = [
                 {
                     "role": "user",
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": prompt,
-                        },
-                    ]
-                    + image_b64,
+                    "content": [{"type": "text", "text": ocr_prompt}] + image_b64,
                 }
             ]
 
             response = send_request_to_api(messages)
-
-
+            out = response["choices"][0]["message"]["content"]
+            with open(Paths.STAGING / "pdfs" / f"{file.stem}.txt", "w") as f:
+                f.write(out)
+
 
 if __name__ == "__main__":
     main()

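Note: the request pairs the OCR prompt with the page images as additional content parts. encode_images_to_base64 is not fully shown in this hunk, but for the Chat Completions vision API each entry is expected to look roughly like the hypothetical helper sketched below (the helper name and PNG encoding are assumptions, not taken from the commit):

import base64
from io import BytesIO

def image_to_content_part(image):
    # Hypothetical helper mirroring what encode_images_to_base64 presumably appends per page.
    buffer = BytesIO()
    image.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")
    return {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded}"}}
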
planning_ai/preprocessing/prompts/ocr.txt
ADDED
@@ -0,0 +1,10 @@
+The images provided are from a planning response form filled out by a member of the public, containing free-form responses related to a planning application. These responses may be handwritten or typed.
+
+Please follow these instructions to process the images:
+
+1. **Extract Free-Form Information Only**: Focus on extracting and outputting the free-form written content from the images. Do not include single-word answers, brief responses, or any extra content that is not part of the detailed responses.
+2. **Verbatim Output**: Ensure that the extracted information is output exactly as it appears in the images. Add a heading before each section of free-form text if it helps with organisation, but ensure the heading is not added by the model itself. Ignore blank sections entirely—do not generate or include any additional thoughts or content.
+3. **Sequential Processing**: The images are sequentially ordered. A response might continue from one image to the next, so capture the full context across multiple images if necessary.
+4. **Ignore Non-Relevant Content**: Exclude any content that does not fit the criteria of free-form, detailed responses.
+
+Thank you for your attention to these details.

planning_ai/preprocessing/web_comments.py
ADDED
@@ -0,0 +1,14 @@
+import polars as pl
+
+from planning_ai.common.utils import Paths
+
+dfs = pl.read_excel(Paths.RAW / "web comments.xlsx", sheet_id=0)
+
+for sheet_name, df in dfs.items():
+    string_df = df.select(pl.col(pl.String)).drop_nulls()
+    for col in string_df.columns:
+        series = string_df[col]
+        name = series.name
+        content = f"**{name}**" + "\n\n* ".join(["\n"] + series.to_list())
+        with open(Paths.STAGING / "web" / f"{sheet_name}.txt", "w") as f:
+            f.write(content)

planning_ai/states.py
ADDED
@@ -0,0 +1,16 @@
+
+import operator
+from typing import Annotated, List, TypedDict
+
+from langchain_core.documents import Document
+
+class OverallState(TypedDict):
+    contents: List[str]
+    summaries: Annotated[list, operator.add]
+    collapsed_summaries: List[Document]
+    final_summary: str
+
+
+class SummaryState(TypedDict):
+    content: str
+

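Note: annotating summaries with operator.add is what lets LangGraph merge the lists returned by the parallel generate_summary nodes instead of overwriting them. A tiny sketch of the reducer behaviour this relies on:

import operator

# Each mapped node returns {"summaries": [<one summary>]}; the reducer concatenates the lists.
merged = operator.add(["summary of doc 1"], ["summary of doc 2"])
print(merged)  # ['summary of doc 1', 'summary of doc 2']
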
pyproject.toml
CHANGED
@@ -5,16 +5,18 @@ description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.10,<3.11"
 dependencies = [
+    "python-dotenv>=1.0.1",
     "langchain-core>=0.2.38",
     "langchain-community>=0.2.16",
-    "langchain-
-    "transformers>=4.44.2",
-    "torch>=2.4.1",
-    "accelerate>=0.34.0",
+    "langchain-openai>=0.1.23",
     "pillow>=10.4.0",
-    "
-    "
-    "
+    "polars>=1.6.0",
+    "fastexcel>=0.11.6",
+    "spacy>=3.7.6",
+    "pip>=24.2",
+    "spacytextblob>=4.0.0",
+    "transformers>=4.44.2",
+    "langgraph>=0.2.18",
     "pdf2image>=1.17.0",
 ]
 

src/planning_ai/__init__.py
DELETED
@@ -1,2 +0,0 @@
-def hello() -> str:
-    return "Hello from planning-ai!"

src/planning_ai/loaders.py
DELETED
@@ -1,4 +0,0 @@
-from langchain_unstructured import UnstructuredLoader
-
-loader = UnstructuredLoader("./data/raw/pdfs/57693-94 Response Form.pdf")
-loader.load()

src/planning_ai/phi.py
DELETED
@@ -1,91 +0,0 @@
-from pathlib import Path
-
-from pdf2image import convert_from_path
-from PIL import Image
-from transformers import AutoModelForCausalLM, AutoProcessor
-
-model_id = "microsoft/Phi-3.5-vision-instruct"
-
-# Note: set _attn_implementation='eager' if you don't have flash_attn installed
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map="cuda",
-    trust_remote_code=True,
-    torch_dtype="auto",
-    _attn_implementation="flash_attention_2",
-)
-
-# for best performance, use num_crops=4 for multi-frame, num_crops=16 for single-frame.
-processor = AutoProcessor.from_pretrained(
-    model_id, trust_remote_code=True, num_crops=16
-)
-
-images = []
-placeholder = ""
-path = Path("./data/raw/pdfs")
-i = 1
-for file in path.glob("*.pdf"):
-    pdf_images = convert_from_path(file)
-    for image in pdf_images:
-        images.append(image)
-        placeholder += f"<|image_{i}|>\n"
-        i += 1
-
-messages = [
-    {
-        "role": "user",
-        "content": """
-<|image_1|>
-
-This image is an extract from a planning response form filled out by a member of the public. The form may contain typed or handwritten responses, including potentially incomplete or unclear sections. Your task is to extract relevant information in a strict, structured format. Do not repeat the document verbatim. Only output responses in the structured format below.
-
-Instructions:
-1. Extract responses to all structured questions on the form, in the format:
-{"<question>": "<response>"}
-
-Example:
-{"Do you support the planning proposal?": "Yes"}
-
-2. For the handwritten notes under 'Your comments:', extract them verbatim. If any word is illegible or unclear, use the token <UNKNOWN>. Do not attempt to infer or complete missing parts. Use the format:
-{"Your comments:": "<verbatim comments>"}
-
-Example:
-{"Your comments:": "I support the proposal, but the <UNKNOWN> aspect requires attention."}
-
-3. **Do not** output or repeat the original document content in full. Only return structured data in the format described above.
-4. **Ignore irrelevant sections** that are not part of the structured questionnaire or 'Your comments:' section.
-5. If a response is missing or the form section is blank, output:
-{"<question>": "No response"}
-
-Guidelines:
-- Ensure you return only structured data in JSON-like format.
-- Strictly follow the format for both structured questions and handwritten comments.
-- If any part of the form is unclear or unreadable, do not fill it in with assumptions.
-- Avoid repeating the full content of the form. Focus only on extracting the relevant sections.
-
-Example output:
-{
-"Do you support the planning proposal?": "Yes",
-"Your comments:": "The proposal seems reasonable, but <UNKNOWN> needs further assessment."
-}
-""",
-    }
-]
-
-prompt = processor.tokenizer.apply_chat_template(
-    messages, tokenize=False, add_generation_prompt=True
-)
-
-inputs = processor(prompt, images[1], return_tensors="pt").to("cuda:0")
-generation_args = {"max_new_tokens": 10_000}
-
-generate_ids = model.generate(
-    **inputs, eos_token_id=processor.tokenizer.eos_token_id, **generation_args
-)
-
-# remove input tokens
-generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]
-response = processor.batch_decode(
-    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
-)[0]
-print(response)

uv.lock
CHANGED
The diff for this file is too large to render.