Spaces:

cjber
/

planning-ai

Sleeping

cjber committed on Feb 17

Commit

8ef72b8

1 Parent(s): 945fac4

docs: update docs for correct preprocessing

Former-commit-id: 588ef403b1ebeefda50a76a56e01c0768e1d89e8 [formerly 13c024539070464b2bf80599a0e8824ae4d29857]
Former-commit-id: 2c77b4f2341951ea4d3008543995d3ef016b1130

Files changed (3) hide show

README.md +10 -5
planning_ai/preprocessing/azure_doc.py +4 -0
planning_ai/preprocessing/process_pdfs.py +0 -87

README.md CHANGED Viewed

@@ -51,14 +51,19 @@ pip install . # (or uv sync)
 ## Usage
 1. **Preprocessing**: Run the preprocessing scripts to convert raw data into a format suitable for analysis.
    ```bash
-   python planning_ai/preprocessing/process_pdfs.py
-   python planning_ai/preprocessing/gclp.py
-   python planning_ai/preprocessing/web_comments.py
    ```
-2. **Run Graph**: Execute the main script to process the documents and generate a Quarto summary document.
    ```bash
    python planning_ai/main.py
    ```
@@ -66,7 +71,7 @@ pip install . # (or uv sync)
 ## Configuration
 - **Environment Variables**: Use a `.env` file to store sensitive information like API keys.
-    - `OPENAI_API_KEY` required for summarisation; `OPENCAGE_API_KEY` required for geocoding (Quarto report)
 - **Constants**: Adjust `Consts` in `planning_ai/common/utils.py` to modify token limits and other settings.
 ## Workflow

 ## Usage
+This project uses **Streamlit** to provide a simple frontend to the system. Run using:
+`streamlit run app.py`
+Alternatively, run everything manually:
 1. **Preprocessing**: Run the preprocessing scripts to convert raw data into a format suitable for analysis.
    ```bash
+   python planning_ai/preprocessing/gcpt3.py
+   python planning_ai/preprocessing/azure_doc.py
    ```
+2. **Run Graph**: Execute the main script to process the documents and generate summary documents.
    ```bash
    python planning_ai/main.py
    ```
 ## Configuration
 - **Environment Variables**: Use a `.env` file to store sensitive information like API keys.
+    - `OPENAI_API_KEY` required for summarisation.
 - **Constants**: Adjust `Consts` in `planning_ai/common/utils.py` to modify token limits and other settings.
 ## Workflow

planning_ai/preprocessing/azure_doc.py CHANGED Viewed

@@ -88,3 +88,7 @@ def azure_process_pdfs():
             continue
         analyze_document_with_azure(pdf_path, out_pdf, failed_txt)

             continue
         analyze_document_with_azure(pdf_path, out_pdf, failed_txt)
+if __name__ == "__main__":
+    azure_process_pdfs()

planning_ai/preprocessing/process_pdfs.py DELETED Viewed

@@ -1,87 +0,0 @@
-import base64
-import os
-from io import BytesIO
-import requests
-from dotenv import load_dotenv
-from pdf2image import convert_from_path
-from PyPDF2 import PdfReader
-from tqdm import tqdm
-from planning_ai.common.utils import Paths
-load_dotenv()
-def encode_images_to_base64(images):
-    image_b64 = []
-    for image in images:
-        buffered = BytesIO()
-        image.save(buffered, format="JPEG")
-        base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
-        image_b64.append(
-            {
-                "type": "image_url",
-                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
-            }
-        )
-    return image_b64
-def send_request_to_api(messages):
-    api_key = os.getenv("OPENAI_API_KEY")
-    headers = {
-        "Content-Type": "application/json",
-        "Authorization": f"Bearer {api_key}",
-    }
-    payload = {"model": "gpt-4o-mini", "messages": messages}
-    response = requests.post(
-        "https://api.openai.com/v1/chat/completions", headers=headers, json=payload
-    )
-    return response.json()
-def extract_text_from_pdf(file_path):
-    """Extracts text from a PDF file using PyPDF2."""
-    try:
-        reader = PdfReader(file_path, strict=True)
-        text = [page.extract_text() or "" for page in reader.pages]
-        return "\n".join(text).strip()
-    except Exception as e:
-        print(e)
-        return None
-def main():
-    pdfs = (Paths.RAW / "pdfs").glob("*.pdf")
-    with open("planning_ai/preprocessing/prompts/ocr.txt", "r") as f:
-        ocr_prompt = f.read()
-    for file in tqdm(pdfs):
-        outfile = Paths.STAGING / "pdfs" / f"{file.stem}.txt"
-        try:
-            images = convert_from_path(file)
-            image_b64 = encode_images_to_base64(images)
-            messages = [
-                {
-                    "role": "user",
-                    "content": [{"type": "text", "text": ocr_prompt}] + image_b64,
-                }
-            ]
-            response = send_request_to_api(messages)
-            if "choices" not in response:
-                continue
-            out = response["choices"][0]["message"]["content"]
-            if outfile.exists():
-                continue
-            with open(outfile, "w") as f:
-                f.write(out)
-        except:
-            continue
-if __name__ == "__main__":
-    main()