cjber committed on
Commit
8d63d9f
·
1 Parent(s): 82bbfd1

add processing for gcpt3

Browse files
planning_ai/common/utils.py CHANGED
@@ -19,12 +19,13 @@ class Paths:
19
  OUT = DATA / "out"
20
 
21
  SUMMARY = OUT / "summary"
 
22
 
23
  PROMPTS = Path("planning_ai/chains/prompts")
24
 
25
  @classmethod
26
  def ensure_directories_exist(cls):
27
- for path in [cls.DATA, cls.RAW, cls.STAGING, cls.OUT, cls.SUMMARY]:
28
  path.mkdir(parents=True, exist_ok=True)
29
 
30
 
 
19
  OUT = DATA / "out"
20
 
21
  SUMMARY = OUT / "summary"
22
+ SUMMARIES = OUT / "summaries"
23
 
24
  PROMPTS = Path("planning_ai/chains/prompts")
25
 
26
  @classmethod
27
  def ensure_directories_exist(cls):
28
+ for path in [cls.DATA, cls.RAW, cls.STAGING, cls.OUT, cls.SUMMARY, cls.SUMMARIES]:
29
  path.mkdir(parents=True, exist_ok=True)
30
 
31
 
planning_ai/preprocessing/gclp.py DELETED
@@ -1,22 +0,0 @@
1
- import polars as pl
2
-
3
- from planning_ai.common.utils import Paths
4
-
5
-
6
def main() -> None:
    """Extract free-text GCLP questionnaire answers into per-user text files.

    Reads the redacted first-proposals questionnaire spreadsheet, keeps the
    user-number column plus the free-text columns, and writes one
    markdown-style ``.txt`` file per respondent under
    ``Paths.STAGING / "gclp"``.
    """
    df = pl.read_excel(
        Paths.RAW / "gclp-first-proposals-questionnaire-responses-redacted.xlsx"
    )

    # Column 0 is the user number; columns 6:13 and 33 hold free-text answers.
    # NOTE(review): positional column selection — verify against the sheet layout.
    free_cols = [df.columns[0]] + df.columns[6:13] + [df.columns[33]]
    df = df[free_cols]

    # Fix: the output directory was assumed to exist (it is not covered by
    # Paths.ensure_directories_exist), so the first run crashed on open().
    out_dir = Paths.STAGING / "gclp"
    out_dir.mkdir(parents=True, exist_ok=True)

    for row in df.rows(named=True):
        user = row.pop("UserNo")
        # "-" marks an unanswered question; those fields are skipped.
        content = "\n\n".join(f"**{k}**\n\n{v}" for k, v in row.items() if v != "-")
        # Fix: explicit encoding instead of the platform default.
        with open(out_dir / f"{user}.txt", "w", encoding="utf-8") as f:
            f.write(content)


if __name__ == "__main__":
    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
planning_ai/preprocessing/gcpt3.py CHANGED
@@ -1,7 +1,9 @@
 
1
  from pathlib import Path
2
  from typing import Any
3
 
4
  import polars as pl
 
5
  from tqdm import tqdm
6
 
7
  from planning_ai.common.utils import Paths
@@ -12,6 +14,7 @@ def get_schema() -> dict[str, Any]:
12
  "id": pl.Int64,
13
  "method": pl.String,
14
  "text": pl.String,
 
15
  "attachments": pl.List(
16
  pl.Struct(
17
  [
@@ -54,6 +57,65 @@ def process_files(files: list[Path], schema: dict[str, Any]) -> None:
54
  )
55
 
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  def main() -> None:
58
  files = list(Path(Paths.RAW / "gcpt3").glob("*.json"))
59
  schema = get_schema()
 
1
+ import logging
2
  from pathlib import Path
3
  from typing import Any
4
 
5
  import polars as pl
6
+ import requests
7
  from tqdm import tqdm
8
 
9
  from planning_ai.common.utils import Paths
 
14
  "id": pl.Int64,
15
  "method": pl.String,
16
  "text": pl.String,
17
+ "respondentpostcode": pl.String,
18
  "attachments": pl.List(
19
  pl.Struct(
20
  [
 
57
  )
58
 
59
 
60
def download_attachments():
    """Download PDF attachments referenced by the staged gcpt3 parquet.

    Attachments already present in ``Paths.RAW / "pdfs"`` or recorded in
    ``failed_downloads.txt`` are skipped. Only direct ``.pdf`` URLs are
    fetched; egov.scambs.gov.uk links are excluded (presumably they do not
    serve the file directly — confirm with the data owner).
    """
    df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")

    # Fix: ensure the download directory exists before globbing/writing.
    pdf_dir = Paths.RAW / "pdfs"
    pdf_dir.mkdir(parents=True, exist_ok=True)
    existing_files = {int(f.stem) for f in pdf_dir.glob("*.pdf")}

    failed_file_path = Paths.RAW / "failed_downloads.txt"
    failed_files = set()
    if failed_file_path.exists():
        with open(failed_file_path, "r") as file:
            failed_files = {int(line) for line in file.read().splitlines()}

    def _record_failure(attachment_id: int, exc: Exception, label: str) -> None:
        # Remember the failure so later runs do not retry it (de-duplicated
        # from the two near-identical except blocks in the original).
        logging.error(f"{label} for {attachment_id}: {exc}")
        failed_files.add(attachment_id)
        with open(failed_file_path, "a") as file:
            file.write(f"{attachment_id}\n")

    rows = (
        df.drop_nulls(subset="attachments_id")
        .unique(subset="attachments_id")
        .sample(shuffle=True, fraction=1)  # randomize order across runs
        .rows(named=True)
    )
    for row in tqdm(rows):
        attachment_id = int(row["attachments_id"])

        # Fix: the original printed a message for every skip, drowning the
        # tqdm progress bar; skipping is now silent.
        if attachment_id in existing_files or attachment_id in failed_files:
            continue

        url = row["attachments_url"]
        # Fix: a null URL crashed `.endswith` in the original; guard first.
        if not url or not url.endswith(".pdf"):
            continue
        if url.startswith(("https://egov.scambs.gov.uk", "http://egov.scambs.gov.uk")):
            continue

        file_path = pdf_dir / f"{attachment_id}.pdf"
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            _record_failure(attachment_id, e, "RequestException")
        except Exception as e:
            _record_failure(attachment_id, e, "Unexpected error")
        else:
            with open(file_path, "wb") as f:
                f.write(response.content)
            print(f"Downloaded {attachment_id} to {file_path}")
109
+
110
+
111
def convert_txt():
    """Placeholder for converting staged gcpt3 responses into text files.

    NOTE(review): this function is unfinished. The original body loaded the
    parquet and then evaluated an f-string whose result was never assigned or
    written anywhere (dead code). Kept as a documented no-op stub so any
    callers are unaffected; complete the text assembly before relying on it.
    """
    df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")

    # TODO: build per-response text — the original sketch was:
    #   f"{df['text']}\n\nPOSITION: {df['representations_support/object']}"
    # which formats whole Series objects, not row values. Iterate
    # df.rows(named=True) and write one file per response instead.
    _ = df
119
  def main() -> None:
120
  files = list(Path(Paths.RAW / "gcpt3").glob("*.json"))
121
  schema = get_schema()
planning_ai/preprocessing/process_pdfs.py CHANGED
@@ -2,14 +2,36 @@ import base64
2
  import os
3
  from io import BytesIO
4
 
 
 
5
  import requests
6
  from dotenv import load_dotenv
7
  from pdf2image import convert_from_path
 
8
 
9
  from planning_ai.common.utils import Paths
10
 
11
  load_dotenv()
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  def encode_images_to_base64(images):
15
  image_b64 = []
@@ -44,7 +66,7 @@ def main():
44
  with open("planning_ai/preprocessing/prompts/ocr.txt", "r") as f:
45
  ocr_prompt = f.read()
46
 
47
- for file in pdfs:
48
  if file.stem:
49
  images = convert_from_path(file)
50
  image_b64 = encode_images_to_base64(images)
@@ -58,7 +80,10 @@ def main():
58
 
59
  response = send_request_to_api(messages)
60
  out = response["choices"][0]["message"]["content"]
61
- with open(Paths.STAGING / "pdfs" / f"{file.stem}.txt", "w") as f:
 
 
 
62
  f.write(out)
63
 
64
 
 
2
  import os
3
  from io import BytesIO
4
 
5
+ import cv2
6
+ import numpy as np
7
  import requests
8
  from dotenv import load_dotenv
9
  from pdf2image import convert_from_path
10
+ from tqdm import tqdm
11
 
12
  from planning_ai.common.utils import Paths
13
 
14
  load_dotenv()
15
 
16
def _easyocr_confidence_experiment(pdf_path="data/raw/pdfs/25.pdf"):
    """Debug helper: OCR each page of a PDF with easyocr and print confidences.

    NOTE(review): this was module-level scratch code that ran at import time —
    it loaded an OCR model and read a hard-coded PDF as a side effect of
    importing the module. Wrapped in a private function (path parameterized)
    so importing this module is side-effect free; call it explicitly when
    experimenting.

    Returns the mean detection confidence per page.
    """
    import easyocr  # local import: heavy optional dependency, used only here

    images = convert_from_path(pdf_path)
    reader = easyocr.Reader(lang_list=["en"], gpu=True)

    page_means = []
    for i, image in enumerate(images):
        results = reader.readtext(np.array(image))
        print(f"Page {i+1}:")
        confidences = []
        for result in results:
            confidences.append(result[2])
            print(f"Detected text: {result[1]} (confidence: {result[2]:.2f})")
        # Original computed the mean only for the last page and discarded it;
        # keep one mean per page and return them all.
        page_means.append(np.array(confidences).mean() if confidences else float("nan"))
    return page_means
34
+
35
 
36
  def encode_images_to_base64(images):
37
  image_b64 = []
 
66
  with open("planning_ai/preprocessing/prompts/ocr.txt", "r") as f:
67
  ocr_prompt = f.read()
68
 
69
+ for file in tqdm(pdfs):
70
  if file.stem:
71
  images = convert_from_path(file)
72
  image_b64 = encode_images_to_base64(images)
 
80
 
81
  response = send_request_to_api(messages)
82
  out = response["choices"][0]["message"]["content"]
83
+ outfile = Paths.STAGING / "pdfs" / f"{file.stem}.txt"
84
+ if outfile.exists():
85
+ continue
86
+ with open(outfile, "w") as f:
87
  f.write(out)
88
 
89
 
planning_ai/preprocessing/web_comments.py DELETED
@@ -1,20 +0,0 @@
1
- import polars as pl
2
-
3
- from planning_ai.common.utils import Paths
4
-
5
-
6
def main() -> None:
    """Split each sheet of "web comments.xlsx" into a bullet-list text file.

    For every sheet, keeps the string columns (nulls dropped) and writes each
    column as a ``**Header**`` markdown bullet list into a single
    ``{sheet_name}.txt`` under ``Paths.STAGING / "web"``.
    """
    dfs = pl.read_excel(Paths.RAW / "web comments.xlsx", sheet_id=0)

    # Fix: the output directory was assumed to exist.
    out_dir = Paths.STAGING / "web"
    out_dir.mkdir(parents=True, exist_ok=True)

    for sheet_name, df in dfs.items():
        string_df = df.select(pl.col(pl.String)).drop_nulls()
        sections = []
        for col in string_df.columns:
            series = string_df[col]
            # "**Header**" followed by "\n\n* item" for each entry.
            sections.append(
                f"**{series.name}**" + "\n\n* ".join(["\n"] + series.to_list())
            )
        # Fix: the original reopened {sheet_name}.txt in "w" mode once per
        # column, so each column clobbered the previous one and only the last
        # column survived. Assemble all columns, then write the file once.
        (out_dir / f"{sheet_name}.txt").write_text(
            "\n\n".join(sections), encoding="utf-8"
        )


if __name__ == "__main__":
    main()