cjber committed on
Commit
8d63d9f
·
1 Parent(s): 82bbfd1

add processing for gcpt3

Browse files
planning_ai/common/utils.py CHANGED
@@ -19,12 +19,13 @@ class Paths:
19
  OUT = DATA / "out"
20
 
21
  SUMMARY = OUT / "summary"
 
22
 
23
  PROMPTS = Path("planning_ai/chains/prompts")
24
 
25
  @classmethod
26
  def ensure_directories_exist(cls):
27
- for path in [cls.DATA, cls.RAW, cls.STAGING, cls.OUT, cls.SUMMARY]:
28
  path.mkdir(parents=True, exist_ok=True)
29
 
30
 
 
19
  OUT = DATA / "out"
20
 
21
  SUMMARY = OUT / "summary"
22
+ SUMMARIES = OUT / "summaries"
23
 
24
  PROMPTS = Path("planning_ai/chains/prompts")
25
 
26
  @classmethod
27
  def ensure_directories_exist(cls):
28
+ for path in [cls.DATA, cls.RAW, cls.STAGING, cls.OUT, cls.SUMMARY, cls.SUMMARIES]:
29
  path.mkdir(parents=True, exist_ok=True)
30
 
31
 
planning_ai/preprocessing/gclp.py DELETED
@@ -1,22 +0,0 @@
1
- import polars as pl
2
-
3
- from planning_ai.common.utils import Paths
4
-
5
-
6
def main() -> None:
    """Extract free-text GCLP questionnaire answers into per-user text files.

    Reads the redacted first-proposals questionnaire spreadsheet, keeps the
    user-number column plus the free-text columns, and writes one
    markdown-style ``.txt`` file per respondent under
    ``Paths.STAGING / "gclp"``.
    """
    df = pl.read_excel(
        Paths.RAW / "gclp-first-proposals-questionnaire-responses-redacted.xlsx"
    )

    # Column 0 is the user number; columns 6:13 and 33 hold free-text answers.
    # NOTE(review): positional column selection — verify against the sheet layout.
    free_cols = [df.columns[0]] + df.columns[6:13] + [df.columns[33]]
    df = df[free_cols]

    # Fix: the output directory was assumed to exist (it is not covered by
    # Paths.ensure_directories_exist), so the first run crashed on open().
    out_dir = Paths.STAGING / "gclp"
    out_dir.mkdir(parents=True, exist_ok=True)

    for row in df.rows(named=True):
        user = row.pop("UserNo")
        # "-" marks an unanswered question; those fields are skipped.
        content = "\n\n".join(f"**{k}**\n\n{v}" for k, v in row.items() if v != "-")
        # Fix: explicit encoding instead of the platform default.
        with open(out_dir / f"{user}.txt", "w", encoding="utf-8") as f:
            f.write(content)


if __name__ == "__main__":
    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
planning_ai/preprocessing/gcpt3.py CHANGED
@@ -1,7 +1,9 @@
 
1
  from pathlib import Path
2
  from typing import Any
3
 
4
  import polars as pl
 
5
  from tqdm import tqdm
6
 
7
  from planning_ai.common.utils import Paths
@@ -12,6 +14,7 @@ def get_schema() -> dict[str, Any]:
12
  "id": pl.Int64,
13
  "method": pl.String,
14
  "text": pl.String,
 
15
  "attachments": pl.List(
16
  pl.Struct(
17
  [
@@ -54,6 +57,65 @@ def process_files(files: list[Path], schema: dict[str, Any]) -> None:
54
  )
55
 
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  def main() -> None:
58
  files = list(Path(Paths.RAW / "gcpt3").glob("*.json"))
59
  schema = get_schema()
 
1
+ import logging
2
  from pathlib import Path
3
  from typing import Any
4
 
5
  import polars as pl
6
+ import requests
7
  from tqdm import tqdm
8
 
9
  from planning_ai.common.utils import Paths
 
14
  "id": pl.Int64,
15
  "method": pl.String,
16
  "text": pl.String,
17
+ "respondentpostcode": pl.String,
18
  "attachments": pl.List(
19
  pl.Struct(
20
  [
 
57
  )
58
 
59
 
60
def download_attachments():
    """Download PDF attachments referenced by the staged gcpt3 parquet.

    Attachments already present in ``Paths.RAW / "pdfs"`` or recorded in
    ``failed_downloads.txt`` are skipped. Only direct ``.pdf`` URLs are
    fetched; egov.scambs.gov.uk links are excluded (presumably they do not
    serve the file directly — confirm with the data owner).
    """
    df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")

    # Fix: ensure the download directory exists before globbing/writing.
    pdf_dir = Paths.RAW / "pdfs"
    pdf_dir.mkdir(parents=True, exist_ok=True)
    existing_files = {int(f.stem) for f in pdf_dir.glob("*.pdf")}

    failed_file_path = Paths.RAW / "failed_downloads.txt"
    failed_files = set()
    if failed_file_path.exists():
        with open(failed_file_path, "r") as file:
            failed_files = {int(line) for line in file.read().splitlines()}

    def _record_failure(attachment_id: int, exc: Exception, label: str) -> None:
        # Remember the failure so later runs do not retry it (de-duplicated
        # from the two near-identical except blocks in the original).
        logging.error(f"{label} for {attachment_id}: {exc}")
        failed_files.add(attachment_id)
        with open(failed_file_path, "a") as file:
            file.write(f"{attachment_id}\n")

    rows = (
        df.drop_nulls(subset="attachments_id")
        .unique(subset="attachments_id")
        .sample(shuffle=True, fraction=1)  # randomize order across runs
        .rows(named=True)
    )
    for row in tqdm(rows):
        attachment_id = int(row["attachments_id"])

        # Fix: the original printed a message for every skip, drowning the
        # tqdm progress bar; skipping is now silent.
        if attachment_id in existing_files or attachment_id in failed_files:
            continue

        url = row["attachments_url"]
        # Fix: a null URL crashed `.endswith` in the original; guard first.
        if not url or not url.endswith(".pdf"):
            continue
        if url.startswith(("https://egov.scambs.gov.uk", "http://egov.scambs.gov.uk")):
            continue

        file_path = pdf_dir / f"{attachment_id}.pdf"
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            _record_failure(attachment_id, e, "RequestException")
        except Exception as e:
            _record_failure(attachment_id, e, "Unexpected error")
        else:
            with open(file_path, "wb") as f:
                f.write(response.content)
            print(f"Downloaded {attachment_id} to {file_path}")
109
+
110
+
111
def convert_txt():
    """Placeholder for converting staged gcpt3 responses into text files.

    NOTE(review): this function is unfinished. The original body loaded the
    parquet and then evaluated an f-string whose result was never assigned or
    written anywhere (dead code). Kept as a documented no-op stub so any
    callers are unaffected; complete the text assembly before relying on it.
    """
    df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")

    # TODO: build per-response text — the original sketch was:
    #   f"{df['text']}\n\nPOSITION: {df['representations_support/object']}"
    # which formats whole Series objects, not row values. Iterate
    # df.rows(named=True) and write one file per response instead.
    _ = df
119
  def main() -> None:
120
  files = list(Path(Paths.RAW / "gcpt3").glob("*.json"))
121
  schema = get_schema()
planning_ai/preprocessing/process_pdfs.py CHANGED
@@ -2,14 +2,36 @@ import base64
2
  import os
3
  from io import BytesIO
4
 
 
 
5
  import requests
6
  from dotenv import load_dotenv
7
  from pdf2image import convert_from_path
 
8
 
9
  from planning_ai.common.utils import Paths
10
 
11
  load_dotenv()
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  def encode_images_to_base64(images):
15
  image_b64 = []
@@ -44,7 +66,7 @@ def main():
44
  with open("planning_ai/preprocessing/prompts/ocr.txt", "r") as f:
45
  ocr_prompt = f.read()
46
 
47
- for file in pdfs:
48
  if file.stem:
49
  images = convert_from_path(file)
50
  image_b64 = encode_images_to_base64(images)
@@ -58,7 +80,10 @@ def main():
58
 
59
  response = send_request_to_api(messages)
60
  out = response["choices"][0]["message"]["content"]
61
- with open(Paths.STAGING / "pdfs" / f"{file.stem}.txt", "w") as f:
 
 
 
62
  f.write(out)
63
 
64
 
 
2
  import os
3
  from io import BytesIO
4
 
5
+ import cv2
6
+ import numpy as np
7
  import requests
8
  from dotenv import load_dotenv
9
  from pdf2image import convert_from_path
10
+ from tqdm import tqdm
11
 
12
  from planning_ai.common.utils import Paths
13
 
14
  load_dotenv()
15
 
16
def _easyocr_confidence_experiment(pdf_path="data/raw/pdfs/25.pdf"):
    """Debug helper: OCR each page of a PDF with easyocr and print confidences.

    NOTE(review): this was module-level scratch code that ran at import time —
    it loaded an OCR model and read a hard-coded PDF as a side effect of
    importing the module. Wrapped in a private function (path parameterized)
    so importing this module is side-effect free; call it explicitly when
    experimenting.

    Returns the mean detection confidence per page.
    """
    import easyocr  # local import: heavy optional dependency, used only here

    images = convert_from_path(pdf_path)
    reader = easyocr.Reader(lang_list=["en"], gpu=True)

    page_means = []
    for i, image in enumerate(images):
        results = reader.readtext(np.array(image))
        print(f"Page {i+1}:")
        confidences = []
        for result in results:
            confidences.append(result[2])
            print(f"Detected text: {result[1]} (confidence: {result[2]:.2f})")
        # Original computed the mean only for the last page and discarded it;
        # keep one mean per page and return them all.
        page_means.append(np.array(confidences).mean() if confidences else float("nan"))
    return page_means
34
+
35
 
36
  def encode_images_to_base64(images):
37
  image_b64 = []
 
66
  with open("planning_ai/preprocessing/prompts/ocr.txt", "r") as f:
67
  ocr_prompt = f.read()
68
 
69
+ for file in tqdm(pdfs):
70
  if file.stem:
71
  images = convert_from_path(file)
72
  image_b64 = encode_images_to_base64(images)
 
80
 
81
  response = send_request_to_api(messages)
82
  out = response["choices"][0]["message"]["content"]
83
+ outfile = Paths.STAGING / "pdfs" / f"{file.stem}.txt"
84
+ if outfile.exists():
85
+ continue
86
+ with open(outfile, "w") as f:
87
  f.write(out)
88
 
89
 
planning_ai/preprocessing/web_comments.py DELETED
@@ -1,20 +0,0 @@
1
- import polars as pl
2
-
3
- from planning_ai.common.utils import Paths
4
-
5
-
6
def main() -> None:
    """Split each sheet of "web comments.xlsx" into a bullet-list text file.

    For every sheet, keeps the string columns (nulls dropped) and writes each
    column as a ``**Header**`` markdown bullet list into a single
    ``{sheet_name}.txt`` under ``Paths.STAGING / "web"``.
    """
    dfs = pl.read_excel(Paths.RAW / "web comments.xlsx", sheet_id=0)

    # Fix: the output directory was assumed to exist.
    out_dir = Paths.STAGING / "web"
    out_dir.mkdir(parents=True, exist_ok=True)

    for sheet_name, df in dfs.items():
        string_df = df.select(pl.col(pl.String)).drop_nulls()
        sections = []
        for col in string_df.columns:
            series = string_df[col]
            # "**Header**" followed by "\n\n* item" for each entry.
            sections.append(
                f"**{series.name}**" + "\n\n* ".join(["\n"] + series.to_list())
            )
        # Fix: the original reopened {sheet_name}.txt in "w" mode once per
        # column, so each column clobbered the previous one and only the last
        # column survived. Assemble all columns, then write the file once.
        (out_dir / f"{sheet_name}.txt").write_text(
            "\n\n".join(sections), encoding="utf-8"
        )


if __name__ == "__main__":
    main()