Spaces:
Build error
Build error
add processing for gcpt3
Browse files
planning_ai/common/utils.py
CHANGED
@@ -19,12 +19,13 @@ class Paths:
|
|
19 |
OUT = DATA / "out"
|
20 |
|
21 |
SUMMARY = OUT / "summary"
|
|
|
22 |
|
23 |
PROMPTS = Path("planning_ai/chains/prompts")
|
24 |
|
25 |
@classmethod
|
26 |
def ensure_directories_exist(cls):
|
27 |
-
for path in [cls.DATA, cls.RAW, cls.STAGING, cls.OUT, cls.SUMMARY]:
|
28 |
path.mkdir(parents=True, exist_ok=True)
|
29 |
|
30 |
|
|
|
19 |
OUT = DATA / "out"
|
20 |
|
21 |
SUMMARY = OUT / "summary"
|
22 |
+
SUMMARIES = OUT / "summaries"
|
23 |
|
24 |
PROMPTS = Path("planning_ai/chains/prompts")
|
25 |
|
26 |
@classmethod
def ensure_directories_exist(cls):
    """Create the project's data directories if they are missing.

    Covers DATA, RAW, STAGING, OUT, SUMMARY and SUMMARIES. Missing parent
    directories are created as well, and directories that already exist
    are left untouched.
    """
    required = (
        cls.DATA,
        cls.RAW,
        cls.STAGING,
        cls.OUT,
        cls.SUMMARY,
        cls.SUMMARIES,
    )
    for directory in required:
        directory.mkdir(parents=True, exist_ok=True)
|
30 |
|
31 |
|
planning_ai/preprocessing/gclp.py
DELETED
@@ -1,22 +0,0 @@
|
|
1 |
-
import polars as pl
|
2 |
-
|
3 |
-
from planning_ai.common.utils import Paths
|
4 |
-
|
5 |
-
|
6 |
-
def main():
    """Write one staged text file per GCLP questionnaire respondent.

    Reads the redacted questionnaire workbook from ``Paths.RAW``, keeps the
    respondent id column plus the free-text answer columns (selected by
    position: column 0, columns 6-12 and column 33), and writes each
    respondent's answers to ``Paths.STAGING / "gclp" / "<UserNo>.txt"`` as
    bold question headings followed by the answer text.
    """
    df = pl.read_excel(
        Paths.RAW / "gclp-first-proposals-questionnaire-responses-redacted.xlsx"
    )

    free_cols = [df.columns[0]] + df.columns[6:13] + [df.columns[33]]
    df = df[free_cols]

    # The output directory is not one of the folders created by
    # Paths.ensure_directories_exist, so make sure it exists before writing.
    out_dir = Paths.STAGING / "gclp"
    out_dir.mkdir(parents=True, exist_ok=True)

    for row in df.rows(named=True):
        user = row.pop("UserNo")
        # "-" marks an unanswered question; empty cells come back as None and
        # would otherwise be written out as the literal text "None".
        content = "\n\n".join(
            f"**{k}**\n\n{v}"
            for k, v in row.items()
            if v is not None and v != "-"
        )
        with open(out_dir / f"{user}.txt", "w", encoding="utf-8") as f:
            f.write(content)


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
planning_ai/preprocessing/gcpt3.py
CHANGED
@@ -1,7 +1,9 @@
|
|
|
|
1 |
from pathlib import Path
|
2 |
from typing import Any
|
3 |
|
4 |
import polars as pl
|
|
|
5 |
from tqdm import tqdm
|
6 |
|
7 |
from planning_ai.common.utils import Paths
|
@@ -12,6 +14,7 @@ def get_schema() -> dict[str, Any]:
|
|
12 |
"id": pl.Int64,
|
13 |
"method": pl.String,
|
14 |
"text": pl.String,
|
|
|
15 |
"attachments": pl.List(
|
16 |
pl.Struct(
|
17 |
[
|
@@ -54,6 +57,65 @@ def process_files(files: list[Path], schema: dict[str, Any]) -> None:
|
|
54 |
)
|
55 |
|
56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
def main() -> None:
|
58 |
files = list(Path(Paths.RAW / "gcpt3").glob("*.json"))
|
59 |
schema = get_schema()
|
|
|
1 |
+
import logging
|
2 |
from pathlib import Path
|
3 |
from typing import Any
|
4 |
|
5 |
import polars as pl
|
6 |
+
import requests
|
7 |
from tqdm import tqdm
|
8 |
|
9 |
from planning_ai.common.utils import Paths
|
|
|
14 |
"id": pl.Int64,
|
15 |
"method": pl.String,
|
16 |
"text": pl.String,
|
17 |
+
"respondentpostcode": pl.String,
|
18 |
"attachments": pl.List(
|
19 |
pl.Struct(
|
20 |
[
|
|
|
57 |
)
|
58 |
|
59 |
|
60 |
+
def download_attachments():
    """Download the PDF attachments referenced by the staged GCPT3 data.

    Reads ``Paths.STAGING / "gcpt3.parquet"`` and, for every distinct
    non-null ``attachments_id``, fetches ``attachments_url`` into
    ``Paths.RAW / "pdfs" / "<attachments_id>.pdf"``.

    An attachment is skipped when:
      * a PDF with that id is already on disk,
      * its id is listed in ``Paths.RAW / "failed_downloads.txt"``,
      * its URL is missing, does not end in ``.pdf``, or points at
        ``egov.scambs.gov.uk`` (http or https).

    Failures are best-effort: the error is logged, the id is appended to
    ``failed_downloads.txt`` so later runs do not retry it, and the loop
    carries on with the next attachment.
    """
    df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")

    # Create the target directory up front. Without it every write raises
    # FileNotFoundError and the attachment is recorded as a permanent failure.
    pdf_dir = Paths.RAW / "pdfs"
    pdf_dir.mkdir(parents=True, exist_ok=True)

    # Only numeric stems are attachment ids; ignore any other PDF that
    # happens to live in the folder instead of crashing on int().
    existing_files = {
        int(f.stem) for f in pdf_dir.glob("*.pdf") if f.stem.isdigit()
    }

    failed_file_path = Paths.RAW / "failed_downloads.txt"
    failed_files = set()
    if failed_file_path.exists():
        with open(failed_file_path, "r") as file:
            # split() tolerates blank lines and trailing whitespace.
            failed_files = {
                int(token) for token in file.read().split() if token.isdigit()
            }

    def record_failure(failed_id):
        """Remember a failed id both in memory and in the on-disk log."""
        failed_files.add(failed_id)
        with open(failed_file_path, "a") as file:
            file.write(f"{failed_id}\n")

    excluded_prefixes = (
        "https://egov.scambs.gov.uk",
        "http://egov.scambs.gov.uk",
    )

    rows = (
        df.drop_nulls(subset="attachments_id")
        .unique(subset="attachments_id")
        .sample(shuffle=True, fraction=1)
        .rows(named=True)
    )
    for row in tqdm(rows):
        attachment_id = int(row["attachments_id"])

        if attachment_id in existing_files or attachment_id in failed_files:
            print(f"Skipping {attachment_id} (already exists or previously failed)")
            continue

        # Only attachments_id is null-filtered above, so the URL may be None.
        url = row["attachments_url"]
        if not url or not url.endswith(".pdf") or url.startswith(excluded_prefixes):
            continue

        file_path = pdf_dir / f"{attachment_id}.pdf"
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()

            with open(file_path, "wb") as f:
                f.write(response.content)
            print(f"Downloaded {attachment_id} to {file_path}")

        except requests.RequestException as e:
            logging.error(f"RequestException for {attachment_id}: {e}")
            record_failure(attachment_id)
            print(f"Skipping {attachment_id} due to error: {e}")

        except Exception as e:
            logging.error(f"Unexpected error for {attachment_id}: {e}")
            # A failed write can leave a truncated file behind, which the next
            # run would mistake for a completed download.
            file_path.unlink(missing_ok=True)
            record_failure(attachment_id)
            print(f"Unexpected error for {attachment_id}: {e}")
|
109 |
+
|
110 |
+
|
111 |
+
def convert_txt():
    """Load the staged GCPT3 parquet and format its text and position columns.

    Work-in-progress stub: nothing is returned and nothing is written.
    """
    frame = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")

    # attachment_txt =

    text_column = frame["text"]
    position_column = frame["representations_support/object"]
    # NOTE(review): this formats the two whole columns into one string and
    # then discards it; presumably a per-row string was intended - confirm.
    f"{text_column}\n\nPOSITION: {position_column}"
|
117 |
+
|
118 |
+
|
119 |
def main() -> None:
|
120 |
files = list(Path(Paths.RAW / "gcpt3").glob("*.json"))
|
121 |
schema = get_schema()
|
planning_ai/preprocessing/process_pdfs.py
CHANGED
@@ -2,14 +2,36 @@ import base64
|
|
2 |
import os
|
3 |
from io import BytesIO
|
4 |
|
|
|
|
|
5 |
import requests
|
6 |
from dotenv import load_dotenv
|
7 |
from pdf2image import convert_from_path
|
|
|
8 |
|
9 |
from planning_ai.common.utils import Paths
|
10 |
|
11 |
load_dotenv()
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
def encode_images_to_base64(images):
|
15 |
image_b64 = []
|
@@ -44,7 +66,7 @@ def main():
|
|
44 |
with open("planning_ai/preprocessing/prompts/ocr.txt", "r") as f:
|
45 |
ocr_prompt = f.read()
|
46 |
|
47 |
-
for file in pdfs:
|
48 |
if file.stem:
|
49 |
images = convert_from_path(file)
|
50 |
image_b64 = encode_images_to_base64(images)
|
@@ -58,7 +80,10 @@ def main():
|
|
58 |
|
59 |
response = send_request_to_api(messages)
|
60 |
out = response["choices"][0]["message"]["content"]
|
61 |
-
|
|
|
|
|
|
|
62 |
f.write(out)
|
63 |
|
64 |
|
|
|
2 |
import os
|
3 |
from io import BytesIO
|
4 |
|
5 |
+
import cv2
|
6 |
+
import numpy as np
|
7 |
import requests
|
8 |
from dotenv import load_dotenv
|
9 |
from pdf2image import convert_from_path
|
10 |
+
from tqdm import tqdm
|
11 |
|
12 |
from planning_ai.common.utils import Paths
|
13 |
|
14 |
load_dotenv()
|
15 |
|
16 |
+
import easyocr
|
17 |
+
from pdf2image import convert_from_path
|
18 |
+
|
19 |
+
# Scratch OCR quality check. It runs at import time against one hard-coded
# PDF and prints every detected text fragment with its confidence.
pdf_path = "data/raw/pdfs/25.pdf"
# pdf_path = "../../data/raw/pdfs/26.pdf"
images = convert_from_path(pdf_path)

reader = easyocr.Reader(lang_list=["en"], gpu=True)

for i, image in enumerate(images):
    # Each detection is read here as (bounding box, text, confidence).
    results = reader.readtext(np.array(image))
    print(f"Page {i+1}:")
    confidences = []
    for result in results:
        confidences.append(result[2])
        print(f"Detected text: {result[1]} (confidence: {result[2]:.2f})")

    # The mean used to be computed and thrown away. Report it instead, and
    # skip pages with no detections so numpy does not warn about an empty mean.
    if confidences:
        print(f"Mean confidence: {np.mean(confidences):.2f}")
|
34 |
+
|
35 |
|
36 |
def encode_images_to_base64(images):
|
37 |
image_b64 = []
|
|
|
66 |
with open("planning_ai/preprocessing/prompts/ocr.txt", "r") as f:
|
67 |
ocr_prompt = f.read()
|
68 |
|
69 |
+
for file in tqdm(pdfs):
|
70 |
if file.stem:
|
71 |
images = convert_from_path(file)
|
72 |
image_b64 = encode_images_to_base64(images)
|
|
|
80 |
|
81 |
response = send_request_to_api(messages)
|
82 |
out = response["choices"][0]["message"]["content"]
|
83 |
+
outfile = Paths.STAGING / "pdfs" / f"{file.stem}.txt"
|
84 |
+
if outfile.exists():
|
85 |
+
continue
|
86 |
+
with open(outfile, "w") as f:
|
87 |
f.write(out)
|
88 |
|
89 |
|
planning_ai/preprocessing/web_comments.py
DELETED
@@ -1,20 +0,0 @@
|
|
1 |
-
import polars as pl
|
2 |
-
|
3 |
-
from planning_ai.common.utils import Paths
|
4 |
-
|
5 |
-
|
6 |
-
def main():
    """Write one staged text file per sheet of the web-comments workbook.

    Loads every sheet of ``Paths.RAW / "web comments.xlsx"``, keeps the
    string columns, and writes ``Paths.STAGING / "web" / "<sheet>.txt"``
    holding, for each column, a bold column heading followed by that
    column's values as a bullet list.
    """
    # sheet_id=0 loads all sheets, returned as {sheet name: DataFrame}.
    dfs = pl.read_excel(Paths.RAW / "web comments.xlsx", sheet_id=0)

    # The output directory is not one of the folders created by
    # Paths.ensure_directories_exist, so make sure it exists before writing.
    out_dir = Paths.STAGING / "web"
    out_dir.mkdir(parents=True, exist_ok=True)

    for sheet_name, df in dfs.items():
        # NOTE(review): drop_nulls() on the whole frame removes a row from
        # every column as soon as one string column is null - confirm intended.
        string_df = df.select(pl.col(pl.String)).drop_nulls()

        # Collect every column first. Opening the file in "w" mode once per
        # column kept only the last column of each sheet.
        sections = []
        for col in string_df.columns:
            series = string_df[col]
            name = series.name
            sections.append(
                f"**{name}**" + "\n\n* ".join(["\n"] + series.to_list())
            )

        if not sections:
            # No string columns on this sheet: nothing to write.
            continue

        with open(out_dir / f"{sheet_name}.txt", "w", encoding="utf-8") as f:
            f.write("\n\n".join(sections))


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|