cjber commited on
Commit
8ef72b8
·
1 Parent(s): 945fac4

docs: update docs for correct preprocessing

Browse files

Former-commit-id: 588ef403b1ebeefda50a76a56e01c0768e1d89e8 [formerly 13c024539070464b2bf80599a0e8824ae4d29857]
Former-commit-id: 2c77b4f2341951ea4d3008543995d3ef016b1130

README.md CHANGED
@@ -51,14 +51,19 @@ pip install . # (or uv sync)
51
 
52
  ## Usage
53
 
 
 
 
 
 
 
54
  1. **Preprocessing**: Run the preprocessing scripts to convert raw data into a format suitable for analysis.
55
  ```bash
56
- python planning_ai/preprocessing/process_pdfs.py
57
- python planning_ai/preprocessing/gclp.py
58
- python planning_ai/preprocessing/web_comments.py
59
  ```
60
 
61
- 2. **Run Graph**: Execute the main script to process the documents and generate a Quarto summary document.
62
  ```bash
63
  python planning_ai/main.py
64
  ```
@@ -66,7 +71,7 @@ pip install . # (or uv sync)
66
  ## Configuration
67
 
68
  - **Environment Variables**: Use a `.env` file to store sensitive information like API keys.
69
- - `OPENAI_API_KEY` required for summarisation; `OPENCAGE_API_KEY` required for geocoding (Quarto report)
70
  - **Constants**: Adjust `Consts` in `planning_ai/common/utils.py` to modify token limits and other settings.
71
 
72
  ## Workflow
 
51
 
52
  ## Usage
53
 
54
+ This project uses **Streamlit** to provide a simple frontend for the system. Run it with:
55
+
56
+ `streamlit run app.py`
57
+
58
+ Alternatively run everything manually:
59
+
60
  1. **Preprocessing**: Run the preprocessing scripts to convert raw data into a format suitable for analysis.
61
  ```bash
62
+ python planning_ai/preprocessing/gcpt3.py
63
+ python planning_ai/preprocessing/azure_doc.py
 
64
  ```
65
 
66
+ 2. **Run Graph**: Execute the main script to process the documents and generate summary documents.
67
  ```bash
68
  python planning_ai/main.py
69
  ```
 
71
  ## Configuration
72
 
73
  - **Environment Variables**: Use a `.env` file to store sensitive information like API keys.
74
+ - `OPENAI_API_KEY` required for summarisation.
75
  - **Constants**: Adjust `Consts` in `planning_ai/common/utils.py` to modify token limits and other settings.
76
 
77
  ## Workflow
planning_ai/preprocessing/azure_doc.py CHANGED
@@ -88,3 +88,7 @@ def azure_process_pdfs():
88
  continue
89
 
90
  analyze_document_with_azure(pdf_path, out_pdf, failed_txt)
 
 
 
 
 
88
  continue
89
 
90
  analyze_document_with_azure(pdf_path, out_pdf, failed_txt)
91
+
92
+
93
+ if __name__ == "__main__":
94
+ azure_process_pdfs()
planning_ai/preprocessing/process_pdfs.py DELETED
@@ -1,87 +0,0 @@
1
- import base64
2
- import os
3
- from io import BytesIO
4
-
5
- import requests
6
- from dotenv import load_dotenv
7
- from pdf2image import convert_from_path
8
- from PyPDF2 import PdfReader
9
- from tqdm import tqdm
10
-
11
- from planning_ai.common.utils import Paths
12
-
13
- load_dotenv()
14
-
15
-
16
- def encode_images_to_base64(images):
17
- image_b64 = []
18
- for image in images:
19
- buffered = BytesIO()
20
- image.save(buffered, format="JPEG")
21
- base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
22
- image_b64.append(
23
- {
24
- "type": "image_url",
25
- "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
26
- }
27
- )
28
- return image_b64
29
-
30
-
31
- def send_request_to_api(messages):
32
- api_key = os.getenv("OPENAI_API_KEY")
33
- headers = {
34
- "Content-Type": "application/json",
35
- "Authorization": f"Bearer {api_key}",
36
- }
37
- payload = {"model": "gpt-4o-mini", "messages": messages}
38
- response = requests.post(
39
- "https://api.openai.com/v1/chat/completions", headers=headers, json=payload
40
- )
41
- return response.json()
42
-
43
-
44
- def extract_text_from_pdf(file_path):
45
- """Extracts text from a PDF file using PyPDF2."""
46
- try:
47
- reader = PdfReader(file_path, strict=True)
48
- text = [page.extract_text() or "" for page in reader.pages]
49
- return "\n".join(text).strip()
50
- except Exception as e:
51
- print(e)
52
- return None
53
-
54
-
55
- def main():
56
- pdfs = (Paths.RAW / "pdfs").glob("*.pdf")
57
- with open("planning_ai/preprocessing/prompts/ocr.txt", "r") as f:
58
- ocr_prompt = f.read()
59
-
60
- for file in tqdm(pdfs):
61
- outfile = Paths.STAGING / "pdfs" / f"{file.stem}.txt"
62
-
63
- try:
64
- images = convert_from_path(file)
65
- image_b64 = encode_images_to_base64(images)
66
-
67
- messages = [
68
- {
69
- "role": "user",
70
- "content": [{"type": "text", "text": ocr_prompt}] + image_b64,
71
- }
72
- ]
73
-
74
- response = send_request_to_api(messages)
75
- if "choices" not in response:
76
- continue
77
- out = response["choices"][0]["message"]["content"]
78
- if outfile.exists():
79
- continue
80
- with open(outfile, "w") as f:
81
- f.write(out)
82
- except:
83
- continue
84
-
85
-
86
- if __name__ == "__main__":
87
- main()