Spaces:
Sleeping
Sleeping
docs: update docs for correct preprocessing
Browse files
Former-commit-id: 588ef403b1ebeefda50a76a56e01c0768e1d89e8 [formerly 13c024539070464b2bf80599a0e8824ae4d29857]
Former-commit-id: 2c77b4f2341951ea4d3008543995d3ef016b1130
- README.md +10 -5
- planning_ai/preprocessing/azure_doc.py +4 -0
- planning_ai/preprocessing/process_pdfs.py +0 -87
README.md
CHANGED
@@ -51,14 +51,19 @@ pip install . # (or uv sync)
|
|
51 |
|
52 |
## Usage
|
53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
1. **Preprocessing**: Run the preprocessing scripts to convert raw data into a format suitable for analysis.
|
55 |
```bash
|
56 |
-
python planning_ai/preprocessing/
|
57 |
-
python planning_ai/preprocessing/
|
58 |
-
python planning_ai/preprocessing/web_comments.py
|
59 |
```
|
60 |
|
61 |
-
2. **Run Graph**: Execute the main script to process the documents and generate
|
62 |
```bash
|
63 |
python planning_ai/main.py
|
64 |
```
|
@@ -66,7 +71,7 @@ pip install . # (or uv sync)
|
|
66 |
## Configuration
|
67 |
|
68 |
- **Environment Variables**: Use a `.env` file to store sensitive information like API keys.
|
69 |
-
- `OPENAI_API_KEY` required for summarisation
|
70 |
- **Constants**: Adjust `Consts` in `planning_ai/common/utils.py` to modify token limits and other settings.
|
71 |
|
72 |
## Workflow
|
|
|
51 |
|
52 |
## Usage
|
53 |
|
54 |
+
This project uses **Streamlit** to provide a simple frontend to the system. Run using:
|
55 |
+
|
56 |
+
`streamlit run app.py`
|
57 |
+
|
58 |
+
Alternatively run everything manually:
|
59 |
+
|
60 |
1. **Preprocessing**: Run the preprocessing scripts to convert raw data into a format suitable for analysis.
|
61 |
```bash
|
62 |
+
python planning_ai/preprocessing/gcpt3.py
|
63 |
+
python planning_ai/preprocessing/azure_doc.py
|
|
|
64 |
```
|
65 |
|
66 |
+
2. **Run Graph**: Execute the main script to process the documents and generate Summary documents.
|
67 |
```bash
|
68 |
python planning_ai/main.py
|
69 |
```
|
|
|
71 |
## Configuration
|
72 |
|
73 |
- **Environment Variables**: Use a `.env` file to store sensitive information like API keys.
|
74 |
+
- `OPENAI_API_KEY` required for summarisation.
|
75 |
- **Constants**: Adjust `Consts` in `planning_ai/common/utils.py` to modify token limits and other settings.
|
76 |
|
77 |
## Workflow
|
planning_ai/preprocessing/azure_doc.py
CHANGED
@@ -88,3 +88,7 @@ def azure_process_pdfs():
|
|
88 |
continue
|
89 |
|
90 |
analyze_document_with_azure(pdf_path, out_pdf, failed_txt)
|
|
|
|
|
|
|
|
|
|
88 |
continue
|
89 |
|
90 |
analyze_document_with_azure(pdf_path, out_pdf, failed_txt)
|
91 |
+
|
92 |
+
|
93 |
+
if __name__ == "__main__":
|
94 |
+
azure_process_pdfs()
|
planning_ai/preprocessing/process_pdfs.py
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
import base64
|
2 |
-
import os
|
3 |
-
from io import BytesIO
|
4 |
-
|
5 |
-
import requests
|
6 |
-
from dotenv import load_dotenv
|
7 |
-
from pdf2image import convert_from_path
|
8 |
-
from PyPDF2 import PdfReader
|
9 |
-
from tqdm import tqdm
|
10 |
-
|
11 |
-
from planning_ai.common.utils import Paths
|
12 |
-
|
13 |
-
load_dotenv()
|
14 |
-
|
15 |
-
|
16 |
-
def encode_images_to_base64(images):
|
17 |
-
image_b64 = []
|
18 |
-
for image in images:
|
19 |
-
buffered = BytesIO()
|
20 |
-
image.save(buffered, format="JPEG")
|
21 |
-
base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
|
22 |
-
image_b64.append(
|
23 |
-
{
|
24 |
-
"type": "image_url",
|
25 |
-
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
26 |
-
}
|
27 |
-
)
|
28 |
-
return image_b64
|
29 |
-
|
30 |
-
|
31 |
-
def send_request_to_api(messages):
|
32 |
-
api_key = os.getenv("OPENAI_API_KEY")
|
33 |
-
headers = {
|
34 |
-
"Content-Type": "application/json",
|
35 |
-
"Authorization": f"Bearer {api_key}",
|
36 |
-
}
|
37 |
-
payload = {"model": "gpt-4o-mini", "messages": messages}
|
38 |
-
response = requests.post(
|
39 |
-
"https://api.openai.com/v1/chat/completions", headers=headers, json=payload
|
40 |
-
)
|
41 |
-
return response.json()
|
42 |
-
|
43 |
-
|
44 |
-
def extract_text_from_pdf(file_path):
|
45 |
-
"""Extracts text from a PDF file using PyPDF2."""
|
46 |
-
try:
|
47 |
-
reader = PdfReader(file_path, strict=True)
|
48 |
-
text = [page.extract_text() or "" for page in reader.pages]
|
49 |
-
return "\n".join(text).strip()
|
50 |
-
except Exception as e:
|
51 |
-
print(e)
|
52 |
-
return None
|
53 |
-
|
54 |
-
|
55 |
-
def main():
|
56 |
-
pdfs = (Paths.RAW / "pdfs").glob("*.pdf")
|
57 |
-
with open("planning_ai/preprocessing/prompts/ocr.txt", "r") as f:
|
58 |
-
ocr_prompt = f.read()
|
59 |
-
|
60 |
-
for file in tqdm(pdfs):
|
61 |
-
outfile = Paths.STAGING / "pdfs" / f"{file.stem}.txt"
|
62 |
-
|
63 |
-
try:
|
64 |
-
images = convert_from_path(file)
|
65 |
-
image_b64 = encode_images_to_base64(images)
|
66 |
-
|
67 |
-
messages = [
|
68 |
-
{
|
69 |
-
"role": "user",
|
70 |
-
"content": [{"type": "text", "text": ocr_prompt}] + image_b64,
|
71 |
-
}
|
72 |
-
]
|
73 |
-
|
74 |
-
response = send_request_to_api(messages)
|
75 |
-
if "choices" not in response:
|
76 |
-
continue
|
77 |
-
out = response["choices"][0]["message"]["content"]
|
78 |
-
if outfile.exists():
|
79 |
-
continue
|
80 |
-
with open(outfile, "w") as f:
|
81 |
-
f.write(out)
|
82 |
-
except:
|
83 |
-
continue
|
84 |
-
|
85 |
-
|
86 |
-
if __name__ == "__main__":
|
87 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|