Spaces:
Runtime error
Runtime error
AWS S3 Integration
Browse files- .ipynb_checkpoints/app-checkpoint.py +22 -3
- .ipynb_checkpoints/requirements-checkpoint.txt +6 -1
- app.py +19 -3
- requirements.txt +2 -0
.ipynb_checkpoints/app-checkpoint.py
CHANGED
@@ -2,20 +2,39 @@ import os
|
|
2 |
import openai
|
3 |
import gradio as gr
|
4 |
import pdfplumber
|
|
|
5 |
from llama_index.core import Document, VectorStoreIndex, Settings
|
6 |
from llama_index.llms.openai import OpenAI
|
7 |
from llama_index.embeddings.openai import OpenAIEmbedding
|
8 |
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
|
9 |
from llama_index.core.node_parser import SentenceWindowNodeParser
|
10 |
|
|
|
|
|
|
|
11 |
# Read the OpenAI API key from the OPENAI_API_KEY environment variable (do not hard-code it here)
|
12 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
13 |
openai.api_key = OPENAI_API_KEY
|
14 |
|
15 |
-
#
|
|
|
|
|
|
|
|
|
16 |
resume_path = 'resumes'
|
17 |
-
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
# Function to load PDFs using pdfplumber
|
21 |
def load_pdfs_with_pdfplumber(directory):
|
|
|
2 |
import openai
|
3 |
import gradio as gr
|
4 |
import pdfplumber
|
5 |
+
import boto3
|
6 |
from llama_index.core import Document, VectorStoreIndex, Settings
|
7 |
from llama_index.llms.openai import OpenAI
|
8 |
from llama_index.embeddings.openai import OpenAIEmbedding
|
9 |
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
|
10 |
from llama_index.core.node_parser import SentenceWindowNodeParser
|
11 |
|
12 |
+
from dotenv import load_dotenv
|
13 |
+
load_dotenv("config.env")
|
14 |
+
|
15 |
# Read the OpenAI API key from the OPENAI_API_KEY environment variable (do not hard-code it here)
|
16 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
17 |
openai.api_key = OPENAI_API_KEY
|
18 |
|
19 |
+
# AWS S3 setup
|
20 |
+
s3_bucket_name = "sagemaker-studio-gm4vm5dimae"
|
21 |
+
s3_client = boto3.client('s3')
|
22 |
+
|
23 |
+
# Directory to store downloaded PDFs
|
24 |
resume_path = 'resumes'
|
25 |
+
os.makedirs(resume_path, exist_ok=True)
|
26 |
+
|
27 |
+
# Function to download PDFs from S3
|
28 |
+
def download_pdfs_from_s3(bucket_name, local_path):
    """Download every PDF object in an S3 bucket into a local directory.

    Each object key is mirrored below ``local_path``: a key such as
    ``team/cv.pdf`` is written to ``<local_path>/team/cv.pdf``.

    Args:
        bucket_name: Name of the S3 bucket to read from.
        local_path: Local directory that receives the downloaded files.

    Raises:
        Whatever the module-level ``s3_client`` raises when listing or
        downloading fails (e.g. missing credentials or permissions).
    """
    # A single list_objects_v2 call returns at most 1000 keys; the paginator
    # follows continuation tokens so larger buckets are listed completely.
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name):
        for obj in page.get('Contents', []):
            file_name = obj['Key']
            # Skip anything that is not a PDF. This also skips zero-byte
            # "folder" placeholder keys ending in '/', which cannot be
            # written as files.
            if not file_name.lower().endswith('.pdf'):
                continue
            local_file_path = os.path.join(local_path, file_name)
            # Keys may contain '/' prefixes; download_file does not create
            # the intermediate directories itself.
            parent_dir = os.path.dirname(local_file_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            s3_client.download_file(bucket_name, file_name, local_file_path)
            print(f"Downloaded {file_name} to {local_file_path}")
|
35 |
+
|
36 |
+
# Download PDFs
|
37 |
+
download_pdfs_from_s3(s3_bucket_name, resume_path)
|
38 |
|
39 |
# Function to load PDFs using pdfplumber
|
40 |
def load_pdfs_with_pdfplumber(directory):
|
.ipynb_checkpoints/requirements-checkpoint.txt
CHANGED
@@ -1 +1,6 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
pdfplumber
|
3 |
+
openai
|
4 |
+
llama_index
|
5 |
+
boto3
|
6 |
+
|
app.py
CHANGED
@@ -2,6 +2,7 @@ import os
|
|
2 |
import openai
|
3 |
import gradio as gr
|
4 |
import pdfplumber
|
|
|
5 |
from llama_index.core import Document, VectorStoreIndex, Settings
|
6 |
from llama_index.llms.openai import OpenAI
|
7 |
from llama_index.embeddings.openai import OpenAIEmbedding
|
@@ -15,10 +16,25 @@ load_dotenv("config.env")
|
|
15 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
16 |
openai.api_key = OPENAI_API_KEY
|
17 |
|
18 |
-
#
|
|
|
|
|
|
|
|
|
19 |
resume_path = 'resumes'
|
20 |
-
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
# Function to load PDFs using pdfplumber
|
24 |
def load_pdfs_with_pdfplumber(directory):
|
|
|
2 |
import openai
|
3 |
import gradio as gr
|
4 |
import pdfplumber
|
5 |
+
import boto3
|
6 |
from llama_index.core import Document, VectorStoreIndex, Settings
|
7 |
from llama_index.llms.openai import OpenAI
|
8 |
from llama_index.embeddings.openai import OpenAIEmbedding
|
|
|
16 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
17 |
openai.api_key = OPENAI_API_KEY
|
18 |
|
19 |
+
# AWS S3 setup
|
20 |
+
s3_bucket_name = "sagemaker-studio-gm4vm5dimae"
|
21 |
+
s3_client = boto3.client('s3')
|
22 |
+
|
23 |
+
# Directory to store downloaded PDFs
|
24 |
resume_path = 'resumes'
|
25 |
+
os.makedirs(resume_path, exist_ok=True)
|
26 |
+
|
27 |
+
# Function to download PDFs from S3
|
28 |
+
def download_pdfs_from_s3(bucket_name, local_path):
    """Download every PDF object in an S3 bucket into a local directory.

    Each object key is mirrored below ``local_path``: a key such as
    ``team/cv.pdf`` is written to ``<local_path>/team/cv.pdf``.

    Args:
        bucket_name: Name of the S3 bucket to read from.
        local_path: Local directory that receives the downloaded files.

    Raises:
        Whatever the module-level ``s3_client`` raises when listing or
        downloading fails (e.g. missing credentials or permissions).
    """
    # A single list_objects_v2 call returns at most 1000 keys; the paginator
    # follows continuation tokens so larger buckets are listed completely.
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name):
        for obj in page.get('Contents', []):
            file_name = obj['Key']
            # Skip anything that is not a PDF. This also skips zero-byte
            # "folder" placeholder keys ending in '/', which cannot be
            # written as files.
            if not file_name.lower().endswith('.pdf'):
                continue
            local_file_path = os.path.join(local_path, file_name)
            # Keys may contain '/' prefixes; download_file does not create
            # the intermediate directories itself.
            parent_dir = os.path.dirname(local_file_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            s3_client.download_file(bucket_name, file_name, local_file_path)
            print(f"Downloaded {file_name} to {local_file_path}")
|
35 |
+
|
36 |
+
# Download PDFs
|
37 |
+
download_pdfs_from_s3(s3_bucket_name, resume_path)
|
38 |
|
39 |
# Function to load PDFs using pdfplumber
|
40 |
def load_pdfs_with_pdfplumber(directory):
|
requirements.txt
CHANGED
@@ -2,3 +2,5 @@ gradio
|
|
2 |
pdfplumber
|
3 |
openai
|
4 |
llama_index
|
|
|
|
|
|
2 |
pdfplumber
|
3 |
openai
|
4 |
llama_index
|
5 |
+
boto3
|
6 |
+
|