Namanj46 committed on
Commit
bea313d
·
1 Parent(s): ef33567

AWS S3 Integration

Browse files
.ipynb_checkpoints/app-checkpoint.py CHANGED
@@ -2,20 +2,39 @@ import os
2
  import openai
3
  import gradio as gr
4
  import pdfplumber
 
5
  from llama_index.core import Document, VectorStoreIndex, Settings
6
  from llama_index.llms.openai import OpenAI
7
  from llama_index.embeddings.openai import OpenAIEmbedding
8
  from llama_index.core.postprocessor import MetadataReplacementPostProcessor
9
  from llama_index.core.node_parser import SentenceWindowNodeParser
10
 
 
 
 
11
  # Set your OpenAI API key here
12
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
13
  openai.api_key = OPENAI_API_KEY
14
 
15
- # Get the current working directory and join with 'resumes'
 
 
 
 
16
  resume_path = 'resumes'
17
- if not os.path.exists(resume_path):
18
- raise ValueError(f"Directory 'resumes' not found")
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  # Function to load PDFs using pdfplumber
21
  def load_pdfs_with_pdfplumber(directory):
 
2
  import openai
3
  import gradio as gr
4
  import pdfplumber
5
+ import boto3
6
  from llama_index.core import Document, VectorStoreIndex, Settings
7
  from llama_index.llms.openai import OpenAI
8
  from llama_index.embeddings.openai import OpenAIEmbedding
9
  from llama_index.core.postprocessor import MetadataReplacementPostProcessor
10
  from llama_index.core.node_parser import SentenceWindowNodeParser
11
 
12
+ from dotenv import load_dotenv
13
+ load_dotenv("config.env")
14
+
15
  # Set your OpenAI API key here
16
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
17
  openai.api_key = OPENAI_API_KEY
18
 
19
+ # AWS S3 setup
20
+ s3_bucket_name = "sagemaker-studio-gm4vm5dimae"
21
+ s3_client = boto3.client('s3')
22
+
23
+ # Directory to store downloaded PDFs
24
  resume_path = 'resumes'
25
+ os.makedirs(resume_path, exist_ok=True)
26
+
27
+ # Function to download PDFs from S3
28
def download_pdfs_from_s3(bucket_name, local_path):
    """Download every object from the S3 bucket *bucket_name* into *local_path*.

    Uses a paginator so buckets holding more than 1000 objects (the
    list_objects_v2 page limit) are listed completely, creates any
    intermediate directories implied by '/'-separated keys, and skips
    zero-byte "folder" placeholder keys that end in '/'.

    Relies on the module-level `s3_client` (boto3 S3 client).
    """
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name):
        for obj in page.get('Contents', []):
            file_name = obj['Key']
            if file_name.endswith('/'):
                # S3 console "folder" placeholder — nothing to download.
                continue
            local_file_path = os.path.join(local_path, file_name)
            # Keys may contain '/' separators; ensure the target directory exists.
            os.makedirs(os.path.dirname(local_file_path) or '.', exist_ok=True)
            s3_client.download_file(bucket_name, file_name, local_file_path)
            print(f"Downloaded {file_name} to {local_file_path}")
35
+
36
+ # Download PDFs
37
+ download_pdfs_from_s3(s3_bucket_name, resume_path)
38
 
39
  # Function to load PDFs using pdfplumber
40
  def load_pdfs_with_pdfplumber(directory):
.ipynb_checkpoints/requirements-checkpoint.txt CHANGED
@@ -1 +1,6 @@
1
- huggingface_hub==0.22.2
 
 
 
 
 
 
1
+ gradio
2
+ pdfplumber
3
+ openai
4
+ llama_index
5
+ boto3
6
+
app.py CHANGED
@@ -2,6 +2,7 @@ import os
2
  import openai
3
  import gradio as gr
4
  import pdfplumber
 
5
  from llama_index.core import Document, VectorStoreIndex, Settings
6
  from llama_index.llms.openai import OpenAI
7
  from llama_index.embeddings.openai import OpenAIEmbedding
@@ -15,10 +16,25 @@ load_dotenv("config.env")
15
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
16
  openai.api_key = OPENAI_API_KEY
17
 
18
- # Get the current working directory and join with 'resumes'
 
 
 
 
19
  resume_path = 'resumes'
20
- if not os.path.exists(resume_path):
21
- raise ValueError(f"Directory 'resumes' not found")
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  # Function to load PDFs using pdfplumber
24
  def load_pdfs_with_pdfplumber(directory):
 
2
  import openai
3
  import gradio as gr
4
  import pdfplumber
5
+ import boto3
6
  from llama_index.core import Document, VectorStoreIndex, Settings
7
  from llama_index.llms.openai import OpenAI
8
  from llama_index.embeddings.openai import OpenAIEmbedding
 
16
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
17
  openai.api_key = OPENAI_API_KEY
18
 
19
+ # AWS S3 setup
20
+ s3_bucket_name = "sagemaker-studio-gm4vm5dimae"
21
+ s3_client = boto3.client('s3')
22
+
23
+ # Directory to store downloaded PDFs
24
  resume_path = 'resumes'
25
+ os.makedirs(resume_path, exist_ok=True)
26
+
27
+ # Function to download PDFs from S3
28
def download_pdfs_from_s3(bucket_name, local_path):
    """Download every object from the S3 bucket *bucket_name* into *local_path*.

    Uses a paginator so buckets holding more than 1000 objects (the
    list_objects_v2 page limit) are listed completely, creates any
    intermediate directories implied by '/'-separated keys, and skips
    zero-byte "folder" placeholder keys that end in '/'.

    Relies on the module-level `s3_client` (boto3 S3 client).
    """
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name):
        for obj in page.get('Contents', []):
            file_name = obj['Key']
            if file_name.endswith('/'):
                # S3 console "folder" placeholder — nothing to download.
                continue
            local_file_path = os.path.join(local_path, file_name)
            # Keys may contain '/' separators; ensure the target directory exists.
            os.makedirs(os.path.dirname(local_file_path) or '.', exist_ok=True)
            s3_client.download_file(bucket_name, file_name, local_file_path)
            print(f"Downloaded {file_name} to {local_file_path}")
35
+
36
+ # Download PDFs
37
+ download_pdfs_from_s3(s3_bucket_name, resume_path)
38
 
39
  # Function to load PDFs using pdfplumber
40
  def load_pdfs_with_pdfplumber(directory):
requirements.txt CHANGED
@@ -2,3 +2,5 @@ gradio
2
  pdfplumber
3
  openai
4
  llama_index
 
 
 
2
  pdfplumber
3
  openai
4
  llama_index
5
+ boto3
6
+