# Hugging Face Space: kgauvin603/rag-10k-analysis
# Space status when captured: Sleeping
# File size: 4,094 bytes

# Import the necessary libraries
import subprocess
import sys

# Function to install a package using pip
def install(package):
    """Install ``package`` with pip for the interpreter running this script.

    Runs ``<sys.executable> -m pip install <package>``.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    pip_install_cmd = [sys.executable, "-m", "pip", "install"]
    pip_install_cmd.append(package)
    subprocess.check_call(pip_install_cmd)

# Install required packages, one pip invocation each, in the order listed.
# The first failing install is reported and the remaining ones are skipped;
# the script then continues regardless.
_required_packages = (
    "gradio",
    "openai==1.23.2",
    "tiktoken==0.6.0",
    "pypdf==4.0.1",
    "langchain==0.1.1",
    "langchain-community==0.0.13",
    "chromadb==0.4.22",
    "sentence-transformers==2.3.1",
)
try:
    for _requirement in _required_packages:
        install(_requirement)
except subprocess.CalledProcessError as install_error:
    print(f"An error occurred: {install_error}")

import gradio as gr
import os
import uuid
import json
import pandas as pd
import subprocess
from openai import OpenAI
from huggingface_hub import HfApi
from huggingface_hub import CommitScheduler
from huggingface_hub import hf_hub_download
import zipfile
# Define your repository and file path
# NOTE(review): this value is not read anywhere in the visible part of the
# script; `repo_id` is rebound to "kgauvin603/10k-reports" further down,
# before the dataset download URL is built.
repo_id = "kgauvin603/rag-10k"
#file_path = "dataset.zip"

# Download the file
#downloaded_file = hf_hub_download(repo_id, file_path)

# Print the path to the downloaded file
#print(f"Downloaded file is located at: {downloaded_file}")

from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings
)
from langchain_community.vectorstores import Chroma
#from google.colab import userdata, drive
from pathlib import Path
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json
import tiktoken
import pandas as pd
import tiktoken

# Progress marker printed once the imports above have completed
print("Pass 1")

# Embedding model backed by the 'thenlper/gte-large' sentence-transformer
embedding_model = SentenceTransformerEmbeddings(
    model_name="thenlper/gte-large",
)

# If the dataset directory exists, remove it and all of its contents

#if os.path.exists('dataset'):
#  !rm -rf dataset

# If collection_db exists, remove it and all of the contents within

#if os.path.exists('collection_db'):
#    !rm -rf collection_db

#Mount the Google Drive
#drive.mount('/content/drive')

# Upload Dataset-10k.zip and unzip it into the dataset folder using the -d option
#!unzip Dataset-10k.zip -d dataset

import subprocess

# Command to install the extra packages used by the download step below.
# It is an argument list run without a shell, and `sys.executable -m pip`
# targets the interpreter running this script (the same approach as
# install() above) instead of whichever `pip` happens to be first on PATH.
command = [
    sys.executable, "-m", "pip", "install",
    "transformers", "huggingface_hub", "requests",
]
# Execute the command; a failed install is reported but does not stop the script
try:
    subprocess.run(command, check=True)
except subprocess.CalledProcessError as e:
    print(f"An error occurred: {e}")
    
from huggingface_hub import hf_hub_download
import zipfile
import os
import requests

# Progress marker printed once the download-related imports above have completed
print("Pass 2")

    
#repo_id = "kgauvin603/10k-reports"
#file_path = "dataset"
# Get the URL for the file in the repository
#file_url = f"https://huggingface.co/{repo_id}/resolve/main/{file_path}"
#print(file_url)
# Command to unzip the file
#command = "unzip kgauvin603/10k-reports/Dataset-10k.zip -d dataset"
# Execute the command
#try:
#    subprocess.run(command, check=True, shell=True)
#except subprocess.CalledProcessError as e:
#    print(f"An error occurred: {e}")

#https://huggingface.co/datasets/kgauvin603/10k-reports

# Dataset repository on the Hugging Face Hub and the archive stored in it
repo_id = "kgauvin603/10k-reports"
file_path = "Dataset-10k.zip"

# Construct the direct-download URL for the archive. The "/resolve/main/"
# segment addresses the raw file on the main branch; without it the URL does
# not point at the stored file, so the response would not be a zip archive.
file_url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/{file_path}"
print(f"File URL: {file_url}")

import io  # needed for io.BytesIO below; not imported in the script above

# Download the zip file. The timeout keeps a stalled connection from hanging
# the script indefinitely (requests applies no timeout by default).
response = requests.get(file_url, timeout=60)
response.raise_for_status()  # Ensure the request was successful

# Unzip the file in memory: the downloaded bytes are wrapped in a file-like
# buffer so nothing has to be written to disk first.
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
    # List the files in the zip archive
    zip_file_list = zip_ref.namelist()
    print(f"Files in the zip archive: {zip_file_list}")

    # Read each member directly from the in-memory archive
    for file_name in zip_file_list:
        with zip_ref.open(file_name) as file:
            content = file.read()
            print(f"Content of {file_name}: {content[:100]}...")  # Print the first 100 bytes of each file

# If you need to save the extracted files to disk, you can do so as follows:
# Define the extraction path
extraction_path = "./dataset"
import os