from PyPDF2 import PdfReader
import requests
import os
import concurrent.futures
import random
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import google.generativeai as genai
from langchain_core.messages import HumanMessage
from io import BytesIO
from search import search_images

# API keys are read from environment variables instead of being hard-coded in source.
# The variable names below are placeholders; export them before running.
gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key=os.getenv("GOOGLE_API_KEY_1"), temperature=0.1)
gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key=os.getenv("GOOGLE_API_KEY_2"), temperature=0.1)
gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key=os.getenv("GOOGLE_API_KEY_3"), temperature=0.1)
gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key=os.getenv("GOOGLE_API_KEY_4"), temperature=0.1)
genai.configure(api_key=os.getenv("GOOGLE_API_KEY_2"))


def pdf_extractor(link):
    text = ''

    try:
        # Fetch the PDF file from the URL
        response = requests.get(link)
        response.raise_for_status()  # Raise an error for bad status codes

        # Use BytesIO to handle the PDF content in memory
        pdf_file = BytesIO(response.content)

        # Load the PDF file
        reader = PdfReader(pdf_file)

        for page in reader.pages:
            text += page.extract_text() or ''  # Extract text from each page (may be None for scanned pages)

    except requests.exceptions.HTTPError as e:
        print(f'HTTP error occurred: {e}')
    except Exception as e:
        print(f'An error occurred: {e}')

    return [text]


def web_extractor(link):
    text = ''

    try:
        loader = WebBaseLoader(link)
        pages = loader.load_and_split()

        for page in pages:
            text += page.page_content
    except Exception as e:
        print(f'Failed to load {link}: {e}')

    return [text]


def feature_extraction(tag, history, context):
    prompt = f'''
    You are an intelligent assistant tasked with updating product information. You have two data sources:
    1. Tag_History: Previously gathered information about the product.
    2. Tag_Context: New data that might contain additional details.

    Your job is to read the Tag_Context and update the relevant field in the Tag_History with any new details found. The field to be updated is the {tag} FIELD.

    Guidelines:
    - Only add new details that are relevant to the {tag} FIELD.
    - Do not add or modify any other fields in the Tag_History.
    - Ensure your response is in coherent sentences, integrating the new details seamlessly into the existing information.

    Here is the data:

    Tag_Context: {str(context)}

    Tag_History: {history}

    Respond with the updated Tag_History.
    '''

    # Rotate across clients (and therefore API keys) to spread out requests.
    model = random.choice([gemini, gemini1, gemini2, gemini3])
    result = model.invoke(prompt)

    return result.content


def feature_extraction_image(url):
    vision = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=os.getenv("GOOGLE_API_KEY_3"), temperature=0.1)

    message = HumanMessage(content=[
        {"type": "text", "text": "Please describe this image in detail."},
        {"type": "image_url", "image_url": url}
    ])
    text = vision.invoke([message])

    return text.content


def detailed_feature_extraction(find, context):
    prompt = f'''
    You are an intelligent assistant tasked with finding product information. You have one data source and one output format:
    1. Context: The gathered information about the product.
    2. Format: Details which need to be filled based on the Context.

    Your job is to read the Context and update the relevant fields in the Format using the Context.

    Guidelines:
    - Only add details that are relevant to the individual FIELD.
    - Do not add or modify any other fields in the Format.
    - If nothing is found, return None.

    Here is the data:

    The Context is {str(context)}

    The Format is {str(find)}
    '''

    model = random.choice([gemini, gemini1, gemini2, gemini3])
    result = model.invoke(prompt)

    return result.content


def detailed_history(history):
    details = {
        "Introduction": {
            "Product Name": None,
            "Overview of the product": None,
            "Purpose of the manual": None,
            "Audience": None,
            "Additional Details": None
        },
        "Specifications": {
            "Technical specifications": None,
            "Performance metrics": None,
            "Additional Details": None
        },
        "Product Overview": {
            "Product features": None,
            "Key components and parts": None,
            "Additional Details": None
        },
        "Safety Information": {
            "Safety warnings and precautions": None,
            "Compliance and certification information": None,
            "Additional Details": None
        },
        "Installation Instructions": {
            "Unboxing and inventory checklist": None,
            "Step-by-step installation guide": None,
            "Required tools and materials": None,
            "Additional Details": None
        },
        "Setup and Configuration": {
            "Initial setup procedures": None,
            "Configuration settings": None,
            "Troubleshooting setup issues": None,
            "Additional Details": None
        },
        "Operation Instructions": {
            "How to use the product": None,
            "Detailed instructions for different functionalities": None,
            "User interface guide": None,
            "Additional Details": None
        },
        "Maintenance and Care": {
            "Cleaning instructions": None,
            "Maintenance schedule": None,
            "Replacement parts and accessories": None,
            "Additional Details": None
        },
        "Troubleshooting": {
            "Common issues and solutions": None,
            "Error messages and their meanings": None,
            "Support Information": None,
            "Additional Details": None
        },
        "Warranty Information": {
            "Terms and Conditions": None,
            "Service and repair information": None,
            "Additional Details": None
        },
        "Legal Information": {
            "Copyright information": None,
            "Trademarks and patents": None,
            "Disclaimers": None,
            "Additional Details": None
        }
    }

    for key, val in history.items():
        find = details[key]
        details[key] = str(detailed_feature_extraction(find, val))

    return details


def get_embeddings(link, tag_option):
    print(f"\nCreating Embeddings ----- {link}")

    if tag_option == 'Complete Document Similarity':
        history = {
            "Details": ""
        }
    else:
        history = {
            "Introduction": "",
            "Specifications": "",
            "Product Overview": "",
            "Safety Information": "",
            "Installation Instructions": "",
            "Setup and Configuration": "",
            "Operation Instructions": "",
            "Maintenance and Care": "",
            "Troubleshooting": "",
            "Warranty Information": "",
            "Legal Information": ""
        }

    # Extract Text -----------------------------
    print("Extracting Text")
    # Markdown files and "https://en." pages are scraped as web pages; everything else is treated as a PDF.
    if link.endswith('.md') or link[8:11] == 'en.':
        text = web_extractor(link)
    else:
        text = pdf_extractor(link)

    # Create Chunks ----------------------------
    print("Writing Tag Data")

    if tag_option == "Complete Document Similarity":
        history["Details"] = feature_extraction("Details", history["Details"], text[0][:50000])
    else:
        chunks = text_splitter.create_documents(text)

        for chunk in chunks:
            # Update every tag of the history in parallel for the current chunk.
            with concurrent.futures.ThreadPoolExecutor() as executor:
                future_to_key = {
                    executor.submit(
                        feature_extraction, f"Product {key}", history[key], chunk.page_content
                    ): key
                    for key in history
                }
                for future in concurrent.futures.as_completed(future_to_key):
                    key = future_to_key[future]
                    try:
                        response = future.result()
                        history[key] = response
                    except Exception as e:
                        print(f"Error processing {key}: {e}")

    # Create Vectors ---------------------------
    print("Creating Vectors")
    genai_embeddings = []

    for tag in history:
        result = genai.embed_content(
            model="models/embedding-001",
            content=history[tag],
            task_type="retrieval_document")
        genai_embeddings.append(result['embedding'])

    return history, genai_embeddings


def get_image_embeddings(product):
    # Describe the first image found for the product, then embed the description.
    links = search_images(product)[0]
    description = feature_extraction_image(links)

    result = genai.embed_content(
        model="models/embedding-001",
        content=description,
        task_type="retrieval_document")

    return result


text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=10000,
    chunk_overlap=100,
    separators=["", " "]
)

if __name__ == '__main__':
    # print(get_embeddings('https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf', "Complete Document Similarity"))
    print(get_image_embeddings(product='Samsung Galaxy S24'))