File size: 6,120 Bytes
4f30460
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import streamlit as st
import pandas as pd
import time
from typing import List, Dict
from serpapi import GoogleSearch
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
import gspread
from google.oauth2.service_account import Credentials
import pandas as pd
import os

credentials = {
    "type": os.getenv("type"),
    "project_id": os.getenv("project_id"),
    "private_key_id": os.getenv("private_key_id"),
    "private_key": os.getenv("private_key"),
    "client_email": os.getenv("client_email"),
    "client_id": os.getenv("client_id"),
    "client_x509_cert_url": os.getenv("client_x509_cert_url"),
    "universe_domain": os.getenv("universe_domain"),
    "auth_uri": os.getenv("auth_uri"),
    "token_uri": os.getenv("token_uri"),
    "auth_provider_x509_cert_url": os.getenv("auth_provider_x509_cert_url")
}
def get_sheet_client():
    """Helper function to create authenticated Google Sheets client"""
    try:
        scope = ["https://www.googleapis.com/auth/spreadsheets"]
        creds = Credentials.from_service_account_info(credentials)
        client = gspread.authorize(creds)
        
        # Get service account email for error messages
        service_account_email = creds.service_account_email
        st.session_state['service_account_email'] = service_account_email
        
        return client
    except FileNotFoundError:
        raise ValueError(
            "credentials.json file not found. Please ensure it exists in the project directory."
        )
    except Exception as e:
        raise ValueError(f"Error setting up Google Sheets client: {str(e)}")

def get_worksheet(sheet_id: str, range_name: str = None):
    """Helper function to get worksheet with improved error handling"""
    try:
        client = get_sheet_client()
        sheet = client.open_by_key(sheet_id)
        return sheet.worksheet(range_name) if range_name else sheet
    except gspread.exceptions.SpreadsheetNotFound:
        service_email = st.session_state.get('service_account_email', 'the service account')
        raise ValueError(
            f"Spreadsheet not found. Please verify:\n"
            f"1. The spreadsheet ID is correct\n"
            f"2. The sheet is shared with {service_email}\n"
            f"3. Sharing permissions allow edit access"
        )
    except gspread.exceptions.WorksheetNotFound:
        raise ValueError(f"Worksheet '{range_name}' not found in the spreadsheet")
    except gspread.exceptions.APIError as e:
        if 'PERMISSION_DENIED' in str(e):
            service_email = st.session_state.get('service_account_email', 'the service account')
            raise ValueError(
                f"Permission denied. Please share the spreadsheet with {service_email} "
                f"and ensure it has edit access."
            )
        raise ValueError(f"Google Sheets API error: {str(e)}")

def process_queries(df: pd.DataFrame, primary_column: str, query_template: str) -> List[Dict]:
    results = []
    
    serpapi_key = os.getenv("SERPAPI_API_KEY")  
    for index, row in df.iterrows():
        try:
            value = row[primary_column]
            query = query_template.replace(f"{{{primary_column}}}", str(value))
            
            # Perform search
            search = GoogleSearch({
                "q": query,
                "gl": "in",
                "api_key": serpapi_key,
                "num": 5  
            })
            search_results = search.get_dict()
            
            # Store results
            results.append({
                primary_column: value,
                "query": query,
                "search_results": search_results.get("organic_results", [])
            })
            
            # Rate limiting
            time.sleep(1) 
            
            
            if index % 10 == 0:  
                st.write(f"Processed {index + 1} queries...")
                
        except Exception as e:
            st.warning(f"Error processing query for {value}: {str(e)}")
            continue
    
    return results

def setup_llm():
    """Setup LangChain with Groq"""
    api_key=os.getenv("GROQ_API_KEY")
    llm = ChatGroq(
        api_key=api_key, 
        model="llama-3.1-8b-instant",
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
    )
    return llm

def process_with_ai(search_results: dict, query: str, llm) -> str:
    template = """
    Extract ONLY the specific information requested from the search results for: {query}
    
    Search Results:
    {search_results}
    
    Provide ONLY the extracted information as a simple text response. 
    If multiple items exist, separate them with semicolons.
    If no relevant information is found, respond with "Not found".
    
    For example:
    - If asked for locations: "Bengaluru; Mumbai; Delhi"
    - If asked for email: "[email protected]"
    - If asked for address: "123 Main Street, City, Country"
    """
    
    prompt = PromptTemplate(
        input_variables=["query", "search_results"],
        template=template
    )
    
    chain = prompt | llm
    response = chain.invoke({"query": query, "search_results": search_results})
    
    return response


def load_google_sheet(sheet_id: str, range_name: str) -> pd.DataFrame:    
    worksheet = get_worksheet(sheet_id,range_name)
    data = worksheet.get_all_records()    
    return pd.DataFrame(data)


def write_to_google_sheet(sheet_id: str, range_name: str, results_df: pd.DataFrame):

    worksheet = get_worksheet(sheet_id, range_name)
    
    all_values = worksheet.get_all_values()
    num_rows = len(all_values)
    next_col_num = len(all_values[0]) + 1
    next_col_letter = chr(64 + next_col_num) 
    
    range = f'{next_col_letter}1:{next_col_letter}{num_rows}'
    
    values = [['AI Results']] + [[str(result)] for result in results_df['result']]
    
    worksheet.update(values, f'{range}')


def get_all_sheet_names(sheet_id: str) -> List[str]:
    
    worksheet = get_worksheet(sheet_id)
    sheets = map(lambda x: x.title, worksheet.worksheets())
    return list(sheets)