import urllib.parse
import streamlit as st
import http.client
from bs4 import BeautifulSoup
from openai import OpenAI
import os
import pandas as pd
import pinecone

client = OpenAI(api_key=os.environ['OPENAI'])
pinecone.init(api_key=os.environ['PINECONE'], environment="gcp-starter")
index = pinecone.Index("bens-bites")

# Set the default only once so handle_search() can collapse the sidebar on later reruns.
if 'sidebar_state' not in st.session_state:
    st.session_state.sidebar_state = 'expanded'
st.set_page_config(page_title="Credo Search", layout="wide", initial_sidebar_state=st.session_state.sidebar_state)

# UI text strings
st.session_state.page_title = "Credo Search"
st.session_state.page_helper = "Discover relevant companies and articles!"
semantic_search_header = "Company website"
search_label = "Process"


def get_summary(website):
    """Scrape the company website and ask GPT-4 for a 2-3 sentence summary of what it does."""
    # Mock response kept for local testing:
    # return 'Chainloop is a company that provides an open-source software supply chain control plane, which serves as a single source of truth for artifacts and facilitates a declarative attestation process. Their solution allows SecOps teams to define and manage security compliance and standardization efforts for CI/CD workflows without requiring developers to be security experts. Chainloop integrates with third-party tools for further functions such as SBOM analysis and offers developer-friendly tools to meet compliance demands. It aims to improve software supply chain security, auditability, and observability within organizations.'
    conn = http.client.HTTPSConnection("api.scrapingant.com")
    # ScrapingAnt API key read from the environment instead of hardcoded; the SCRAPINGANT variable name is an assumption.
    conn.request("GET", f"/v2/general?url={urllib.parse.quote_plus(website)}&x-api-key={os.environ['SCRAPINGANT']}")
    res = conn.getresponse()
    data = res.read().decode("utf-8")
    soup = BeautifulSoup(data, "lxml")
    text = soup.body.get_text(strip=False)

    response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[
            {"role": "system", "content": "You are an agent that is trying to figure out what a company does based on an extract of their website. The website might be malformatted but you will have to deal with that. Your task is to read the website and produce 2-3 sentences on what the company does. Try to be as precise as possible. Respond with just the pure description of the company, e.g. Company X deals with ... or other direct forms."},
            {"role": "user", "content": text}
        ]
    )
    return response.choices[0].message.content


def handle_search():
    """Run the full pipeline: summarize the site, build queries, and fetch matches from the index."""
    st.session_state.page_title = 'Processing...'
    st.session_state.page_helper = ''
    print('handle_search')
    st.session_state.summary = get_summary(st.session_state.website)
    print('got summary ' + st.session_state.summary)
    st.session_state.search_queries = get_search_queries(st.session_state.summary)
    print('got queries')
    st.session_state.articles = get_relevant_from_index(st.session_state.search_queries, 'article')
    st.session_state.funding_rounds = get_relevant_from_index(st.session_state.search_queries, 'funding round', 'early')
    st.session_state.funding_rounds_later_stage = get_relevant_from_index(st.session_state.search_queries, 'funding round', 'late')
    print('got relevant companies and articles')
    st.session_state.sidebar_state = 'collapsed'


def render_search():
    """
    Render the search form in the sidebar.
""" search_disabled = True with st.sidebar: st.text_input(label=semantic_search_header, placeholder='elevenlabs.io', key="website") if "website" in st.session_state and st.session_state.website != "": search_disabled = False st.button(label='Search', key="location_search", disabled=search_disabled, on_click=handle_search) st.write("---") def get_search_queries(summary, n = 4): response = client.chat.completions.create( model="gpt-4-1106-preview", messages=[ {"role": "system", "content": f"You are a search assistant. You will receive a summary of what a company does and is. Your task is to create search queries for a vector database that will find the most relevant competitive companies to the one in the description. You should take the summary a product {n} queries for the vector database. These queries should be several words long. Response with just the queries separated by a new line."}, {"role": "user", "content": summary} ] ) return response.choices[0].message.content.split('\n') def get_embedding(text): response = client.embeddings.create( input=text, model="text-embedding-ada-002" ) return response.data[0].embedding def make_clickable(row): if 'URL' in row: url = row['URL'] else: url = row['Website'] if 'Title' in row: text = row['Title'] else: text = row['Company Name'] return f'{text}' def get_relevant_from_index(search_queries, type='article', stage = 'early', limit_funding = 10000000, k = 4): data = [] if type == 'article': for query in search_queries: query_response = index.query( top_k=k, include_values=True, include_metadata=True, vector=get_embedding(query), filter={"type":type} ) for i in query_response['matches']: data.append(i['metadata']) data = pd.DataFrame(data).drop_duplicates(subset=['Title']) data['Title'] = data.apply(make_clickable, axis = 1) return data[['Title', 'Summary']] if type == 'funding round': if stage == 'early': filter = {"$and": [{"type":type}, {"Total Funding Amount": { "$lt": limit_funding }}]} else: filter = {"$and": [{"type":type}, {"Total Funding Amount": { "$gte": limit_funding }}]} for query in search_queries: query_response = index.query( top_k=k, include_values=True, include_metadata=True, vector=get_embedding(query), filter=filter ) for i in query_response['matches']: data.append(i['metadata']) data = pd.DataFrame(data).drop_duplicates(subset=['Company Name']) data['Company Name'] = data.apply(make_clickable, axis = 1) data['Total Funding Amount'] = data['Total Funding Amount'].map('${:,.0f}'.format) return data[['Company Name', 'Company Description', 'Funding Round', 'Total Funding Amount']] def render_search_result(): """ Render the search results on the main content area. 
""" if ("summary" in st.session_state) and ('funding_rounds' in st.session_state) and ('articles' in st.session_state): pd.set_option('display.max_colwidth', None) st.title(st.session_state.website) st.write(st.session_state.summary) # if 'search_queries' in st.session_state: # st.subheader('Search Queries') # for q in st.session_state.search_queries: # st.write(q) st.write("---") st.subheader('Relevant Companies - Early Stage') st.write(st.session_state.funding_rounds.to_html(escape = False, index=False, na_rep=''), unsafe_allow_html = True) st.subheader('Relevant Companies - Later Stage') st.write(st.session_state.funding_rounds_later_stage.to_html(escape = False, index=False, na_rep=''), unsafe_allow_html = True) st.subheader(f"Relevant Ben's Bites") st.write(st.session_state.articles.to_html(escape = False, index=False, na_rep = ''), unsafe_allow_html = True) render_search() if "summary" not in st.session_state: st.title(st.session_state.page_title) st.write(st.session_state.page_helper) else: render_search_result()