Spaces:
Sleeping
Sleeping
Upload 5 files
Browse files- create_embeddings.py +51 -0
- create_embeddings2.py +58 -0
- extract_variables.py +36 -0
- paragraphs2.py +0 -0
- variables.py +159 -0
create_embeddings.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pinecone
|
2 |
+
from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
3 |
+
from paragraphs2 import podcasts
|
4 |
+
import uuid
|
5 |
+
import os

# Credentials come from the environment rather than from literals in this
# file: a key committed to a repository is readable by anyone who can see the
# repository and has to be rotated once it has been published.
# A missing variable raises KeyError here, before any network call is made.
google_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=os.environ["GOOGLE_API_KEY"],
)

# Initialize Pinecone instance
pc = pinecone.Pinecone(
    api_key=os.environ["PINECONE_API_KEY"],
)

# Define the Pinecone index name (make sure it exists in your Pinecone dashboard)
index_name = "iocl2"
index = pc.Index(index_name)
|
18 |
+
|
19 |
+
def create_embedding(variable):
    """Embed one record's description and upsert it into the Pinecone index.

    Args:
        variable: Mapping read with ``.get``. ``"description"`` is the text
            that gets embedded; ``"url"`` may be a string or a list of
            strings (a list is joined with commas); ``"tag"`` is stored as-is.

    Returns:
        None. Outcomes are reported with ``print``. Any exception raised while
        reading the record, embedding, or upserting is caught and printed, so
        one bad record does not stop the caller.
    """
    try:
        content = variable.get("description")
        if not content:
            # Without text there is nothing to embed; say so explicitly
            # instead of letting the embedding call fail on None.
            print("skipped : record has no description")
            return

        # Fall back to "" so a missing or None url/tag is never sent as a
        # null metadata value.
        url = variable.get("url") or ""
        tag = variable.get("tag") or ""
        updated_url = ",".join(url) if isinstance(url, list) else url

        embedding = google_embeddings.embed_query(content)
        vectors = [
            {
                'id': str(uuid.uuid4()),
                'values': embedding,
                'metadata': {
                    'chunk': content,
                    "url": updated_url,
                    "tag": tag,
                },
            }
        ]
        index.upsert(vectors)
        print(f"inserted : {updated_url}")
    except Exception as e:
        # Deliberately broad: this is a best-effort batch step.
        print(f"error occured {e}")
|
45 |
+
|
46 |
+
|
47 |
+
# Runs at module level: embeds and upserts the `podcasts` object imported
# from paragraphs2.
create_embedding(podcasts)
|
48 |
+
|
49 |
+
|
50 |
+
|
51 |
+
|
create_embeddings2.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pinecone
|
2 |
+
from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
3 |
+
from variables import variables # Import the list of variable names
|
4 |
+
import uuid
|
5 |
+
|
6 |
+
import os

# Initialize Google Embeddings
# Credentials come from the environment rather than from literals in this
# file: a key committed to a repository is readable by anyone who can see the
# repository and has to be rotated once it has been published.
# A missing variable raises KeyError here, before any network call is made.
google_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=os.environ["GOOGLE_API_KEY"],
)

# Initialize Pinecone instance
pc = pinecone.Pinecone(
    api_key=os.environ["PINECONE_API_KEY"],
)

# Define the Pinecone index name (make sure it exists in your Pinecone dashboard)
index_name = "iocl2"
index = pc.Index(index_name)
|
20 |
+
|
21 |
+
|
22 |
+
def create_embedding(variable):
    """Embed one record's description and upsert it into the Pinecone index.

    Args:
        variable: Mapping read with ``.get``. ``"description"`` is the text
            that gets embedded; ``"url"`` may be a string or a list of
            strings (a list is joined with commas); ``"tag"`` is stored as-is.

    Returns:
        None. Outcomes are reported with ``print``. Any exception raised while
        reading the record, embedding, or upserting is caught and printed, so
        one bad record does not stop the surrounding loop.
    """
    try:
        content = variable.get("description", None)
        if not content:
            # Without text there is nothing to embed; say so explicitly
            # instead of letting the embedding call fail on None.
            print("Skipped: record has no description")
            return

        # `or ""` also covers keys that are present but set to None, so a
        # null value is never sent as metadata.
        url = variable.get("url", "") or ""
        tag = variable.get("tag", "") or ""
        updated_url = ",".join(url) if isinstance(url, list) else url

        embedding = google_embeddings.embed_query(content)
        vectors = [
            {
                'id': str(uuid.uuid4()),
                'values': embedding,
                'metadata': {
                    'chunk': content,
                    "url": updated_url,
                    "tag": tag,
                },
            }
        ]
        index.upsert(vectors)
        print(f"Inserted the chunk: {updated_url}")
    except Exception as e:
        # Deliberately broad: this is a best-effort batch step.
        print(f"Error occurred: {e}")
|
49 |
+
|
50 |
+
|
51 |
+
import importlib

# Import the content module once; each entry of `variables` is looked up on
# it as a module-level attribute.
paragraphs_module = importlib.import_module("paragraphs2")

# Iterate over the variable names and create embeddings
for variable_name in variables:
    # A name that is not defined in paragraphs2 raises AttributeError here.
    variable_data = getattr(paragraphs_module, variable_name)
    # Log the name being processed, not the module object.
    print(f"trying to create embedding for {variable_name}")
    # Call the create_embedding function with the variable data
    create_embedding(variable_data)
|
extract_variables.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import ast
|
2 |
+
|
3 |
+
def get_dict_variable_names(file_path):
    """Return the names that are assigned a dict literal in a Python file.

    The file is parsed with ``ast``; it is never executed. Every plain
    assignment whose right-hand side is a ``{...}`` literal contributes each
    of its simple-name targets. The whole tree is walked depth-first, so
    assignments inside functions and classes are included, and a name that is
    assigned several times appears once per assignment.

    Args:
        file_path: Path of the UTF-8 encoded Python source file to scan.

    Returns:
        List of variable names in traversal order.
    """
    with open(file_path, 'r', encoding='utf-8') as source_file:
        tree = ast.parse(source_file.read(), filename=file_path)

    found = []

    def collect(current):
        # Record first, then descend: keeps a parent's names ahead of any
        # names found inside its value.
        if isinstance(current, ast.Assign) and isinstance(current.value, ast.Dict):
            found.extend(
                target.id
                for target in current.targets
                if isinstance(target, ast.Name)
            )
        for child in ast.iter_child_nodes(current):
            collect(child)

    collect(tree)
    return found
|
21 |
+
|
22 |
+
def write_variables_to_file(variables, output_file):
    """Write names to a file as a Python list literal called ``variables``.

    The output is ``variables = [``, then one single-quoted name per line
    followed by a comma, then a closing ``]``. An existing file is
    overwritten.

    Args:
        variables: Iterable of names to write.
        output_file: Path of the file to create; written as UTF-8.
    """
    with open(output_file, 'w', encoding='utf-8') as out:
        out.write("variables = [\n")
        out.writelines(f" '{name}',\n" for name in variables)
        out.write("]\n")
|
28 |
+
|
29 |
+
# Example usage
# These statements run at module level: paragraphs2.py is scanned for
# dict-literal assignments and the names found are written to variables.py.
input_file_path = 'paragraphs2.py'  # Replace with your input file path
output_file_path = 'variables.py'  # Replace with your desired output file path

# Collect the names first, then write them out as a `variables = [...]` list.
dict_variables = get_dict_variable_names(input_file_path)
write_variables_to_file(dict_variables, output_file_path)

print(f"Initialized dictionary variables stored in {output_file_path}.")
|
paragraphs2.py
ADDED
The diff for this file is too large to render.
See raw diff
|
|
variables.py
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Names that create_embeddings2.py looks up, one by one, as attributes of the
# paragraphs2 module. Each entry is used verbatim as an attribute name, so
# spellings are left untouched.
# NOTE(review): the layout matches what extract_variables.py writes, so this
# file is presumably generated - confirm before editing by hand.
variables = [
    'about_iocl',
    'awards',
    'corporate_logo',
    'vision_and_values',
    'company_leaders',
    'company_mascot',
    'refining_overview',
    'installed_refinery_capacities',
    'bongaigaon_refinery',
    'barauni_refinery',
    'pradip_refinery',
    'haldia_refinery',
    'mathura_refinery',
    'gujarat_refinery',
    'panipat_refinery',
    'guwahati_refinery',
    'digboi_refinery',
    'pipeline_overview',
    'pipelines_under_implementation',
    'crude_oil_pipelines',
    'petroleum_pipelines',
    'gas_pipelines',
    'pipeline_safety_overview',
    'pipeline_identification',
    'pipeline_emergency_contact',
    'pipeline_leak_response',
    'pipeline_protection',
    'pipeline_crossing_portal',
    'rd_overiview_and_achievements',
    'lubricants',
    'refining_tech_innovations',
    'petrochemicals_polymer_pipeline_maintenance',
    'fuel_additives_and_additional_energy',
    'bioenergy_nano_tech',
    'marketing',
    'pump_locator',
    'petrochemical_overview',
    'petrochemical_strategic_businessUnit_and_achievements',
    'petrochemical_plants',
    'petrochemical_contact',
    'natural_gas',
    'natural_contact',
    'natural_gas_brochure',
    'cgd',
    'eAndp',
    'eAndpContact',
    'iocl_explosives_business_overview',
    'indogel_explosive_brand',
    'explosive_safety_effeciency',
    'current_explosive_plants_and_future_plans',
    'explosive_contact',
    'cryogenics_overview',
    'cryo_product_range',
    'cryo_certifications',
    'cryocan',
    'cryovessel',
    'special_cryo_project',
    'pressure_vessels_cryo',
    'cryo_aviation',
    'iocl_offices',
    'indian_subsidries',
    'foreign_subsidries',
    'joint_ventures',
    'sri_lanka',
    'mauritus',
    'middle_east',
    'chennai',
    'refinery_upcoming_projects',
    'pipeline_projects',
    'cgd_projects',
    'marketin_projects',
    'podcasts',
    'iocl_revenue',
    'xp95',
    'xtraGreen',
    'cng',
    'cng_price',
    'gasoline',
    'high_speed_diesel',
    'xp100',
    'swagat',
    'autogas',
    'fuel_testing',
    'xtrapower_program',
    'xtrarewards',
    'servo_lubricant',
    'lubes_contact',
    'automtive_lubricating_oil',
    'png',
    'png_urls',
    'indane',
    'indane_price',
    'chotu_gas',
    'composite_cylinder',
    'munna_cylinder',
    'kersone',
    'non_fuel_products',
    'surya_nutan',
    'commercial_indane',
    'industrial_png',
    'bulk_fuel',
    'fuel_call',
    'aviation_fuel',
    'aviation_contact',
    'avgas',
    'marine_oils',
    'bitumen',
    'agri_spray_oils',
    'industrial_greases',
    'industrial_lubes',
    'industrial_speciality_oil',
    'metal_working_oil',
    'railroad_grease',
    'non_fuel_alliances',
    'non_pds_keosene',
    'glycols',
    'lab',
    'polymers',
    'pta',
    'special_products',
    'benzene',
    'cbfs',
    'food_grade_hexane',
    'jute_batching_oil',
    'paraffin',
    'propylene',
    'tech_for_licensing',
    'cgm_contact',
    'csr',
    'iocl_foundation',
    'sustainability',
    'iocl_suppliers',
    'iocl_sports',
    'sports_values',
    'sports_legacy',
    'sport_scholarship',
    'sport_equality',
    'family_sport',
    'news',
    'petrol_diesel_price',
    'india_energy_week',
    'iocl_iim',
    'iocl_careers',
    'net_zero',
    'env_management',
    'pollution_control',
    'air_poll',
    'solid_waste',
    'oil_spill',
    'noise_poll',
    'green_belt',
    'energy_effecient',
    'satat_scheme',
    'safety',
    'occupational_health',
    'green_fuel',
    'contact_iocl',
]
|