akshansh36 committed on
Commit
e282277
·
verified ·
1 Parent(s): d09d051

Upload 5 files

Browse files
create_embeddings.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pinecone
2
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
3
+ from paragraphs2 import podcasts
4
+ import uuid
5
import os

# Credentials are read from the environment instead of being embedded in the
# source. NOTE(review): the keys that used to be hard-coded here were
# published together with this file and should be treated as compromised;
# revoke and rotate them.
_google_api_key = os.environ.get("GOOGLE_API_KEY")
_pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not _google_api_key or not _pinecone_api_key:
    # Fail before any client is built so a missing key is reported clearly
    # rather than as an authentication error from the first API call.
    raise RuntimeError(
        "Set the GOOGLE_API_KEY and PINECONE_API_KEY environment variables "
        "before running this script."
    )

# Embedding client used by create_embedding() to vectorise text.
google_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=_google_api_key,
)

# Initialize Pinecone instance
pc = pinecone.Pinecone(api_key=_pinecone_api_key)

# Define the Pinecone index name (make sure it exists in your Pinecone dashboard)
index_name = "iocl2"
index = pc.Index(index_name)
18
+
19
def create_embedding(variable):
    """Embed one content entry and upsert it into the Pinecone index.

    Args:
        variable: Mapping with a ``description`` entry (the text to embed)
            and optional ``url`` (a string or a list of strings) and ``tag``
            entries.

    Returns:
        None. Success and failure are both reported on stdout; exceptions
        are caught so one bad entry does not abort the script.
    """
    try:
        content = variable.get("description")
        if not content:
            # Nothing to embed: skip instead of sending None/"" to the
            # embedding service and getting an opaque API error back.
            print("error occured missing or empty description")
            return

        # Fall back to "" so a missing key never puts None into the metadata
        # (Pinecone metadata values may not be null).
        url = variable.get("url") or ""
        tag = variable.get("tag") or ""
        # A list of URLs is stored as one comma-separated string.
        updated_url = ",".join(url) if isinstance(url, list) else url

        embedding = google_embeddings.embed_query(content)
        index.upsert([{
            # Random id: every call inserts a new vector.
            'id': str(uuid.uuid4()),
            'values': embedding,
            'metadata': {
                'chunk': content,
                "url": updated_url,
                "tag": tag,
            },
        }])
        print(f"inserted : {updated_url}")
    except Exception as e:
        # Deliberate best-effort: report the failure and carry on.
        print(f"error occured {e}")
45
+
46
+
47
+ create_embedding(podcasts)
48
+
49
+
50
+
51
+
create_embeddings2.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pinecone
2
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
3
+ from variables import variables # Import the list of variable names
4
+ import uuid
5
+
6
import os

# Credentials are read from the environment instead of being embedded in the
# source. NOTE(review): the keys that used to be hard-coded here were
# published together with this file and should be treated as compromised;
# revoke and rotate them.
_google_api_key = os.environ.get("GOOGLE_API_KEY")
_pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not _google_api_key or not _pinecone_api_key:
    # Fail before any client is built so a missing key is reported clearly
    # rather than as an authentication error from the first API call.
    raise RuntimeError(
        "Set the GOOGLE_API_KEY and PINECONE_API_KEY environment variables "
        "before running this script."
    )

# Initialize Google Embeddings
google_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=_google_api_key,
)

# Initialize Pinecone instance
pc = pinecone.Pinecone(api_key=_pinecone_api_key)

# Define the Pinecone index name (make sure it exists in your Pinecone dashboard)
index_name = "iocl2"
index = pc.Index(index_name)
20
+
21
+
22
def create_embedding(variable):
    """Embed one content entry and upsert it into the Pinecone index.

    Args:
        variable: Mapping with a ``description`` entry (the text to embed)
            and optional ``url`` (a string or a list of strings) and ``tag``
            entries.

    Returns:
        None. Success and failure are both reported on stdout; exceptions
        are caught so one bad entry does not abort the whole batch.
    """
    try:
        content = variable.get("description")
        if not content:
            # Nothing to embed: skip instead of sending None/"" to the
            # embedding service and getting an opaque API error back.
            print("Error occurred: missing or empty description")
            return

        # `or ""` also covers keys that are present but explicitly None, so
        # the metadata never carries a null value.
        url = variable.get("url") or ""
        tag = variable.get("tag") or ""
        # A list of URLs is stored as one comma-separated string.
        updated_url = ",".join(url) if isinstance(url, list) else url

        embedding = google_embeddings.embed_query(content)
        index.upsert([{
            # Random id: every call inserts a new vector.
            'id': str(uuid.uuid4()),
            'values': embedding,
            'metadata': {
                'chunk': content,
                "url": updated_url,
                "tag": tag,
            },
        }])
        print(f"Inserted the chunk: {updated_url}")
    except Exception as e:
        # Deliberate best-effort: report the failure and carry on.
        print(f"Error occurred: {e}")
49
+
50
+
51
import importlib

# Load the content module once; every name listed in `variables` is expected
# to be a module-level attribute of paragraphs2.
paragraphs_module = importlib.import_module('paragraphs2')

# Iterate over the variable names and create embeddings
for variable_name in variables:
    variable_data = getattr(paragraphs_module, variable_name, None)
    if variable_data is None:
        # variables.py can be out of date relative to paragraphs2.py; skip the
        # stale name rather than aborting the remaining entries.
        print(f"skipping {variable_name}: not defined in paragraphs2")
        continue
    # Log the entry name (not the module object) so progress is readable.
    print(f"trying to create embedding for {variable_name}")
    # Call the create_embedding function with the variable data
    create_embedding(variable_data)
extract_variables.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+
3
def get_dict_variable_names(file_path):
    """Return the names bound to dict literals at module scope of a Python file.

    The file is parsed with ``ast`` and never executed. Only plain
    assignments whose right-hand side is a dict literal are reported
    (``name = {...}``, including chained ``a = b = {...}``). Assignments
    inside ``if``/``try``/loop bodies at module level are included.
    Assignments inside functions, lambdas or classes are ignored because
    those names are not attributes of the imported module.

    Args:
        file_path: Path of the Python source file to inspect (read as UTF-8).

    Returns:
        list[str]: Variable names in source order, each listed once.

    Raises:
        OSError: If the file cannot be opened.
        SyntaxError: If the file is not valid Python.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        tree = ast.parse(file.read(), filename=file_path)

    # Scopes whose local names are not visible as module attributes.
    nested_scopes = (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef, ast.Lambda)
    dict_variable_names = []
    seen = set()

    def _collect(parent):
        """Record dict-literal assignments below *parent*, skipping nested scopes."""
        for child in ast.iter_child_nodes(parent):
            if isinstance(child, nested_scopes):
                continue
            if isinstance(child, ast.Assign):
                if isinstance(child.value, ast.Dict):
                    for target in child.targets:
                        if isinstance(target, ast.Name) and target.id not in seen:
                            seen.add(target.id)
                            dict_variable_names.append(target.id)
                # An assignment holds only expressions, so there is nothing
                # further to find beneath it.
                continue
            _collect(child)

    _collect(tree)
    return dict_variable_names
21
+
22
def write_variables_to_file(variables, output_file):
    """Write *variables* to *output_file* as a Python list literal.

    The generated file contains a single assignment, ``variables = [...]``,
    with one single-quoted name per line. Any existing file is overwritten.

    Args:
        variables: Iterable of names to record.
        output_file: Path of the file to create (written as UTF-8).
    """
    with open(output_file, 'w', encoding='utf-8') as out:
        out.write("variables = [\n")
        # One quoted entry per line, each followed by a trailing comma.
        out.writelines(f" '{name}',\n" for name in variables)
        out.write("]\n")
28
+
29
# Script body: scan paragraphs2.py for dict-valued assignments and record
# their names in variables.py. Runs whenever this module is executed or
# imported, since there is no __main__ guard.
input_file_path = 'paragraphs2.py' # Replace with your input file path
output_file_path = 'variables.py' # Replace with your desired output file path

# Collect the names first, then overwrite the output file with the list.
dict_variables = get_dict_variable_names(input_file_path)
write_variables_to_file(dict_variables, output_file_path)

print(f"Initialized dictionary variables stored in {output_file_path}.")
paragraphs2.py ADDED
The diff for this file is too large to render. See raw diff
 
variables.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Names of the dict-valued variables defined in paragraphs2.py.
# create_embeddings2.py iterates this list and looks each name up on the
# paragraphs2 module, so every spelling must match the identifier there
# exactly -- do not "correct" apparent typos here alone.
# This file matches the output written by extract_variables.py; regenerate it
# with that script rather than editing by hand.
variables = [
    'about_iocl',
    'awards',
    'corporate_logo',
    'vision_and_values',
    'company_leaders',
    'company_mascot',
    'refining_overview',
    'installed_refinery_capacities',
    'bongaigaon_refinery',
    'barauni_refinery',
    'pradip_refinery',
    'haldia_refinery',
    'mathura_refinery',
    'gujarat_refinery',
    'panipat_refinery',
    'guwahati_refinery',
    'digboi_refinery',
    'pipeline_overview',
    'pipelines_under_implementation',
    'crude_oil_pipelines',
    'petroleum_pipelines',
    'gas_pipelines',
    'pipeline_safety_overview',
    'pipeline_identification',
    'pipeline_emergency_contact',
    'pipeline_leak_response',
    'pipeline_protection',
    'pipeline_crossing_portal',
    'rd_overiview_and_achievements',
    'lubricants',
    'refining_tech_innovations',
    'petrochemicals_polymer_pipeline_maintenance',
    'fuel_additives_and_additional_energy',
    'bioenergy_nano_tech',
    'marketing',
    'pump_locator',
    'petrochemical_overview',
    'petrochemical_strategic_businessUnit_and_achievements',
    'petrochemical_plants',
    'petrochemical_contact',
    'natural_gas',
    'natural_contact',
    'natural_gas_brochure',
    'cgd',
    'eAndp',
    'eAndpContact',
    'iocl_explosives_business_overview',
    'indogel_explosive_brand',
    'explosive_safety_effeciency',
    'current_explosive_plants_and_future_plans',
    'explosive_contact',
    'cryogenics_overview',
    'cryo_product_range',
    'cryo_certifications',
    'cryocan',
    'cryovessel',
    'special_cryo_project',
    'pressure_vessels_cryo',
    'cryo_aviation',
    'iocl_offices',
    'indian_subsidries',
    'foreign_subsidries',
    'joint_ventures',
    'sri_lanka',
    'mauritus',
    'middle_east',
    'chennai',
    'refinery_upcoming_projects',
    'pipeline_projects',
    'cgd_projects',
    'marketin_projects',
    'podcasts',
    'iocl_revenue',
    'xp95',
    'xtraGreen',
    'cng',
    'cng_price',
    'gasoline',
    'high_speed_diesel',
    'xp100',
    'swagat',
    'autogas',
    'fuel_testing',
    'xtrapower_program',
    'xtrarewards',
    'servo_lubricant',
    'lubes_contact',
    'automtive_lubricating_oil',
    'png',
    'png_urls',
    'indane',
    'indane_price',
    'chotu_gas',
    'composite_cylinder',
    'munna_cylinder',
    'kersone',
    'non_fuel_products',
    'surya_nutan',
    'commercial_indane',
    'industrial_png',
    'bulk_fuel',
    'fuel_call',
    'aviation_fuel',
    'aviation_contact',
    'avgas',
    'marine_oils',
    'bitumen',
    'agri_spray_oils',
    'industrial_greases',
    'industrial_lubes',
    'industrial_speciality_oil',
    'metal_working_oil',
    'railroad_grease',
    'non_fuel_alliances',
    'non_pds_keosene',
    'glycols',
    'lab',
    'polymers',
    'pta',
    'special_products',
    'benzene',
    'cbfs',
    'food_grade_hexane',
    'jute_batching_oil',
    'paraffin',
    'propylene',
    'tech_for_licensing',
    'cgm_contact',
    'csr',
    'iocl_foundation',
    'sustainability',
    'iocl_suppliers',
    'iocl_sports',
    'sports_values',
    'sports_legacy',
    'sport_scholarship',
    'sport_equality',
    'family_sport',
    'news',
    'petrol_diesel_price',
    'india_energy_week',
    'iocl_iim',
    'iocl_careers',
    'net_zero',
    'env_management',
    'pollution_control',
    'air_poll',
    'solid_waste',
    'oil_spill',
    'noise_poll',
    'green_belt',
    'energy_effecient',
    'satat_scheme',
    'safety',
    'occupational_health',
    'green_fuel',
    'contact_iocl',
]