danieldux committed on
Commit
2126a12
1 Parent(s): 9696cd0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +126 -29
app.py CHANGED
@@ -6,57 +6,154 @@ import json
6
  import faiss
7
  import numpy as np
8
  import gradio as gr
9
- import os
10
  from FlagEmbedding import BGEM3FlagModel
 
 
11
 
12
  # Define a function to load the ISCO taxonomy
13
  def load_isco_taxonomy(file_path: str) -> list:
14
- with open(file_path, 'r', encoding='utf-8') as file:
15
  isco_data = [json.loads(line.strip()) for line in file]
16
  return isco_data
17
 
 
18
  # Define a function to create a FAISS index
19
- def create_faiss_index(isco_taxonomy, model_name='BAAI/bge-m3'):
20
- model = BGEM3FlagModel(model_name, use_fp16=True)
21
- texts = [str(entry['ESCO_DESCRIPTION']) for entry in isco_taxonomy]
22
- embeddings = model.encode(texts, batch_size=12, max_length=256)['dense_vecs']
23
- embeddings = np.array(embeddings).astype('float32')
 
 
 
 
 
 
 
 
 
24
  dimension = embeddings.shape[1]
25
  index = faiss.IndexFlatL2(dimension)
26
  index.add(embeddings)
27
- faiss.write_index(index, '/data/isco_taxonomy.index')
28
- with open('/data/isco_taxonomy_mapping.json', 'w') as f:
29
  json.dump({i: entry for i, entry in enumerate(isco_taxonomy)}, f)
30
 
 
31
  # Define a function to retrieve and rerank using FAISS
32
- def retrieve_and_rerank_faiss(job_duties, model_name="BAAI/bge-m3", top_k=4):
33
  # Check if isco_taxonomy.index exists, if not, create it with create_faiss_index
34
- # if not os.path.exists("/data/isco_taxonomy.index"):
35
- isco_taxonomy = load_isco_taxonomy('isco_taxonomy.jsonl')
36
- create_faiss_index(isco_taxonomy)
37
  index = faiss.read_index("/data/isco_taxonomy.index")
38
  with open("/data/isco_taxonomy_mapping.json", "r") as f:
39
  isco_taxonomy = json.load(f)
40
- model = BGEM3FlagModel(model_name, use_fp16=True)
41
- query_embedding = model.encode([job_duties], max_length=256)["dense_vecs"]
 
 
 
 
 
 
 
 
42
  query_embedding = np.array(query_embedding).astype("float32")
43
  distances, indices = index.search(query_embedding, top_k)
 
44
  results = [
45
- {
46
- "ISCO_CODE_4": isco_taxonomy[str(idx)]["ISCO_CODE_4"],
47
- "ISCO_LABEL_4": isco_taxonomy[str(idx)]["ISCO_LABEL_4"],
48
- "ESCO_OCCUPATION": isco_taxonomy[str(idx)]["ESCO_OCCUPATION"],
49
- "ESCO_DESCRIPTION": isco_taxonomy[str(idx)]["ESCO_DESCRIPTION"],
50
- "Similarity": distances[0][i]
51
- }
52
  for i, idx in enumerate(indices[0])
53
  ]
54
- return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
- # Gradio Interface
57
- def gradio_interface(job_duties):
58
- results = retrieve_and_rerank_faiss(job_duties)
59
- return results
60
 
61
- iface = gr.Interface(fn=gradio_interface, inputs="text", outputs=gr.outputs.Dataframe(type="pandas"), title="Semantic similarity matches with ESCO descriptions")
62
- iface.launch()
 
6
  import faiss
7
  import numpy as np
8
  import gradio as gr
9
+ import torch
10
  from FlagEmbedding import BGEM3FlagModel
11
+ import os
12
+
13
 
14
  # Define a function to load the ISCO taxonomy
def load_isco_taxonomy(file_path: str) -> list:
    """Load the ISCO taxonomy from a JSON Lines file.

    Each non-blank line of the file is parsed as one JSON document.

    Args:
        file_path: Path to a UTF-8 encoded JSON Lines file.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        # Skip blank lines (e.g. a trailing empty line at the end of the
        # file): json.loads("") raises JSONDecodeError. json.loads already
        # tolerates surrounding whitespace, so no explicit strip is needed.
        return [json.loads(line) for line in file if line.strip()]
19
 
20
+
21
  # Define a function to create a FAISS index
def create_faiss_index(isco_taxonomy, model_name="BAAI/bge-m3"):
    """Embed the taxonomy descriptions and persist a FAISS index and mapping.

    The ``ESCO_DESCRIPTION`` of every entry is encoded with the BGE-M3 model,
    the dense vectors are added to a flat L2 index, and two files are written:
    ``/data/isco_taxonomy_mapping.json`` (row number -> taxonomy entry) and
    ``/data/isco_taxonomy.index`` (the FAISS index).

    Args:
        isco_taxonomy: Sequence of dict-like entries, each providing an
            ``ESCO_DESCRIPTION`` key. Entries must be JSON-serialisable.
        model_name: Name or path of the BGE-M3 model to load.

    Raises:
        ValueError: If ``isco_taxonomy`` is empty.
        KeyError: If an entry has no ``ESCO_DESCRIPTION`` key.
    """
    if not isco_taxonomy:
        # Fail early and clearly, before loading the model; otherwise the
        # failure only surfaces later as an IndexError on embeddings.shape[1].
        raise ValueError("isco_taxonomy is empty; nothing to index")

    model = BGEM3FlagModel(
        model_name, use_fp16=True, device="cuda" if torch.cuda.is_available() else "cpu"
    )
    texts = [str(entry["ESCO_DESCRIPTION"]) for entry in isco_taxonomy]
    # Only the dense vectors are indexed, so the sparse and ColBERT outputs
    # are not requested; computing them would only cost time and memory.
    embeddings = model.encode(
        texts,
        batch_size=12,
        max_length=128,
        return_dense=True,
        return_sparse=False,
        return_colbert_vecs=False,
    )["dense_vecs"]
    embeddings = np.array(embeddings).astype("float32")
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    # Write the mapping BEFORE the index: the retrieval code treats the
    # presence of the index file as "build complete", so the index must be
    # the last artefact to appear. Otherwise an interrupted build would leave
    # an index without its mapping and would never be repaired.
    with open("/data/isco_taxonomy_mapping.json", "w", encoding="utf-8") as f:
        # JSON object keys are always strings; make that explicit because the
        # reader looks entries up with str(row_number).
        json.dump({str(i): entry for i, entry in enumerate(isco_taxonomy)}, f)
    faiss.write_index(index, "/data/isco_taxonomy.index")
42
 
43
+
44
  # Define a function to retrieve and rerank using FAISS
# Embedding models that have already been loaded, keyed by model name.
# Loading BGE-M3 is expensive, so it is done once per process rather than
# once per query.
_MODEL_CACHE = {}


def _get_embedding_model(model_name):
    """Return the BGEM3FlagModel for ``model_name``, loading it on first use."""
    model = _MODEL_CACHE.get(model_name)
    if model is None:
        model = BGEM3FlagModel(
            model_name,
            use_fp16=True,
            device="cuda" if torch.cuda.is_available() else "cpu",
        )
        _MODEL_CACHE[model_name] = model
    return model


def retrieve_and_rerank_faiss(job, model_name="BAAI/bge-m3", top_k=4):
    """Return the taxonomy entries closest to ``job``, best match first.

    The FAISS index is built on first use if ``/data/isco_taxonomy.index``
    does not exist yet. The query is embedded with the same BGE-M3 model and
    searched against the index.

    Args:
        job: Free-text job title or description to look up.
        model_name: Name or path of the BGE-M3 model used for the query.
        top_k: Maximum number of matches to return.

    Returns:
        A list of rows ``[distance, ISCO_CODE_4, ISCO_LABEL_4,
        ESCO_OCCUPATION, ESCO_DESCRIPTION]`` sorted by ascending distance.
        ``create_faiss_index`` builds an ``IndexFlatL2``, whose search returns
        squared L2 distances, so a SMALLER value means a closer match. The
        list holds fewer than ``top_k`` rows when the index is smaller than
        ``top_k``.
    """
    # Check if isco_taxonomy.index exists, if not, create it with create_faiss_index
    if not os.path.exists("/data/isco_taxonomy.index"):
        create_faiss_index(load_isco_taxonomy("isco_taxonomy.jsonl"))

    index = faiss.read_index("/data/isco_taxonomy.index")
    with open("/data/isco_taxonomy_mapping.json", "r") as f:
        isco_taxonomy = json.load(f)

    model = _get_embedding_model(model_name)
    # Only the dense vector is searched; sparse/ColBERT outputs are not needed.
    query_embedding = model.encode(
        [job],
        max_length=128,
        return_dense=True,
        return_sparse=False,
        return_colbert_vecs=False,
    )["dense_vecs"]
    query_embedding = np.array(query_embedding).astype("float32")
    distances, indices = index.search(query_embedding, top_k)

    results = []
    for distance, idx in zip(distances[0], indices[0]):
        if idx < 0:
            # FAISS pads the result with -1 when the index holds fewer than
            # top_k vectors; there is no taxonomy entry for those slots.
            continue
        entry = isco_taxonomy[str(idx)]  # JSON keys are strings
        results.append(
            [
                float(distance),
                entry["ISCO_CODE_4"],
                entry["ISCO_LABEL_4"],
                entry["ESCO_OCCUPATION"],
                entry["ESCO_DESCRIPTION"],
            ]
        )

    # Ascending order: the value is a distance, so the best match is the
    # smallest. Sorting in descending order would list the worst match first.
    results.sort(key=lambda row: row[0])
    return results
78
+
79
+
80
+ with gr.Blocks() as demo:
81
+ with gr.Row():
82
+ text1 = gr.Textbox(label="Job")
83
+ # text2 = gr.Textbox(label="Duties")
84
+ # drop1 = gr.Dropdown([4, 6, 8, 10], label="Number of results")
85
+ btn = gr.Button("Submit")
86
+ with gr.Row():
87
+ with gr.Column(scale=1, min_width=600):
88
+
89
+ @btn.click(
90
+ inputs=text1,
91
+ outputs=gr.DataFrame(
92
+ datatype="str",
93
+ label="Results",
94
+ headers=[
95
+ "Score",
96
+ "ISCO code",
97
+ "ISCO label",
98
+ "ESCO label",
99
+ "ESCO description",
100
+ ],
101
+ ),
102
+ )
103
+ def greet(job):
104
+ return retrieve_and_rerank_faiss(job)
105
+
106
+ with gr.Accordion(label="Explanation", open=False):
107
+ gr.Markdown(
108
+ """
109
+ ### Overview of the ESCO rank and retrieve application
110
+ The ESCO rank and retrieve application developed using Gradio and the BAAI/BGE-m3 model via a FAISS vector database represents a novel approach in the realm of information retrieval, particularly in the context of occupational classifications such as the ISCO-08 standard.
111
+ This application leverages machine learning to semantically process and rank occupation-related documents based on their relevance to user-input job descriptions.
112
+
113
+ ### How the Application Works
114
+
115
+ The application is structured into several key components:
116
+
117
+ 1. **Data preparation:** The ESCO taxonomy data, which includes descriptions of various occupations and corresponding ISCO codes, is initially loaded and processed. This involves reading from a JSON Lines file, ensuring that each entry is correctly formatted and accessible for subsequent operations.
118
+
119
+ 2. **Embedding generation:** Using the BAAI/BGE-m3 model, which is optimized for multilingual information processing and retrieval tasks, embeddings (high-dimensional vector representations) are generated for each occupation description in the ESCO dataset. These embeddings capture the semantic essence of the text, allowing for meaningful comparisons between texts.
120
+
121
+ 3. **Index creation and storage:** The generated embeddings are then stored in a Faiss index. [Faiss](https://faiss.ai/) (Facebook AI Similarity Search) is an efficient library for similarity search and clustering of dense vectors. It facilitates rapid retrieval of items whose embeddings are most similar to that of a query vector (as measured by, e.g., the cosine of the angle or the Euclidean distance between two vectors).
122
+
123
+ 4. **Retrieval and Ranking:** When a user submits a job title or description of the job through the Gradio interface, the application:
124
+
125
+ - Generates an embedding for the input using the same BAAI/BGE-m3 model.
126
+ - Queries the pre-computed FAISS index to retrieve the closest occupation descriptions based on the L2 (Euclidean) distance between embeddings (the index is a FAISS `IndexFlatL2`).
127
+ - Ranks these descriptions according to their similarity scores and presents the results to the user.
128
+
129
+ ### Advantages of the rank and retrieve method
130
+
131
+ #### Enhanced relevance through semantic processing
132
+
133
+ Unlike traditional keyword-based search methods, the rank and retrieve approach uses pre-trained deep learning models to understand the context and semantics of texts.
134
+ This ensures that the results are not just syntactically but also semantically aligned with the user’s query, thereby increasing the relevance and utility of the retrieved documents.
135
+
136
+ #### Efficiency and scalability
137
+
138
+ By pre-computing embeddings and storing them in a FAISS index, the application can quickly retrieve and rank documents without the need for on-the-fly computation.
139
+ This makes the system highly efficient and scalable, capable of handling large datasets and high query volumes with minimal latency.
140
+
141
+ #### Avoidance of training on sensitive data
142
+
143
+ One significant advantage of this approach over traditional text classification models is that it does not require training on sensitive or personally identifiable information (PII).
144
+ Since the model operates solely on public domain occupational descriptions from ESCO, there is no need to train a text classification model and hence no risk of exposing personal data.
145
+ This is an important factor given the regulations around data privacy (such as GDPR in Europe) and the ethical considerations of working with PII.
146
+
147
+ #### Adaptability and Multilingual Capability
148
+
149
+ The BAAI/BGE-m3 model's multilingual capabilities mean that the application can function effectively across different languages without the need for separate models or extensive retraining.
150
+ This adaptability makes it suitable for global deployment, particularly in diverse linguistic and cultural contexts.
151
+
152
+ ### Conclusion
153
+
154
+ The rank and retrieve application showcases an advanced use of language models in information retrieval, offering a practical, efficient, and privacy-respecting solution for matching job titles (and/or descriptions) with occupational standards like ISCO-08.
155
+ """
156
+ )
157
 
158
+ demo.launch()
 
 
 
159