ESCO-bge-m3

Sleeping

App Files Files Community

danieldux commited on Apr 17

Commit

2126a12

•

1 Parent(s): 9696cd0

Update app.py

Browse files

Files changed (1) hide show

app.py +126 -29

app.py CHANGED Viewed

@@ -6,57 +6,154 @@ import json
 import faiss
 import numpy as np
 import gradio as gr
-import os
 from FlagEmbedding import BGEM3FlagModel
 # Define a function to load the ISCO taxonomy
 def load_isco_taxonomy(file_path: str) -> list:
-    with open(file_path, 'r', encoding='utf-8') as file:
         isco_data = [json.loads(line.strip()) for line in file]
     return isco_data
 # Define a function to create a FAISS index
-def create_faiss_index(isco_taxonomy, model_name='BAAI/bge-m3'):
-    model = BGEM3FlagModel(model_name, use_fp16=True)
-    texts = [str(entry['ESCO_DESCRIPTION']) for entry in isco_taxonomy]
-    embeddings = model.encode(texts, batch_size=12, max_length=256)['dense_vecs']
-    embeddings = np.array(embeddings).astype('float32')
     dimension = embeddings.shape[1]
     index = faiss.IndexFlatL2(dimension)
     index.add(embeddings)
-    faiss.write_index(index, '/data/isco_taxonomy.index')
-    with open('/data/isco_taxonomy_mapping.json', 'w') as f:
         json.dump({i: entry for i, entry in enumerate(isco_taxonomy)}, f)
 # Define a function to retrieve and rerank using FAISS
-def retrieve_and_rerank_faiss(job_duties, model_name="BAAI/bge-m3", top_k=4):
     # Check if isco_taxonomy.index exists, if not, create it with create_faiss_index
-    #  if not os.path.exists("/data/isco_taxonomy.index"):
-    isco_taxonomy = load_isco_taxonomy('isco_taxonomy.jsonl')
-    create_faiss_index(isco_taxonomy)
     index = faiss.read_index("/data/isco_taxonomy.index")
     with open("/data/isco_taxonomy_mapping.json", "r") as f:
         isco_taxonomy = json.load(f)
-    model = BGEM3FlagModel(model_name, use_fp16=True)
-    query_embedding = model.encode([job_duties], max_length=256)["dense_vecs"]
     query_embedding = np.array(query_embedding).astype("float32")
     distances, indices = index.search(query_embedding, top_k)
     results = [
-        {
-            "ISCO_CODE_4": isco_taxonomy[str(idx)]["ISCO_CODE_4"],
-            "ISCO_LABEL_4": isco_taxonomy[str(idx)]["ISCO_LABEL_4"],
-            "ESCO_OCCUPATION": isco_taxonomy[str(idx)]["ESCO_OCCUPATION"],
-            "ESCO_DESCRIPTION": isco_taxonomy[str(idx)]["ESCO_DESCRIPTION"],
-            "Similarity": distances[0][i]
-        }
         for i, idx in enumerate(indices[0])
     ]
-    return results
-# Gradio Interface
-def gradio_interface(job_duties):
-    results = retrieve_and_rerank_faiss(job_duties)
-    return results
-iface = gr.Interface(fn=gradio_interface, inputs="text", outputs=gr.outputs.Dataframe(type="pandas"), title="Semantic similarity matches with ESCO descriptions")
-iface.launch()

 import faiss
 import numpy as np
 import gradio as gr
+import torch
 from FlagEmbedding import BGEM3FlagModel
+import os
 # Define a function to load the ISCO taxonomy
 def load_isco_taxonomy(file_path: str) -> list:
+    with open(file_path, "r", encoding="utf-8") as file:
         isco_data = [json.loads(line.strip()) for line in file]
     return isco_data
 # Define a function to create a FAISS index
+def create_faiss_index(isco_taxonomy, model_name="BAAI/bge-m3"):
+    model = BGEM3FlagModel(
+        model_name, use_fp16=True, device="cuda" if torch.cuda.is_available() else "cpu"
+    )
+    texts = [str(entry["ESCO_DESCRIPTION"]) for entry in isco_taxonomy]
+    embeddings = model.encode(
+        texts,
+        batch_size=12,
+        max_length=128,
+        return_dense=True,
+        return_sparse=True,
+        return_colbert_vecs=True,
+    )["dense_vecs"]
+    embeddings = np.array(embeddings).astype("float32")
     dimension = embeddings.shape[1]
     index = faiss.IndexFlatL2(dimension)
     index.add(embeddings)
+    faiss.write_index(index, "/data/isco_taxonomy.index")
+    with open("/data/isco_taxonomy_mapping.json", "w") as f:
         json.dump({i: entry for i, entry in enumerate(isco_taxonomy)}, f)
 # Define a function to retrieve and rerank using FAISS
+def retrieve_and_rerank_faiss(job, model_name="BAAI/bge-m3", top_k=4):
     # Check if isco_taxonomy.index exists, if not, create it with create_faiss_index
+    if not os.path.exists("/data/isco_taxonomy.index"):
+        isco_taxonomy = load_isco_taxonomy("isco_taxonomy.jsonl")
+        create_faiss_index(isco_taxonomy)
     index = faiss.read_index("/data/isco_taxonomy.index")
     with open("/data/isco_taxonomy_mapping.json", "r") as f:
         isco_taxonomy = json.load(f)
+    model = BGEM3FlagModel(
+        model_name, use_fp16=True, device="cuda" if torch.cuda.is_available() else "cpu"
+    )
+    query_embedding = model.encode(
+        [job],
+        max_length=128,
+        return_dense=True,
+        return_sparse=True,
+        return_colbert_vecs=True,
+    )["dense_vecs"]
     query_embedding = np.array(query_embedding).astype("float32")
     distances, indices = index.search(query_embedding, top_k)
+    # top_documents = [isco_taxonomy[str(idx)] for idx in indices[0]]
     results = [
+        [
+            float(distances[0][i]),
+            isco_taxonomy[str(idx)]["ISCO_CODE_4"],
+            isco_taxonomy[str(idx)]["ISCO_LABEL_4"],
+            isco_taxonomy[str(idx)]["ESCO_OCCUPATION"],
+            isco_taxonomy[str(idx)]["ESCO_DESCRIPTION"],
+        ]
         for i, idx in enumerate(indices[0])
     ]
+    ranked_results = sorted(results, key=lambda x: x[0], reverse=True)
+    return ranked_results
+# Build the UI: a text input and a submit button on the first row, the results
+# table on the second row.
+with gr.Blocks() as demo:
+    with gr.Row():
+        text1 = gr.Textbox(label="Job")
+        # text2 = gr.Textbox(label="Duties")
+        # drop1 = gr.Dropdown([4, 6, 8, 10], label="Number of results")
+        btn = gr.Button("Submit")
+    with gr.Row():
+        with gr.Column(scale=1, min_width=600):
+            # Register greet as the button's click handler. The output DataFrame
+            # is constructed inline here, within this Column, and its headers
+            # follow the column order of the rows returned by
+            # retrieve_and_rerank_faiss.
+            @btn.click(
+                inputs=text1,
+                outputs=gr.DataFrame(
+                    datatype="str",
+                    label="Results",
+                    headers=[
+                        "Score",
+                        "ISCO code",
+                        "ISCO label",
+                        "ESCO label",
+                        "ESCO description",
+                    ],
+                ),
+            )
+            def greet(job):
+                """Return the result of retrieve_and_rerank_faiss for the submitted text."""
+                return retrieve_and_rerank_faiss(job)
+    with gr.Accordion(label="Explanation", open=False):
+        gr.Markdown(
+            """
+            ### Overview of the ESCO rank and retrieve application
+            The ESCO rank and retrieve application developed using Gradio and the BAAI/BGE-m3 model via a FAISS vector database represents a novel approach in the realm of information retrieval, particularly in the context of occupational classifications such as the ISCO-08 standard.
+            This application leverages machine learning to semantically process and rank occupation-related documents based on their relevance to user-input job descriptions.
+            ### How the Application Works
+            The application is structured into several key components:
+            1. **Data preparation:** The ESCO taxonomy data, which includes descriptions of various occupations and corresponding ISCO codes, is initially loaded and processed. This involves reading from a JSON Lines file, ensuring that each entry is correctly formatted and accessible for subsequent operations.
+            2. **Embedding generation:** Using the BAAI/BGE-m3 model, which is optimized for multilingual information processing and retrieval tasks, embeddings (high-dimensional vector representations) are generated for each occupation description in the ESCO dataset. These embeddings capture the semantic essence of the text, allowing for meaningful comparisons between texts.
+            3. **Index creation and storage:** The generated embeddings are then stored in a Faiss index. [Faiss](https://faiss.ai/) (Facebook AI Similarity Search) is an efficient library for similarity search and clustering of dense vectors. It facilitates rapid retrieval of items whose embeddings are most similar to that of a query vector, as measured by, e.g., the cosine of the angle or the Euclidean distance between the two vectors.
+            4. **Retrieval and Ranking:** When a user submits a job title or description of the job through the Gradio interface, the application:
+                - Generates an embedding for the input using the same BAAI/BGE-m3 model.
+                - Queries the pre-computed FAISS index to retrieve the closest occupation descriptions based on the L2 (Euclidean) distance between embeddings.
+                - Ranks these descriptions according to their similarity scores and presents the results to the user.
+            ### Advantages of the rank and retrieve method
+            #### Enhanced relevance through semantic processing
+            Unlike traditional keyword-based search methods, the rank and retrieve approach uses pre-trained deep learning models to understand the context and semantics of texts.
+            This ensures that the results are not just syntactically but also semantically aligned with the user’s query, thereby increasing the relevance and utility of the retrieved documents.
+            #### Efficiency and scalability
+            By pre-computing embeddings and storing them in a FAISS index, the application can quickly retrieve and rank documents without the need for on-the-fly computation.
+            This makes the system highly efficient and scalable, capable of handling large datasets and high query volumes with minimal latency.
+            #### Avoidance of training on sensitive data
+            One significant advantage of this approach over traditional text classification models is that it does not require training on sensitive or personally identifiable information (PII).
+            Since the model operates solely on public domain occupational descriptions from ESCO, there is no need to train a text classification model and hence no risk of exposing personal data.
+            This is an important factor given the regulations around data privacy (such as GDPR in Europe) and the ethical considerations of working with PII.
+            #### Adaptability and Multilingual Capability
+            The BAAI/BGE-m3 model's multilingual capabilities mean that the application can function effectively across different languages without the need for separate models or extensive retraining.
+            This adaptability makes it suitable for global deployment, particularly in diverse linguistic and cultural contexts.
+            ### Conclusion
+            The rank and retrieve application showcases an advanced use of language models in information retrieval, offering a practical, efficient, and privacy-respecting solution for matching job titles (and/or descriptions) with occupational standards like ISCO-08.
+            """
+        )
+demo.launch()