{"metadata":{"accelerator":"GPU","colab":{"gpuType":"T4","provenance":[]},"gpuClass":"standard","kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[{"sourceId":7571253,"sourceType":"datasetVersion","datasetId":4407676},{"sourceId":7678915,"sourceType":"datasetVersion","datasetId":4479814},{"sourceId":7713636,"sourceType":"datasetVersion","datasetId":4504654},{"sourceId":7964016,"sourceType":"datasetVersion","datasetId":4685329},{"sourceId":8017122,"sourceType":"datasetVersion","datasetId":4723613}],"dockerImageVersionId":30684,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"

Benchmark 2: Gemini-1.5-Pro, Gemini-Pro, OpenHermes-Mistral and Mistral-7B

\n","metadata":{}},{"cell_type":"markdown","source":"
\n

\n Notebook Gool\n

\n

T\nThe objective of this notebook is to evaluate the performance of Gemini-1.5-Pro, Gemini-Pro, OpenHermes, and Mistral-7B using the Table-extract Benchmark dataset available at Hugging Face.

\n
\n","metadata":{}},{"cell_type":"markdown","source":"#
Table of Content
\n\n* [I. Loading and Importing Libraries](#1)\n* [II. Definition and Implementation of Metrics](#2)\n* [III. Clean Response Obtained by LLM](#3)\n* [IV. Data Preparation](#5)\n* [V. Benchmark](#6)\n * [Prompt](#61)\n * [Gemini-1.5-Pro-latest](#62)\n * [Gemini-Pro](#63)\n * [OpenHermes-Mistral](#64)\n * [ Mistral-7B-Instruct-v0.2](#65)","metadata":{}},{"cell_type":"markdown","source":"\n#
I | Loading and Importing Libraries
\n","metadata":{}},{"cell_type":"code","source":"%%capture\n!pip install google-generativeai\n!pip install --upgrade pip\n!pip install bitsandbytes\n!pip install transformers","metadata":{"execution":{"iopub.status.busy":"2024-04-16T13:12:19.002598Z","iopub.execute_input":"2024-04-16T13:12:19.003425Z","iopub.status.idle":"2024-04-16T13:13:28.478550Z","shell.execute_reply.started":"2024-04-16T13:12:19.003393Z","shell.execute_reply":"2024-04-16T13:13:28.477476Z"},"trusted":true},"execution_count":1,"outputs":[]},{"cell_type":"code","source":"import re\nimport json\nfrom tqdm import tqdm\nimport pandas as pd\nfrom datasets import load_dataset, Dataset\nfrom wand.image import Image as WImage\nimport torch\nimport pandas as pd\nfrom transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\nimport time \nimport random\nimport numpy as np","metadata":{"execution":{"iopub.status.busy":"2024-04-16T13:13:28.480451Z","iopub.execute_input":"2024-04-16T13:13:28.480794Z","iopub.status.idle":"2024-04-16T13:13:37.393188Z","shell.execute_reply.started":"2024-04-16T13:13:28.480765Z","shell.execute_reply":"2024-04-16T13:13:37.392033Z"},"trusted":true},"execution_count":2,"outputs":[]},{"cell_type":"code","source":"import google.generativeai as genai\nimport time \ngenai.configure(api_key=\"AIzaSyAhz9UBzkEIYI886zZRm40qqB1Kd_9Y4-0\")","metadata":{"execution":{"iopub.status.busy":"2024-04-16T13:13:37.394787Z","iopub.execute_input":"2024-04-16T13:13:37.395288Z","iopub.status.idle":"2024-04-16T13:13:38.039962Z","shell.execute_reply.started":"2024-04-16T13:13:37.395257Z","shell.execute_reply":"2024-04-16T13:13:38.039067Z"},"trusted":true},"execution_count":3,"outputs":[]},{"cell_type":"code","source":"# Set random seed for reproducibility\nrandom.seed(42)\nnp.random.seed(42)\ntorch.manual_seed(42)\ntorch.cuda.manual_seed_all(42)\ntorch.backends.cudnn.deterministic = True\ntorch.backends.cudnn.benchmark = False","metadata":{"execution":{"iopub.status.busy":"2024-04-16T13:13:38.041765Z","iopub.execute_input":"2024-04-16T13:13:38.042332Z","iopub.status.idle":"2024-04-16T13:13:38.050925Z","shell.execute_reply.started":"2024-04-16T13:13:38.042303Z","shell.execute_reply":"2024-04-16T13:13:38.049937Z"},"trusted":true},"execution_count":4,"outputs":[]},{"cell_type":"markdown","source":"\n#
II | Definition and Implementation of Metrics
\nSo, let's begin by providing an example of the example output.","metadata":{}},{"cell_type":"code","source":"desired_output = [{'aircraft': 'robinson r - 22',\n 'description': 'light utility helicopter',\n 'max gross weight': '1370 lb (635 kg)',\n 'total disk area': '497 ft square (46.2 m square)',\n 'max disk loading': '2.6 lb / ft square (14 kg / m square)'},\n {'aircraft': 'bell 206b3 jetranger',\n 'description': 'turboshaft utility helicopter',\n 'max gross weight': '3200 lb (1451 kg)',\n 'total disk area': '872 ft square (81.1 m square)',\n 'max disk loading': '3.7 lb / ft square (18 kg / m square)'},\n {'aircraft': 'ch - 47d chinook',\n 'description': 'tandem rotor helicopter',\n 'max gross weight': '50000 lb (22680 kg)',\n 'total disk area': '5655 ft square (526 m square)',\n 'max disk loading': '8.8 lb / ft square (43 kg / m square)'},\n {'aircraft': 'mil mi - 26',\n 'description': 'heavy - lift helicopter',\n 'max gross weight': '123500 lb (56000 kg)',\n 'total disk area': '8495 ft square (789 m square)',\n 'max disk loading': '14.5 lb / ft square (71 kg / m square)'},\n {'aircraft': 'ch - 53e super stallion',\n 'description': 'heavy - lift helicopter',\n 'max gross weight': '73500 lb (33300 kg)',\n 'total disk area': '4900 ft square (460 m square)',\n 'max disk loading': '15 lb / ft square (72 kg / m square)'}]\n","metadata":{"execution":{"iopub.status.busy":"2024-04-16T13:24:44.748719Z","iopub.execute_input":"2024-04-16T13:24:44.749654Z","iopub.status.idle":"2024-04-16T13:24:44.756340Z","shell.execute_reply.started":"2024-04-16T13:24:44.749619Z","shell.execute_reply":"2024-04-16T13:24:44.755387Z"},"trusted":true},"execution_count":5,"outputs":[]},{"cell_type":"markdown","source":"To compare between the expected list of records and the predicted list of records, we first need to verify the percentage of predicted keys relative to the desired keys","metadata":{}},{"cell_type":"markdown","source":">## Percentage of predicted keys","metadata":{}},{"cell_type":"markdown","source":"Let's begin by defining a function to retrieve all keys of record","metadata":{}},{"cell_type":"code","source":"def get_keys(d):\n # Iterate over each key-value pair in the dictionary\n for k, v in d.items():\n # Append the key to the list of all_keys\n all_keys.append(k)\n # If the value is a dictionary, recursively call get_keys\n if isinstance(v, dict):\n get_keys(v)\n # If the value is a list, iterate over each item\n elif isinstance(v, list):\n for item in v:\n # If the item is a dictionary, recursively call get_keys\n if isinstance(item, dict):\n get_keys(item)\n# Define a function to retrieve all unique keys from a nested dictionary\ndef get_all_keys(d):\n # Declare all_keys as a global variable\n global all_keys\n # Initialize all_keys as an empty list\n all_keys = []\n # Call the helper function get_keys to populate all_keys\n get_keys(d)\n # Return a list containing the unique keys by converting all_keys to a set and then back to a list\n return list(set(all_keys))","metadata":{"execution":{"iopub.status.busy":"2024-04-16T13:24:47.352071Z","iopub.execute_input":"2024-04-16T13:24:47.352417Z","iopub.status.idle":"2024-04-16T13:24:47.359151Z","shell.execute_reply.started":"2024-04-16T13:24:47.352392Z","shell.execute_reply":"2024-04-16T13:24:47.358216Z"},"trusted":true},"execution_count":6,"outputs":[]},{"cell_type":"code","source":"# Testing our function\nget_all_keys(desired_output[0])","metadata":{"execution":{"iopub.status.busy":"2024-04-16T09:18:57.764316Z","iopub.execute_input":"2024-04-16T09:18:57.765162Z","iopub.status.idle":"2024-04-16T09:18:57.772892Z","shell.execute_reply.started":"2024-04-16T09:18:57.765132Z","shell.execute_reply":"2024-04-16T09:18:57.771802Z"},"trusted":true},"execution_count":7,"outputs":[{"execution_count":7,"output_type":"execute_result","data":{"text/plain":"['max disk loading',\n 'total disk area',\n 'aircraft',\n 'max gross weight',\n 'description']"},"metadata":{}}]},{"cell_type":"markdown","source":"Now, we define the percentage of predicted keys as follows:\n\n$$\\Large \\text{Percentage of predicted keys} = \\frac{\\text{Number of correctly predicted keys}}{\\text{Total number of true keys}}$$\nThis percentage is calculated for every record in the list, then summed and divided by the number of records in the list.","metadata":{}},{"cell_type":"code","source":"def process_dict(data):\n if isinstance(data, dict):\n for key, value in data.items():\n if isinstance(value, str):\n data[key] = value.strip().lower()\n elif isinstance(value, list):\n data[key] = [process_dict(item) for item in value]\n elif isinstance(value, dict):\n data[key] = process_dict(value)\n return data","metadata":{"execution":{"iopub.status.busy":"2024-04-16T13:25:02.562198Z","iopub.execute_input":"2024-04-16T13:25:02.562934Z","iopub.status.idle":"2024-04-16T13:25:02.569006Z","shell.execute_reply.started":"2024-04-16T13:25:02.562903Z","shell.execute_reply":"2024-04-16T13:25:02.567897Z"},"trusted":true},"execution_count":7,"outputs":[]},{"cell_type":"code","source":"def percentage_of_predicted_keys(true_dic, pred_dic):\n true_dic=process_dict(true_dic)\n pred_dic=process_dict(pred_dic)\n # Get all keys of the true dictionary\n all_keys_of_true_dic = get_all_keys(true_dic)\n # Get all keys of the predicted dictionary\n all_keys_of_pred_dic = get_all_keys(pred_dic)\n \n # Check if there are no keys in the true dictionary to avoid division by zero\n if len(all_keys_of_true_dic) == 0:\n return 0 # Avoid division by zero\n \n # Initialize count of predicted keys\n p_keys = 0\n # Iterate through all keys in the predicted dictionary\n for key in all_keys_of_pred_dic:\n # Check if the key is also present in the true dictionary\n if key in all_keys_of_true_dic:\n # Increment count if the key is found in both dictionaries\n p_keys += 1\n \n # Calculate the percentage of predicted keys compared to true keys\n p_keys /= len(all_keys_of_true_dic)\n # Return the percentage of predicted keys\n return p_keys","metadata":{"execution":{"iopub.status.busy":"2024-04-16T13:25:04.738122Z","iopub.execute_input":"2024-04-16T13:25:04.739060Z","iopub.status.idle":"2024-04-16T13:25:04.745531Z","shell.execute_reply.started":"2024-04-16T13:25:04.739023Z","shell.execute_reply":"2024-04-16T13:25:04.744305Z"},"trusted":true},"execution_count":8,"outputs":[]},{"cell_type":"code","source":"def average_percentage_key(true_list, pred_list):\n min_length = min(len(true_list), len(pred_list)) # Find the minimum length of the two lists\n score = 0\n for i in range(min_length):\n score += percentage_of_predicted_keys(true_list[i], pred_list[i])\n return score / len(true_list)","metadata":{"execution":{"iopub.status.busy":"2024-04-16T13:25:06.245917Z","iopub.execute_input":"2024-04-16T13:25:06.246241Z","iopub.status.idle":"2024-04-16T13:25:06.251418Z","shell.execute_reply.started":"2024-04-16T13:25:06.246218Z","shell.execute_reply":"2024-04-16T13:25:06.250442Z"},"trusted":true},"execution_count":9,"outputs":[]},{"cell_type":"code","source":"# Example true and predicted lists\ntrue_list = [{'key1': 1, 'key2': 2, 'key3': 3}, {'key1': 4, 'key2': 5, 'key3': 6}, {'key1': 7, 'key2': 8, 'key3': 9}]\npred_list = [{'key1': 1, 'key2': 2, 'key3': 3}, {'key1': 4, 'key2': 5, 'key3': 7}, {'key1': 7, 'key2': 8, 'key3': 9}]\n\n# Test the function\nresult = average_percentage_key(true_list, pred_list)\nprint(\"Average percentage of keys:\", result)","metadata":{"execution":{"iopub.status.busy":"2024-04-16T13:25:08.513042Z","iopub.execute_input":"2024-04-16T13:25:08.513694Z","iopub.status.idle":"2024-04-16T13:25:08.520278Z","shell.execute_reply.started":"2024-04-16T13:25:08.513663Z","shell.execute_reply":"2024-04-16T13:25:08.519432Z"},"trusted":true},"execution_count":10,"outputs":[{"name":"stdout","text":"Average percentage of keys: 1.0\n","output_type":"stream"}]},{"cell_type":"markdown","source":"Now we will define the principal metrics used to compare the values of two list recods.","metadata":{}},{"cell_type":"markdown","source":">## Percentage of predicted values\n\nThe function calculates the percentage of correctly predicted values compared to the total number of true values across different types of data structures.\n\nThe formula for calculating the percentage of values is as follows:\n\n$$\n\\text{Average percentage of values} = \\frac{\\sum_{i=1}^{\\text{Total number of records}} p_i }{Total number of records}\n$$\n\nHere, $p_i$ represents the percentage of correctly predicted values for each key. It's calculated as:\n\n$$p_i = \\frac{\\text{Number of correctly predicted values of item i}}{\\text{Total number of true values of item i}}$$","metadata":{}},{"cell_type":"code","source":"def calculate_percentage_of_values(true_dic, pred_dic):\n total_percentage = 0 # Initialize total percentage\n # Type 1: Single string values\n for key, true_value in true_dic.items(): # Loop through key-value pairs in true_dic\n \n # Check if the key exists in pred_dic, if its value is a string and if it matches the true value\n if key in pred_dic and str(pred_dic[key]) == str(true_value):\n match = 1 # Assign perfect match\n else:\n match = 0 # Assign no match\n total_percentage += match\n return total_percentage / len(true_dic) # Calculate and return the average percentage","metadata":{"execution":{"iopub.status.busy":"2024-04-16T13:25:11.264721Z","iopub.execute_input":"2024-04-16T13:25:11.265579Z","iopub.status.idle":"2024-04-16T13:25:11.271268Z","shell.execute_reply.started":"2024-04-16T13:25:11.265546Z","shell.execute_reply":"2024-04-16T13:25:11.270222Z"},"trusted":true},"execution_count":11,"outputs":[]},{"cell_type":"code","source":"def average_percentage_value(true_list, pred_list):\n min_length = min(len(true_list), len(pred_list)) # Find the minimum length of the two lists\n score = 0\n for i in range(min_length):\n score += calculate_percentage_of_values(true_list[i], pred_list[i])\n return score / len(true_list)","metadata":{"execution":{"iopub.status.busy":"2024-04-16T13:25:12.845015Z","iopub.execute_input":"2024-04-16T13:25:12.845667Z","iopub.status.idle":"2024-04-16T13:25:12.850686Z","shell.execute_reply.started":"2024-04-16T13:25:12.845635Z","shell.execute_reply":"2024-04-16T13:25:12.849668Z"},"trusted":true},"execution_count":12,"outputs":[]},{"cell_type":"code","source":"# Example true and predicted lists\ntrue_list = [{'key1': 1, 'key2': 2, 'key3': 3}, {'key1': 4, 'key2': 5, 'key3': 6}, {'key1': 7, 'key2': 8, 'key3': 9}]\npred_list = [{'key1': 1, 'key2': 2, 'key3': 3}, {'key1': 4, 'key2': 5, 'key3': 7}, {'key1': 7, 'key2': 8, 'key3': 9}]\n\n# Test the function\nresult = average_percentage_value(true_list, pred_list)\nprint(\"Average percentage of keys:\", result)","metadata":{"execution":{"iopub.status.busy":"2024-04-16T13:25:14.665414Z","iopub.execute_input":"2024-04-16T13:25:14.665787Z","iopub.status.idle":"2024-04-16T13:25:14.672461Z","shell.execute_reply.started":"2024-04-16T13:25:14.665760Z","shell.execute_reply":"2024-04-16T13:25:14.671470Z"},"trusted":true},"execution_count":13,"outputs":[{"name":"stdout","text":"Average percentage of keys: 0.8888888888888888\n","output_type":"stream"}]},{"cell_type":"markdown","source":"\n#
III | Clean Response Obtained by LLM
\n","metadata":{}},{"cell_type":"code","source":"import json\n\ndef parse_json(data_str):\n # Remove leading/trailing whitespace and newlines\n data_str = data_str.strip()\n\n # Check if the string is enclosed within triple backticks (\"```json\" and \"```\")\n if data_str.startswith(\"```json\") and data_str.endswith(\"```\"):\n # Remove the leading/trailing \"```json\" and \"```\"\n data_str = data_str[len(\"```json\"): -len(\"```\")]\n\n try:\n # Parse JSON\n data = json.loads(data_str)\n return data\n except json.JSONDecodeError as e:\n print(\"JSON parsing error:\", e)\n return None","metadata":{"execution":{"iopub.status.busy":"2024-04-16T13:25:16.641838Z","iopub.execute_input":"2024-04-16T13:25:16.642200Z","iopub.status.idle":"2024-04-16T13:25:16.648128Z","shell.execute_reply.started":"2024-04-16T13:25:16.642172Z","shell.execute_reply":"2024-04-16T13:25:16.647185Z"},"trusted":true},"execution_count":14,"outputs":[]},{"cell_type":"code","source":"response_str = \"\"\"[{\"aircraft\": \"robinson r - 22\",\n \"description\": \"light utility helicopter\",\n \"max gross weight\": \"1370 lb (635 kg)\",\n \"total disk area\": \"497 ft square (46.2 m square)\",\n \"max disk loading\": \"2.6 lb / ft square (14 kg / m square)\"},\n{\"aircraft\": \"bell 206b3 jetranger\",\n \"description\": \"turboshaft utility helicopter\",\n \"max gross weight\": \"3200 lb (1451 kg)\",\n \"total disk area\": \"872 ft square (81.1 m square)\",\n \"max disk loading\": \"3.7 lb / ft square (18 kg / m square)\"},\n{\"aircraft\": \"ch - 47d chinook\",\n \"description\": \"tandem rotor helicopter\",\n \"max gross weight\": \"50000 lb (22680 kg)\",\n \"total disk area\": \"5655 ft square (526 m square)\",\n \"max disk loading\": \"8.8 lb / ft square (43 kg / m square)\"},\n{\"aircraft\": \"mil mi - 26\",\n \"description\": \"heavy - lift helicopter\",\n \"max gross weight\": \"123500 lb (56000 kg)\",\n \"total disk area\": \"8495 ft square (789 m square)\",\n \"max disk loading\": \"14.5 lb / ft square (71 kg / m square)\"},\n{\"aircraft\": \"ch - 53e super stallion\",\n \"description\": \"heavy - lift helicopter\",\n \"max gross weight\": \"73500 lb (33300 kg)\",\n \"total disk area\": \"4900 ft square (460 m square)\",\n \"max disk loading\": \"15 lb / ft square (72 kg / m square)\"}]\"\"\"\n\n# Convert the string representation to a list of dictionaries\nparse_json(response_str)","metadata":{"execution":{"iopub.status.busy":"2024-04-16T13:25:18.225851Z","iopub.execute_input":"2024-04-16T13:25:18.226646Z","iopub.status.idle":"2024-04-16T13:25:18.236378Z","shell.execute_reply.started":"2024-04-16T13:25:18.226611Z","shell.execute_reply":"2024-04-16T13:25:18.235495Z"},"trusted":true},"execution_count":15,"outputs":[{"execution_count":15,"output_type":"execute_result","data":{"text/plain":"[{'aircraft': 'robinson r - 22',\n 'description': 'light utility helicopter',\n 'max gross weight': '1370 lb (635 kg)',\n 'total disk area': '497 ft square (46.2 m square)',\n 'max disk loading': '2.6 lb / ft square (14 kg / m square)'},\n {'aircraft': 'bell 206b3 jetranger',\n 'description': 'turboshaft utility helicopter',\n 'max gross weight': '3200 lb (1451 kg)',\n 'total disk area': '872 ft square (81.1 m square)',\n 'max disk loading': '3.7 lb / ft square (18 kg / m square)'},\n {'aircraft': 'ch - 47d chinook',\n 'description': 'tandem rotor helicopter',\n 'max gross weight': '50000 lb (22680 kg)',\n 'total disk area': '5655 ft square (526 m square)',\n 'max disk loading': '8.8 lb / ft square (43 kg / m square)'},\n {'aircraft': 'mil mi - 26',\n 'description': 'heavy - lift helicopter',\n 'max gross weight': '123500 lb (56000 kg)',\n 'total disk area': '8495 ft square (789 m square)',\n 'max disk loading': '14.5 lb / ft square (71 kg / m square)'},\n {'aircraft': 'ch - 53e super stallion',\n 'description': 'heavy - lift helicopter',\n 'max gross weight': '73500 lb (33300 kg)',\n 'total disk area': '4900 ft square (460 m square)',\n 'max disk loading': '15 lb / ft square (72 kg / m square)'}]"},"metadata":{}}]},{"cell_type":"markdown","source":"\n#
IV | Data Preparation
\n","metadata":{}},{"cell_type":"markdown","source":"I'll extract a sample of 100 records from the dataset excluding those with Arabic names, and then simplify the output to enhance performance.","metadata":{}},{"cell_type":"code","source":"df = pd.read_csv(\"/kaggle/input/table-extraction/table_extract.csv\")\ndf.head(5)","metadata":{"execution":{"iopub.status.busy":"2024-04-16T13:25:21.195467Z","iopub.execute_input":"2024-04-16T13:25:21.196506Z","iopub.status.idle":"2024-04-16T13:25:22.639971Z","shell.execute_reply.started":"2024-04-16T13:25:21.196467Z","shell.execute_reply":"2024-04-16T13:25:22.638999Z"},"trusted":true},"execution_count":16,"outputs":[{"execution_count":16,"output_type":"execute_result","data":{"text/plain":" context \\\n0 aircraft ... \n1 order year manufacturer mod... \n2 player no nationality ... \n3 player no nationali... \n4 player no nationality ... \n\n answer \n0 {\"aircraft\":{\"0\":\"robinson r - 22\",\"1\":\"bell 2... \n1 {\"order year\":{\"0\":\"1992 - 93\",\"1\":\"1996\",\"2\":... \n2 {\"player\":{\"0\":\"quincy acy\",\"1\":\"hassan adams\"... \n3 {\"player\":{\"0\":\"patrick o'bryant\",\"1\":\"jermain... \n4 {\"player\":{\"0\":\"mark baker\",\"1\":\"marcus banks\"... ","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
contextanswer
0aircraft ...{\"aircraft\":{\"0\":\"robinson r - 22\",\"1\":\"bell 2...
1order year manufacturer mod...{\"order year\":{\"0\":\"1992 - 93\",\"1\":\"1996\",\"2\":...
2player no nationality ...{\"player\":{\"0\":\"quincy acy\",\"1\":\"hassan adams\"...
3player no nationali...{\"player\":{\"0\":\"patrick o'bryant\",\"1\":\"jermain...
4player no nationality ...{\"player\":{\"0\":\"mark baker\",\"1\":\"marcus banks\"...
\n
"},"metadata":{}}]},{"cell_type":"code","source":"def is_arabic_name(name):\n \"\"\"\n Checks if a name contains Arabic characters.\n\n Args:\n name: The name string to check.\n\n Returns:\n True if Arabic characters are found, False otherwise.\n \"\"\"\n # Regular expression to match Arabic characters\n arabic_pattern = re.compile(\"[\\u0600-\\u06FF]+\")\n\n # Search for Arabic characters in the name\n match = arabic_pattern.search(name)\n\n # Return True if a match is found, False otherwise\n return bool(match)","metadata":{"execution":{"iopub.status.busy":"2024-04-16T13:25:22.641598Z","iopub.execute_input":"2024-04-16T13:25:22.642000Z","iopub.status.idle":"2024-04-16T13:25:22.647637Z","shell.execute_reply.started":"2024-04-16T13:25:22.641973Z","shell.execute_reply":"2024-04-16T13:25:22.646545Z"},"trusted":true},"execution_count":17,"outputs":[]},{"cell_type":"code","source":"df = df[~df['context'].apply(lambda x: is_arabic_name(x))]","metadata":{"execution":{"iopub.status.busy":"2024-04-16T13:25:24.520859Z","iopub.execute_input":"2024-04-16T13:25:24.521712Z","iopub.status.idle":"2024-04-16T13:25:25.489780Z","shell.execute_reply.started":"2024-04-16T13:25:24.521677Z","shell.execute_reply":"2024-04-16T13:25:25.488506Z"},"trusted":true},"execution_count":18,"outputs":[]},{"cell_type":"code","source":"df_sample =df.loc[:100]","metadata":{"execution":{"iopub.status.busy":"2024-04-16T13:25:26.231506Z","iopub.execute_input":"2024-04-16T13:25:26.232186Z","iopub.status.idle":"2024-04-16T13:25:26.239060Z","shell.execute_reply.started":"2024-04-16T13:25:26.232150Z","shell.execute_reply":"2024-04-16T13:25:26.237913Z"},"trusted":true},"execution_count":19,"outputs":[]},{"cell_type":"code","source":"def transform_json_to_records(json_data):\n \"\"\"\n Transforms a structured JSON object into a list of records.\n\n The function assumes the structure of the JSON object is a dictionary of dictionaries,\n where each top-level key is a field name, and its value is a dictionary mapping indices\n to field values. All sub-dictionaries must have the same keys.\n\n Parameters:\n - json_data: A dictionary representing the structured JSON object to transform.\n\n Returns:\n - A list of dictionaries, where each dictionary represents a record with fields and values\n derived from the input JSON.\n \"\"\"\n json_data = json.loads(json_data)\n # Extract keys from the first dictionary item to use as indices\n indices = list(next(iter(json_data.values())).keys())\n # Initialize the list to store transformed records\n records = []\n\n # Loop over each index to create a record\n for index in indices:\n record = {field: values[index] for field, values in json_data.items()}\n records.append(record)\n\n return records","metadata":{"execution":{"iopub.status.busy":"2024-04-16T13:25:27.953737Z","iopub.execute_input":"2024-04-16T13:25:27.954385Z","iopub.status.idle":"2024-04-16T13:25:27.961373Z","shell.execute_reply.started":"2024-04-16T13:25:27.954351Z","shell.execute_reply":"2024-04-16T13:25:27.960387Z"},"trusted":true},"execution_count":20,"outputs":[]},{"cell_type":"code","source":"df_sample.loc[:, 'answer'] = df_sample['answer'].map(transform_json_to_records)","metadata":{"execution":{"iopub.status.busy":"2024-04-16T13:25:31.129263Z","iopub.execute_input":"2024-04-16T13:25:31.130144Z","iopub.status.idle":"2024-04-16T13:25:31.142875Z","shell.execute_reply.started":"2024-04-16T13:25:31.130106Z","shell.execute_reply":"2024-04-16T13:25:31.141806Z"},"trusted":true},"execution_count":21,"outputs":[]},{"cell_type":"code","source":"df_sample.head()","metadata":{"execution":{"iopub.status.busy":"2024-04-16T13:25:32.664227Z","iopub.execute_input":"2024-04-16T13:25:32.664981Z","iopub.status.idle":"2024-04-16T13:25:32.698044Z","shell.execute_reply.started":"2024-04-16T13:25:32.664945Z","shell.execute_reply":"2024-04-16T13:25:32.697121Z"},"trusted":true},"execution_count":22,"outputs":[{"execution_count":22,"output_type":"execute_result","data":{"text/plain":" context \\\n0 aircraft ... \n1 order year manufacturer mod... \n2 player no nationality ... \n3 player no nationali... \n4 player no nationality ... \n\n answer \n0 [{'aircraft': 'robinson r - 22', 'description'... \n1 [{'order year': '1992 - 93', 'manufacturer': '... \n2 [{'player': 'quincy acy', 'no': '4', 'national... \n3 [{'player': 'patrick o'bryant', 'no': 13, 'nat... \n4 [{'player': 'mark baker', 'no': '3', 'national... ","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
contextanswer
0aircraft ...[{'aircraft': 'robinson r - 22', 'description'...
1order year manufacturer mod...[{'order year': '1992 - 93', 'manufacturer': '...
2player no nationality ...[{'player': 'quincy acy', 'no': '4', 'national...
3player no nationali...[{'player': 'patrick o'bryant', 'no': 13, 'nat...
4player no nationality ...[{'player': 'mark baker', 'no': '3', 'national...
\n
"},"metadata":{}}]},{"cell_type":"markdown","source":"\n#
V | Benchmark
\n","metadata":{}},{"cell_type":"markdown","source":"\n>## Prompt","metadata":{}},{"cell_type":"code","source":"prompt = \"\"\"Your task is to extract relevant information from the provided context and format it into a list of records, following the template below.\n A JSON object representing the extracted table structure. The list of records follows this format: \n [ { \"column_1\": \"val1\",\"column_2\": \"val1\",\"column_3\": \"val1\",...},\n { \"column_1\": \"val2\",\"column_2\": \"val2\",\"column_3\": \"val3\",...},\n ...\n ]\n Each key in the records represents a column header, and the corresponding value is another object containing key-value pairs for each row in that column.\n\nINPUT example:\n# do not use the data from the examples & template; they are just for reference only. The following data contains actual information. If a value is not found, leave it empty. \n\n aircraft description max gross weight total disk area max disk loading\n0 robinson r - 22 light utility helicopter 1370 lb (635 kg) 497 ft square (46.2 m square) 2.6 lb / ft square (14 kg / m square)\n1 bell 206b3 jetranger turboshaft utility helicopter 3200 lb (1451 kg) 872 ft square (81.1 m square) 3.7 lb / ft square (18 kg / m square)\n2 ch - 47d chinook tandem rotor helicopter 50000 lb (22680 kg) 5655 ft square (526 m square) 8.8 lb / ft square (43 kg / m square)\n3 mil mi - 26 heavy - lift helicopter 123500 lb (56000 kg) 8495 ft square (789 m square) 14.5 lb / ft square (71 kg / m square)\n4 ch - 53e super stallion heavy - lift helicopter 73500 lb (33300 kg) 4900 ft square (460 m square) 15 lb / ft square (72 kg / m square)\n\nOUTPUT example:\n# do not use the data from the examples & template; they are just for reference only. The following data contains actual information. If a value is not found, leave it empty. \n[{\"aircraft\": \"robinson r - 22\",\n \"description\": \"light utility helicopter\",\n \"max gross weight\": \"1370 lb (635 kg)\",\n \"total disk area\": \"497 ft square (46.2 m square)\",\n \"max disk loading\": \"2.6 lb / ft square (14 kg / m square)\"},\n{\"aircraft\": \"bell 206b3 jetranger\",\n \"description\": \"turboshaft utility helicopter\",\n \"max gross weight\": \"3200 lb (1451 kg)\",\n \"total disk area\": \"872 ft square (81.1 m square)\",\n \"max disk loading\": \"3.7 lb / ft square (18 kg / m square)\"},\n{\"aircraft\": \"ch - 47d chinook\",\n \"description\": \"tandem rotor helicopter\",\n \"max gross weight\": \"50000 lb (22680 kg)\",\n \"total disk area\": \"5655 ft square (526 m square)\",\n \"max disk loading\": \"8.8 lb / ft square (43 kg / m square)\"},\n{\"aircraft\": \"mil mi - 26\",\n \"description\": \"heavy - lift helicopter\",\n \"max gross weight\": \"123500 lb (56000 kg)\",\n \"total disk area\": \"8495 ft square (789 m square)\",\n \"max disk loading\": \"14.5 lb / ft square (71 kg / m square)\"},\n{\"aircraft\": \"ch - 53e super stallion\",\n \"description\": \"heavy - lift helicopter\",\n \"max gross weight\": \"73500 lb (33300 kg)\",\n \"total disk area\": \"4900 ft square (460 m square)\",\n \"max disk loading\": \"15 lb / ft square (72 kg / m square)\"}]\n\"\"\"","metadata":{"execution":{"iopub.status.busy":"2024-04-16T13:50:12.658286Z","iopub.execute_input":"2024-04-16T13:50:12.659037Z","iopub.status.idle":"2024-04-16T13:50:12.666395Z","shell.execute_reply.started":"2024-04-16T13:50:12.659000Z","shell.execute_reply":"2024-04-16T13:50:12.665323Z"},"trusted":true},"execution_count":26,"outputs":[]},{"cell_type":"markdown","source":"\n>## Gemini-1.5-Pro-latest","metadata":{}},{"cell_type":"code","source":"# Set up the model\ngeneration_config = {\n \"temperature\": 1,\n \"top_p\": 0.75,\n \"max_output_tokens\": 6000,\n}","metadata":{"execution":{"iopub.status.busy":"2024-04-15T09:41:32.066112Z","iopub.execute_input":"2024-04-15T09:41:32.066601Z","iopub.status.idle":"2024-04-15T09:41:32.073318Z","shell.execute_reply.started":"2024-04-15T09:41:32.066568Z","shell.execute_reply":"2024-04-15T09:41:32.071569Z"},"trusted":true},"execution_count":58,"outputs":[]},{"cell_type":"code","source":"safety_settings = [\n {\n \"category\": \"HARM_CATEGORY_HARASSMENT\",\n \"threshold\": \"BLOCK_MEDIUM_AND_ABOVE\"\n },\n {\n \"category\": \"HARM_CATEGORY_HATE_SPEECH\",\n \"threshold\": \"BLOCK_MEDIUM_AND_ABOVE\"\n },\n {\n \"category\": \"HARM_CATEGORY_SEXUALLY_EXPLICIT\",\n \"threshold\": \"BLOCK_MEDIUM_AND_ABOVE\"\n },\n {\n \"category\": \"HARM_CATEGORY_DANGEROUS_CONTENT\",\n \"threshold\": \"BLOCK_MEDIUM_AND_ABOVE\"\n },\n]","metadata":{"execution":{"iopub.status.busy":"2024-04-15T09:41:34.759048Z","iopub.execute_input":"2024-04-15T09:41:34.759580Z","iopub.status.idle":"2024-04-15T09:41:34.765339Z","shell.execute_reply.started":"2024-04-15T09:41:34.759549Z","shell.execute_reply":"2024-04-15T09:41:34.763792Z"},"trusted":true},"execution_count":59,"outputs":[]},{"cell_type":"code","source":"import google.generativeai as genai\n\nprint('Available base models:', [m.name for m in genai.list_models()])","metadata":{"execution":{"iopub.status.busy":"2024-04-15T09:41:38.212330Z","iopub.execute_input":"2024-04-15T09:41:38.213763Z","iopub.status.idle":"2024-04-15T09:41:39.130091Z","shell.execute_reply.started":"2024-04-15T09:41:38.213716Z","shell.execute_reply":"2024-04-15T09:41:39.128583Z"},"trusted":true},"execution_count":60,"outputs":[{"name":"stdout","text":"Available base models: ['models/chat-bison-001', 'models/text-bison-001', 'models/embedding-gecko-001', 'models/gemini-1.0-pro', 'models/gemini-1.0-pro-001', 'models/gemini-1.0-pro-latest', 'models/gemini-1.0-pro-vision-latest', 'models/gemini-1.5-pro-latest', 'models/gemini-pro', 'models/gemini-pro-vision', 'models/embedding-001', 'models/text-embedding-004', 'models/aqa']\n","output_type":"stream"}]},{"cell_type":"code","source":"model = genai.GenerativeModel(model_name=\"gemini-1.5-pro-latest\",\n generation_config=generation_config,\n safety_settings=safety_settings)","metadata":{"execution":{"iopub.status.busy":"2024-04-15T09:41:42.126280Z","iopub.execute_input":"2024-04-15T09:41:42.127602Z","iopub.status.idle":"2024-04-15T09:41:42.134692Z","shell.execute_reply.started":"2024-04-15T09:41:42.127541Z","shell.execute_reply":"2024-04-15T09:41:42.133090Z"},"trusted":true},"execution_count":61,"outputs":[]},{"cell_type":"code","source":"# Create a copy of the DataFrame\ndf_copy = df_sample.copy()\ndf_copy['pred_response'] = None\n\n# Iterate through each row in the DataFrame with tqdm for progress visualization\nfor i in tqdm(df_copy.index, desc=\"Generating Predictions\", total=len(df_copy)):\n try:\n template = f\"Instruction:\\n{prompt}\\nINPUTDATA:{df_copy.loc[i,'context']}\\nResponse:\\n\"\n response = model.generate_content(template)\n pred_reponse = response.text\n # Update the 'pred_response' column with the generated prediction\n df_copy.loc[i,'pred_response'] = pred_reponse.replace(template,'')\n time.sleep(5)\n except:\n df_copy.loc[i,'pred_response'] = None","metadata":{"execution":{"iopub.status.busy":"2024-04-15T09:41:50.805580Z","iopub.execute_input":"2024-04-15T09:41:50.805916Z","iopub.status.idle":"2024-04-15T10:43:00.527855Z","shell.execute_reply.started":"2024-04-15T09:41:50.805894Z","shell.execute_reply":"2024-04-15T10:43:00.526913Z"},"trusted":true},"execution_count":62,"outputs":[{"name":"stderr","text":"Generating Predictions: 100%|██████████| 99/99 [1:01:09<00:00, 37.07s/it]\n","output_type":"stream"}]},{"cell_type":"code","source":"df_copy.dropna(inplace=True)\ndf_copy.shape","metadata":{"execution":{"iopub.status.busy":"2024-04-15T10:44:51.628044Z","iopub.execute_input":"2024-04-15T10:44:51.628446Z","iopub.status.idle":"2024-04-15T10:44:51.642891Z","shell.execute_reply.started":"2024-04-15T10:44:51.628422Z","shell.execute_reply":"2024-04-15T10:44:51.640954Z"},"trusted":true},"execution_count":63,"outputs":[{"execution_count":63,"output_type":"execute_result","data":{"text/plain":"(75, 3)"},"metadata":{}}]},{"cell_type":"code","source":"sum_key=0\nsum_val = 0\ncount_errors = 0\nfor i in df_copy.index:\n\n pred_records = parse_json(df_copy.loc[i,'pred_response'])\n if pred_records==None:\n count_errors +=1\n continue\n true_records = df_copy.loc[i,'answer']\n \n sum_key += average_percentage_key(true_records,pred_records)\n \n sum_val += average_percentage_value(true_records,pred_records)\n# print(i)\n# print(average_percentage_key(true_records,pred_records))","metadata":{"execution":{"iopub.status.busy":"2024-04-15T11:31:28.697391Z","iopub.execute_input":"2024-04-15T11:31:28.697909Z","iopub.status.idle":"2024-04-15T11:31:28.727007Z","shell.execute_reply.started":"2024-04-15T11:31:28.697870Z","shell.execute_reply":"2024-04-15T11:31:28.724017Z"},"trusted":true},"execution_count":120,"outputs":[{"name":"stdout","text":"JSON parsing error: Expecting property name enclosed in double quotes: line 1 column 3 (char 2)\nJSON parsing error: Expecting property name enclosed in double quotes: line 1 column 3 (char 2)\nJSON parsing error: Expecting property name enclosed in double quotes: line 1 column 3 (char 2)\n","output_type":"stream"}]},{"cell_type":"code","source":"print(\"Average Percentage of Predicted Keys:\", sum_key/(len(df_copy)-count_errors))\nprint(\"Average Percentage of Predicted values:\", sum_val/(len(df_copy)-count_errors))","metadata":{"execution":{"iopub.status.busy":"2024-04-15T11:31:30.890876Z","iopub.execute_input":"2024-04-15T11:31:30.891270Z","iopub.status.idle":"2024-04-15T11:31:30.898589Z","shell.execute_reply.started":"2024-04-15T11:31:30.891247Z","shell.execute_reply":"2024-04-15T11:31:30.896750Z"},"trusted":true},"execution_count":121,"outputs":[{"name":"stdout","text":"Average Percentage of Predicted Keys: 0.9818672839506173\nAverage Percentage of Predicted values: 0.9734204067537401\n","output_type":"stream"}]},{"cell_type":"markdown","source":"\n>## Gemini-pro","metadata":{}},{"cell_type":"code","source":"model2 = genai.GenerativeModel(model_name=\"gemini-1.0-pro\",\n generation_config=generation_config,\n safety_settings=safety_settings)","metadata":{"execution":{"iopub.status.busy":"2024-04-15T11:57:35.111910Z","iopub.execute_input":"2024-04-15T11:57:35.113113Z","iopub.status.idle":"2024-04-15T11:57:35.118681Z","shell.execute_reply.started":"2024-04-15T11:57:35.113068Z","shell.execute_reply":"2024-04-15T11:57:35.117351Z"},"trusted":true},"execution_count":123,"outputs":[]},{"cell_type":"code","source":"# Create a copy of the DataFrame\ndf_copy2 = df_sample.copy()\ndf_copy2['pred_response'] = None\n\n# Iterate through each row in the DataFrame with tqdm for progress visualization\nfor i in tqdm(df_copy2.index, desc=\"Generating Predictions\", total=len(df_copy2)):\n try:\n template = f\"Instruction:\\n{prompt}\\nINPUTDATA:{df_copy2.loc[i,'context']}\\nResponse:\\n\"\n response = model2.generate_content(template)\n pred_reponse = response.text\n # Update the 'pred_response' column with the generated prediction\n df_copy2.loc[i,'pred_response'] = pred_reponse.replace(template,'')\n time.sleep(5)\n except:\n df_copy2.loc[i,'pred_response'] = None","metadata":{"execution":{"iopub.status.busy":"2024-04-15T11:57:44.126477Z","iopub.execute_input":"2024-04-15T11:57:44.126863Z","iopub.status.idle":"2024-04-15T12:27:49.160681Z","shell.execute_reply.started":"2024-04-15T11:57:44.126838Z","shell.execute_reply":"2024-04-15T12:27:49.158409Z"},"trusted":true},"execution_count":124,"outputs":[{"name":"stderr","text":"Generating Predictions: 100%|██████████| 99/99 [30:05<00:00, 18.23s/it]\n","output_type":"stream"}]},{"cell_type":"code","source":"df_copy2.dropna(inplace=True)\ndf_copy2.shape","metadata":{"execution":{"iopub.status.busy":"2024-04-15T12:30:12.484951Z","iopub.execute_input":"2024-04-15T12:30:12.485662Z","iopub.status.idle":"2024-04-15T12:30:12.506200Z","shell.execute_reply.started":"2024-04-15T12:30:12.485603Z","shell.execute_reply":"2024-04-15T12:30:12.503883Z"},"trusted":true},"execution_count":125,"outputs":[{"execution_count":125,"output_type":"execute_result","data":{"text/plain":"(73, 3)"},"metadata":{}}]},{"cell_type":"code","source":"sum_key=0\nsum_val = 0\ncount_errors = 0\nfor i in df_copy2.index:\n\n pred_records = parse_json(df_copy2.loc[i,'pred_response'])\n if pred_records==None:\n count_errors +=1\n continue\n true_records = df_copy2.loc[i,'answer']\n \n sum_key += average_percentage_key(true_records,pred_records)\n \n sum_val += average_percentage_value(true_records,pred_records))","metadata":{"execution":{"iopub.status.busy":"2024-04-15T12:30:14.843887Z","iopub.execute_input":"2024-04-15T12:30:14.844306Z","iopub.status.idle":"2024-04-15T12:30:14.886908Z","shell.execute_reply.started":"2024-04-15T12:30:14.844279Z","shell.execute_reply":"2024-04-15T12:30:14.883006Z"},"trusted":true},"execution_count":126,"outputs":[{"name":"stdout","text":"JSON parsing error: Expecting value: line 6 column 225 (char 1764)\nJSON parsing error: Expecting value: line 1 column 1 (char 0)\n","output_type":"stream"}]},{"cell_type":"code","source":"print(\"Average Percentage of Predicted Keys:\", sum_key/(len(df_copy2)-count_errors))\nprint(\"Average Percentage of Predicted values:\", sum_val/(len(df_copy2)-count_errors))","metadata":{"execution":{"iopub.status.busy":"2024-04-15T12:30:23.862222Z","iopub.execute_input":"2024-04-15T12:30:23.862742Z","iopub.status.idle":"2024-04-15T12:30:23.874907Z","shell.execute_reply.started":"2024-04-15T12:30:23.862703Z","shell.execute_reply":"2024-04-15T12:30:23.872561Z"},"trusted":true},"execution_count":127,"outputs":[{"name":"stdout","text":"Average Percentage of Predicted Keys: 0.961524703778225\nAverage Percentage of Predicted values: 0.9308108939374473\n","output_type":"stream"}]},{"cell_type":"markdown","source":"\n>## OpenHermes-Mistral","metadata":{}},{"cell_type":"code","source":"df_sample =df.loc[:50]","metadata":{"execution":{"iopub.status.busy":"2024-04-16T13:26:27.142847Z","iopub.execute_input":"2024-04-16T13:26:27.143519Z","iopub.status.idle":"2024-04-16T13:26:27.148692Z","shell.execute_reply.started":"2024-04-16T13:26:27.143484Z","shell.execute_reply":"2024-04-16T13:26:27.147685Z"},"trusted":true},"execution_count":23,"outputs":[]},{"cell_type":"code","source":"base_model_id = \"teknium/OpenHermes-2.5-Mistral-7B\"\nbnb_config = BitsAndBytesConfig(\n load_in_4bit=True,\n bnb_4bit_use_double_quant=True,\n bnb_4bit_quant_type=\"nf4\",\n bnb_4bit_compute_dtype=torch.bfloat16\n)\n\nmodel = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config, device_map=\"auto\")\ntokenizer = AutoTokenizer.from_pretrained(\"teknium/OpenHermes-2.5-Mistral-7B\")","metadata":{"execution":{"iopub.status.busy":"2024-04-15T14:05:17.856576Z","iopub.execute_input":"2024-04-15T14:05:17.856943Z","iopub.status.idle":"2024-04-15T14:07:31.245528Z","shell.execute_reply.started":"2024-04-15T14:05:17.856914Z","shell.execute_reply":"2024-04-15T14:07:31.244574Z"},"trusted":true},"execution_count":25,"outputs":[{"output_type":"display_data","data":{"text/plain":"config.json: 0%| | 0.00/624 [00:00\n>## Mistral-7B-Instruct-v0.2","metadata":{}},{"cell_type":"code","source":"base_model_id = \"mistralai/Mistral-7B-Instruct-v0.2\"\nbnb_config = BitsAndBytesConfig(\n load_in_4bit=True,\n bnb_4bit_use_double_quant=True,\n bnb_4bit_quant_type=\"nf4\",\n bnb_4bit_compute_dtype=torch.bfloat16\n)\n\nmodel = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config, device_map=\"auto\")\ntokenizer = AutoTokenizer.from_pretrained(\"mistralai/Mistral-7B-Instruct-v0.2\")","metadata":{"execution":{"iopub.status.busy":"2024-04-16T09:20:39.889611Z","iopub.execute_input":"2024-04-16T09:20:39.889980Z","iopub.status.idle":"2024-04-16T09:22:41.899623Z","shell.execute_reply.started":"2024-04-16T09:20:39.889952Z","shell.execute_reply":"2024-04-16T09:22:41.898471Z"},"trusted":true},"execution_count":27,"outputs":[{"output_type":"display_data","data":{"text/plain":"config.json: 0%| | 0.00/596 [00:00