diff --git "a/04-poe-eval.ipynb" "b/04-poe-eval.ipynb" --- "a/04-poe-eval.ipynb" +++ "b/04-poe-eval.ipynb" @@ -14,7 +14,7 @@ "metadata": {}, "source": [ "## Config\n", - "Set the tokens based on the numbers in [02-poe-token-count-exploration.ipynb](02-poe-token-count-exploration.ipynb). I like to give a little buffer in-case an explanation goes over." + "Set the tokens based on the numbers in [03-poe-token-count-exploration.ipynb](03-poe-token-count-exploration.ipynb). I like to give a little buffer in-case an explanation goes over." ] }, { @@ -25,11 +25,11 @@ "outputs": [], "source": [ "INPUT_TOKENS = 300\n", - "OUTPUT_TOKENS = 1600\n", + "OUTPUT_TOKENS = 1650\n", "\n", - "INPUT_DATASET = 'derek-thomas/labeled-multiple-choice-explained-mistral-tokenized'\n", - "OUTPUT_DATASET = 'derek-thomas/labeled-multiple-choice-explained-mistral-results'\n", - "BASE_MODEL = 'mistralai/Mistral-7B-Instruct-v0.3'" + "INPUT_DATASET = 'derek-thomas/labeled-multiple-choice-explained-falcon-tokenized'\n", + "OUTPUT_DATASET = 'derek-thomas/labeled-multiple-choice-explained-falcon-results'\n", + "BASE_MODEL = 'tiiuae/Falcon3-7B-Instruct'" ] }, { @@ -65,21 +65,21 @@ " h = \"h\"\n", "\n", "class RFAModel(BaseModel):\n", - " reasoning: str = Field(..., alias=\"Reasoning\")\n", - " final_answer: FinalAnswerEnum = Field(..., alias=\"Final Answer\")\n", + " reasoning: str = Field(...)\n", + " final_answer: FinalAnswerEnum = Field(...)\n", "\n", " class Config:\n", " populate_by_name = True\n", " \n", "class FARModel(BaseModel):\n", - " final_answer: FinalAnswerEnum = Field(..., alias=\"Final Answer\")\n", - " reasoning: str = Field(..., alias=\"Reasoning\")\n", + " final_answer: FinalAnswerEnum = Field(...)\n", + " reasoning: str = Field(...)\n", "\n", " class Config:\n", " populate_by_name = True\n", " \n", "class FAModel(BaseModel):\n", - " final_answer: FinalAnswerEnum = Field(..., alias=\"Final Answer\")\n", + " final_answer: FinalAnswerEnum = Field(...)\n", "\n", " class Config:\n", " 
populate_by_name = True" @@ -90,7 +90,7 @@ "id": "7e0f51c0-c4f7-4299-9a24-a4a90d4a9f2a", "metadata": {}, "source": [ - "We generated lots of experiments in [derek-thomas/labeled-multiple-choice-explained-mistral-tokenized](https://huggingface.co/datasets/derek-thomas/labeled-multiple-choice-explained-mistral-tokenized/viewer?row=0). Now we will aggregate everything we need in `experiments` for convenience." + "We generated lots of experiments in [derek-thomas/labeled-multiple-choice-explained-falcon-tokenized](https://huggingface.co/datasets/derek-thomas/labeled-multiple-choice-explained-falcon-tokenized/viewer?row=0). Now we will aggregate everything we need in `experiments` for convenience." ] }, { @@ -104,7 +104,7 @@ { "data": { "text/plain": [ - "'derek-thomas/mistral-v03-poe-RFA-mistral,derek-thomas/mistral-v03-poe-FAR-mistral,derek-thomas/mistral-v03-poe-RFA-gpt3-5,derek-thomas/mistral-v03-poe-FAR-gpt3-5,derek-thomas/mistral-v03-poe-FA'" + "'derek-thomas/falcon-v03-poe-RFA-falcon,derek-thomas/falcon-v03-poe-FAR-falcon,derek-thomas/falcon-v03-poe-RFA-gpt3-5,derek-thomas/falcon-v03-poe-FAR-gpt3-5,derek-thomas/falcon-v03-poe-FA'" ] }, "execution_count": 3, @@ -115,29 +115,29 @@ "source": [ "\n", "experiments = {\n", - " 'RFA-mistral': {\n", + " 'RFA-falcon': {\n", " 'pydantic': RFAModel,\n", - " \"lora\": \"derek-thomas/mistral-v03-poe-RFA-mistral\",\n", + " \"lora\": \"derek-thomas/falcon-v03-poe-RFA-falcon\",\n", " \"column\": 'user_prompt_RFA',\n", " },\n", - " 'FAR-mistral': {\n", + " 'FAR-falcon': {\n", " 'pydantic': FARModel,\n", - " \"lora\": \"derek-thomas/mistral-v03-poe-FAR-mistral\",\n", + " \"lora\": \"derek-thomas/falcon-v03-poe-FAR-falcon\",\n", " \"column\": 'user_prompt_FAR',\n", " },\n", " 'RFA-gpt3-5': {\n", " 'pydantic': RFAModel,\n", - " \"lora\": \"derek-thomas/mistral-v03-poe-RFA-gpt3-5\",\n", + " \"lora\": \"derek-thomas/falcon-v03-poe-RFA-gpt3-5\",\n", " \"column\": 'user_prompt_RFA',\n", " },\n", " 'FAR-gpt3-5': {\n", " 'pydantic': 
FARModel,\n", - " \"lora\": \"derek-thomas/mistral-v03-poe-FAR-gpt3-5\",\n", + " \"lora\": \"derek-thomas/falcon-v03-poe-FAR-gpt3-5\",\n", " \"column\": 'user_prompt_FAR',\n", " },\n", " 'FA': {\n", " 'pydantic': FAModel,\n", - " \"lora\": \"derek-thomas/mistral-v03-poe-FA\",\n", + " \"lora\": \"derek-thomas/falcon-v03-poe-FA\",\n", " \"column\": 'user_prompt_FA',\n", " },\n", " 'base': {\n", @@ -162,7 +162,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "50dbecc676db4dc78dd1974d2f1a87dc", + "model_id": "c10cc7b616e2475f8d25dd3967b1ed79", "version_major": 2, "version_minor": 0 }, @@ -201,10 +201,69 @@ "outputs": [], "source": [ "from datasets import load_dataset\n", + "import numpy as np\n", "\n", "# Load dataset (test split)\n", "dataset = load_dataset(INPUT_DATASET, split='test')\n", - "df = dataset.to_pandas()" + "df = dataset.to_pandas()\n", + "\n", + "columns_to_convert = [\n", + " 'user_prompt_RFA',\n", + " 'conversation_RFA_gpt3_5',\n", + " 'conversation_RFA_falcon',\n", + " 'user_prompt_FAR',\n", + " 'conversation_FAR_gpt3_5',\n", + " 'conversation_FAR_falcon',\n", + " 'user_prompt_FA',\n", + " 'conversation_FA'\n", + "]\n", + "\n", + "# Convert specified columns from arrays to lists\n", + "for col in columns_to_convert:\n", + " df[col] = df[col].apply(lambda x: x.tolist() if isinstance(x, (list, np.ndarray)) else x)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "35a422c3-c985-47da-b480-6d2904cb527d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "user_prompt_RFA 1683\n", + "user_prompt_FA 1683\n", + "user_prompt_FAR 1683\n" + ] + } + ], + "source": [ + "def is_valid_entry(entry):\n", + " # Check if entry is a list with at least two elements\n", + " if not isinstance(entry, list) or len(entry) < 2:\n", + " print('here')\n", + " return False\n", + " # Check el0\n", + " el0 = entry[0]\n", + " if not (isinstance(el0, dict) and el0.get('role') == 'system' 
and isinstance(el0.get('content'), str)):\n", + " print('system')\n", + " return False\n", + " # Check el1\n", + " el1 = entry[1]\n", + " if not (isinstance(el1, dict) and el1.get('role') == 'user' and isinstance(el1.get('content'), str)):\n", + " print('user')\n", + " return False\n", + " return True\n", + " \n", + "\n", + "def check_column_structure(column):\n", + " return column.apply(is_valid_entry)\n", + "\n", + "# Apply checks\n", + "for col in ['user_prompt_RFA', 'user_prompt_FA', 'user_prompt_FAR']:\n", + " print(f'{col}', sum(check_column_structure(df[col])))" ] }, { @@ -226,7 +285,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "619fbe5e-e35d-430b-a617-9223ba6babd0", "metadata": { "scrolled": true @@ -237,9 +296,10 @@ "from huggingface_hub import get_inference_endpoint\n", "\n", "\n", - "def get_my_endpoint():\n", - " name = f\"prompt-order-experiment\"\n", - " namespace='derek-thomas'\n", + "def get_my_endpoint(name=None):\n", + " if name is None:\n", + " name = f\"prompt-order-experiment\"\n", + " namespace='HF-test-lab'\n", " try:\n", " endpoint = get_inference_endpoint(name, namespace=namespace)\n", " except:\n", @@ -265,7 +325,7 @@ " endpoint = create_inference_endpoint(\n", " name=name,\n", " namespace=namespace,\n", - " repository='mistralai/Mistral-7B-Instruct-v0.3',\n", + " repository=BASE_MODEL,\n", " framework=\"pytorch\",\n", " accelerator=\"gpu\",\n", " instance_size=\"x1\",\n", @@ -279,14 +339,14 @@ " secrets=secrets\n", " )\n", " \n", - " print(\"Your model is ready to use!\")\n", " endpoint.wait()\n", + " print(\"Your model is ready to use!\")\n", " return endpoint" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "000b907a-224d-4dbf-aa0d-e0dbee1b8787", "metadata": {}, "outputs": [ @@ -295,14 +355,14 @@ "output_type": "stream", "text": [ "Your model is ready to use!\n", - "CPU times: user 21.1 ms, sys: 10 ms, total: 31.1 ms\n", - "Wall time: 1.72 s\n" + "CPU times: user 
8.29 ms, sys: 747 μs, total: 9.03 ms\n", + "Wall time: 98.6 ms\n" ] } ], "source": [ "%%time\n", - "endpoint = get_my_endpoint()" + "endpoint = get_my_endpoint('prompt-order-experiment')" ] }, { @@ -320,35 +380,38 @@ "metadata": {}, "source": [ "### Reasoning Final Answer\n", - "In both mistral and gpt3-5 we should see the **Reasoning** first and then the **Final Answer** in the prompt and the responses." + "In both falcon and gpt3-5 we should see the **Reasoning** first and then the **Final Answer** in the prompt and the responses." ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "47af6191-7765-4047-bd8f-64aadb08434e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'[INST] Answer the Question and include your Reasoning and the Final Answer in a json like: {\"Reasoning: \"...\", \"Final Answer\": \"x\"} where x is a letter that corresponds to the answer choice which is a letter between a and h.\\nQuestion: What are busses used for?\\nAnswer Choices: (a) Protective shelter (b) Transporting humans (c) Help other species benefit (d) Transporting airplanes (e) A backbone (f) Communication (g) Safe operation (h) Safe driving[/INST]'" + "[{'content': 'Answer the Question and include your reasoning and the final answer in a json like: {\"reasoning\": , \"final_answer\": }.',\n", + " 'role': 'system'},\n", + " {'content': 'Question: What are busses used for?\\nAnswer Choices: (a) Protective shelter (b) Transporting humans (c) Help other species benefit (d) Transporting airplanes (e) A backbone (f) Communication (g) Safe operation (h) Safe driving',\n", + " 'role': 'user'}]" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "key = 'RFA-mistral'\n", + "key = 'RFA-falcon'\n", "user_prompt_RFA = df.iloc[0][experiments[key]['column']]\n", "user_prompt_RFA" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": 
"1f976218-f33c-4db3-9797-3935e121e6b2", "metadata": { "scrolled": true @@ -357,27 +420,28 @@ { "data": { "text/plain": [ - "'{\"Reasoning\": \"Busses are primarily used for transporting people, so the correct answer is (b) Transporting humans. The other options are not related to the function of a bus.\", \"Final Answer\": \"b\"}'" + "{'reasoning': 'Busses are primarily designed to transport people from one location to another. They are a common mode of public transportation used by many for commuting, school, work, and other activities. None of the other choices directly relate to the main function of a bus.',\n", + " 'final_answer': 'b'}" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "response = endpoint.client.text_generation(\n", - " prompt=user_prompt_RFA,\n", - " max_new_tokens=OUTPUT_TOKENS,\n", - " adapter_id=experiments[key]['lora'],\n", - " grammar={\"type\": \"json\", \"value\": experiments[key]['pydantic'].schema()},\n", + "response = endpoint.client.chat_completion(\n", + " messages=user_prompt_RFA,\n", + " max_tokens=INPUT_TOKENS + OUTPUT_TOKENS,\n", + " model=experiments[key]['lora'],\n", + " response_format={\"type\": \"json\", \"value\": experiments[key]['pydantic'].schema()},\n", ")\n", - "response" + "json.loads(response.choices[0].message.content)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "222e33b7-0158-44f8-8848-da5318e699b4", "metadata": { "scrolled": true @@ -386,23 +450,24 @@ { "data": { "text/plain": [ - "'{\"Reasoning\": \"Busses are primarily used for transporting humans, especially in urban areas where public transportation is necessary. They provide a means of transportation for a large number of people at once, reducing traffic congestion and carbon emissions. 
Therefore, the correct answer is (b) transporting humans.\", \"Final Answer\": \"b\"}'" + "{'reasoning': \"Busses are large vehicles designed to transport people from one place to another. They operate on fixed routes and schedules, offering a convenient mode of public transportation for many individuals. The choice of 'Transporting humans' best encapsulates the primary function of busses, as they are not intended for carrying other items or species, nor are they part of an airplane's structure.\",\n", + " 'final_answer': 'b'}" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "key = 'RFA-gpt3-5'\n", - "response = endpoint.client.text_generation(\n", - " prompt=user_prompt_RFA,\n", - " max_new_tokens=OUTPUT_TOKENS,\n", - " adapter_id=experiments[key]['lora'],\n", - " grammar={\"type\": \"json\", \"value\": experiments[key]['pydantic'].schema()},\n", + "response = endpoint.client.chat_completion(\n", + " messages=user_prompt_RFA,\n", + " max_tokens=INPUT_TOKENS + OUTPUT_TOKENS,\n", + " model=experiments[key]['lora'],\n", + " response_format={\"type\": \"json\", \"value\": experiments[key]['pydantic'].schema()},\n", ")\n", - "response" + "json.loads(response.choices[0].message.content)" ] }, { @@ -411,22 +476,25 @@ "metadata": {}, "source": [ "### Final Answer Reasoning \n", - "In both mistral and gpt3-5 we should see the **Final Answer** first and then the **Reasoning** in the prompt and the responses." + "In both falcon and gpt3-5 we should see the **Final Answer** first and then the **Reasoning** in the prompt and the responses." 
] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "ec3e574b-f63f-4513-a6ae-335136543a8c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'[INST] Answer the Question and include your Final Answer and the Reasoning in a json like: {\"Final Answer\": \"x\", \"Reasoning: \"...\"} where x is a letter that corresponds to the answer choice which is a letter between a and h.\\nQuestion: What are busses used for?\\nAnswer Choices: (a) Protective shelter (b) Transporting humans (c) Help other species benefit (d) Transporting airplanes (e) A backbone (f) Communication (g) Safe operation (h) Safe driving[/INST]'" + "[{'content': 'Answer the Question and include your Final Answer and the Reasoning in a json like: {\"final_answer\": , \"reasoning\": }.',\n", + " 'role': 'system'},\n", + " {'content': 'Question: What are busses used for?\\nAnswer Choices: (a) Protective shelter (b) Transporting humans (c) Help other species benefit (d) Transporting airplanes (e) A backbone (f) Communication (g) Safe operation (h) Safe driving',\n", + " 'role': 'user'}]" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -439,7 +507,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "id": "24f30a15-5ec0-4f26-b32f-b4ccb429e6f9", "metadata": { "scrolled": true @@ -448,27 +516,28 @@ { "data": { "text/plain": [ - "'{\"Final Answer\": \"b\", \"Reasoning\": \"Busses are primarily used for transporting humans, especially in urban areas, to facilitate commuting and travel. They are not used for protective shelter, helping other species benefit, transporting airplanes, serving as a backbone, communication, safe operation, or safe driving.\"}'" + "{'final_answer': 'b',\n", + " 'reasoning': 'Buses are vehicles primarily used for transporting humans from one place to another. They provide a convenient and efficient way for people to travel together on public transit. 
The other options are not accurate representations of the main purpose of a bus. Protective shelter, help other species benefit, transport airplanes, backbone, communication, safe operation, and safe driving are not the primary functions of a bus.'}" ] }, - "execution_count": 13, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "response = endpoint.client.text_generation(\n", - " prompt=user_prompt_FAR,\n", - " max_new_tokens=575,\n", - " adapter_id=experiments[key]['lora'],\n", - " grammar={\"type\": \"json\", \"value\": experiments[key]['pydantic'].schema()},\n", + "response = endpoint.client.chat_completion(\n", + " messages=user_prompt_FAR,\n", + " max_tokens=INPUT_TOKENS + OUTPUT_TOKENS,\n", + " model=experiments[key]['lora'],\n", + " response_format={\"type\": \"json\", \"value\": experiments[key]['pydantic'].schema()},\n", ")\n", - "response" + "json.loads(response.choices[0].message.content)" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "id": "32536844-211d-4856-983c-d5787734d420", "metadata": { "scrolled": true @@ -477,23 +546,24 @@ { "data": { "text/plain": [ - "'{\"Final Answer\": \"b\", \"Reasoning\": \"Busses are primarily used for transporting humans, especially in urban areas, to facilitate their travel from one place to another. 
The other options are incorrect because busses do not provide protective shelter, do not help other species benefit, are not used to transport airplanes, do not serve as a backbone, are not used for communication, and are not related to safe operation or driving.\"}'" + "{'final_answer': 'b',\n", + " 'reasoning': 'Busses are primarily used for transporting humans from one place to another, making option (b) the most accurate choice among the given answers.'}" ] }, - "execution_count": 14, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "key = 'FAR-mistral'\n", - "response = endpoint.client.text_generation(\n", - " prompt=user_prompt_FAR,\n", - " max_new_tokens=575,\n", - " adapter_id=experiments[key]['lora'],\n", - " grammar={\"type\": \"json\", \"value\": experiments[key]['pydantic'].schema()},\n", + "key = 'FAR-falcon'\n", + "response = endpoint.client.chat_completion(\n", + " messages=user_prompt_FAR,\n", + " max_tokens=INPUT_TOKENS + OUTPUT_TOKENS,\n", + " model=experiments[key]['lora'],\n", + " response_format={\"type\": \"json\", \"value\": experiments[key]['pydantic'].schema()},\n", ")\n", - "response" + "json.loads(response.choices[0].message.content)" ] }, { @@ -507,17 +577,20 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "71a9f634-319c-40c2-8f66-18e282732320", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'[INST] Answer the Question and include your Final Answer in a json like: {\"Final Answer\": \"x\"} where x is a letter that corresponds to the answer choice which is a letter between a and h.\\nQuestion: What are busses used for?\\nAnswer Choices: (a) Protective shelter (b) Transporting humans (c) Help other species benefit (d) Transporting airplanes (e) A backbone (f) Communication (g) Safe operation (h) Safe driving[/INST]'" + "[{'content': 'Answer the Question and include your Final Answer in a json like: {\"final_answer\": }.',\n", + " 'role': 
'system'},\n", + " {'content': 'Question: What are busses used for?\\nAnswer Choices: (a) Protective shelter (b) Transporting humans (c) Help other species benefit (d) Transporting airplanes (e) A backbone (f) Communication (g) Safe operation (h) Safe driving',\n", + " 'role': 'user'}]" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -530,7 +603,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "1cded37d-b907-4f4d-9b8c-c2167c6ba213", "metadata": { "scrolled": true @@ -539,21 +612,22 @@ { "data": { "text/plain": [ - "\"{'Final Answer': 'b'}\"" + "{'final_answer': 'b'}" ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "response = endpoint.client.text_generation(\n", - " prompt=user_prompt_FA,\n", - " max_new_tokens=575,\n", - " adapter_id=experiments[key]['lora'],\n", + "response = endpoint.client.chat_completion(\n", + " messages=user_prompt_FA,\n", + " max_tokens=INPUT_TOKENS + OUTPUT_TOKENS,\n", + " model=experiments[key]['lora'],\n", + " response_format={\"type\": \"json\", \"value\": experiments[key]['pydantic'].schema()},\n", ")\n", - "response" + "json.loads(response.choices[0].message.content)\n" ] }, { @@ -565,96 +639,16 @@ "I used 20x the prefill than the input and 8 replicas so I should capacity for ~160 parallel requests. Im only using 128 but it should be pretty fast." 
] }, - { - "cell_type": "code", - "execution_count": 17, - "id": "fbc69ea7-e2ca-4405-82f0-5af95375ec88", - "metadata": {}, - "outputs": [], - "source": [ - "import nest_asyncio\n", - "import asyncio\n", - "from transformers import AutoTokenizer\n", - "from tqdm.auto import tqdm\n", - "\n", - "# Allow nested event loops in Jupyter\n", - "nest_asyncio.apply()\n", - "\n", - "\n", - "# Semaphore to limit concurrency\n", - "CONCURRENCY_LIMIT = 100 \n", - "MAX_NEW_TOKENS = OUTPUT_TOKENS\n", - "semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)\n", - "\n", - "# Progress bar\n", - "progress_bar = None # Global to allow updates from within async functions\n", - "\n", - "# Function to send asynchronous requests to the endpoint\n", - "async def fetch_response_async(async_client, prompt, lora_id, pydantic_model):\n", - " async with semaphore: # Limit the number of concurrent requests\n", - " response = await async_client.text_generation(\n", - " prompt=prompt,\n", - " max_new_tokens=MAX_NEW_TOKENS,\n", - " adapter_id=lora_id if lora_id else None,\n", - " grammar={\"type\": \"json\", \"value\": pydantic_model.schema()}\n", - " )\n", - " progress_bar.update(1) # Update the progress bar when the request is complete\n", - " return response\n", - "\n", - "# Function to process a single conversation type asynchronously\n", - "async def process_conversation_type(conversation_type, model_info, df, tokenizer, async_client):\n", - " response_column = f\"responses_{conversation_type.replace('-','_')}\"\n", - " responses = [] # Temporary list to hold responses for the current conversation type\n", - "\n", - " tasks = []\n", - " for _, item in df.iterrows():\n", - " prompt = item.get(model_info[\"column\"])\n", - " tasks.append(fetch_response_async(async_client, prompt, model_info[\"lora\"], model_info[\"pydantic\"]))\n", - "\n", - " # Wait for all tasks to complete\n", - " responses = await asyncio.gather(*tasks)\n", - "\n", - " # If responses are strings, use them directly; otherwise, 
extract 'generated_text'\n", - " try:\n", - " df[response_column] = [resp[\"generated_text\"] for resp in responses]\n", - " except TypeError: # Fallback in case responses are raw strings\n", - " df[response_column] = responses\n", - "\n", - "# Main function to handle all conversation types\n", - "async def main(df, models, tokenizer, async_client):\n", - " global progress_bar\n", - " total_requests = len(df) * len(models) # Total number of requests across all conversation types\n", - " progress_bar = tqdm(total=total_requests, desc=\"Processing requests\")\n", - "\n", - " tasks = []\n", - " for conversation_type, model_info in models.items():\n", - " tasks.append(process_conversation_type(conversation_type, model_info, df, tokenizer, async_client))\n", - " await asyncio.gather(*tasks)\n", - "\n", - " progress_bar.close() # Close the progress bar when done\n", - "\n", - "# Define parameters and run\n", - "# await main(df, experiments, tokenizer, endpoint.async_client)" - ] - }, - { - "cell_type": "markdown", - "id": "9662adde-4ea4-4254-a6be-c619ed9557c8", - "metadata": {}, - "source": [ - "This is the same as above but with a couple nice features like time-out in case you run into any issues." - ] - }, { "cell_type": "code", "execution_count": 18, - "id": "e2671c78-6cd1-4bb7-aa04-b695bfb02115", + "id": "379cd952-0ef4-41c4-be3b-5e7ac97e9d78", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "be0bd3e278ae4d90a161918772ee71e8", + "model_id": "85d5f7f69b8c426f90ae2ac79935d1ee", "version_major": 2, "version_minor": 0 }, @@ -664,6 +658,62 @@ }, "metadata": {}, "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... 
(1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... 
(1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... 
(1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... 
(1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... 
(1/3)\n", + "Error: 424, message='Failed Dependency', url='https://f3b2osxktsilpdce.us-east4.gcp.endpoints.huggingface.cloud/v1/chat/completions'. Retrying... (1/3)\n" + ] } ], "source": [ @@ -677,8 +727,8 @@ "nest_asyncio.apply()\n", "\n", "# Semaphore to limit concurrency\n", - "CONCURRENCY_LIMIT = 100 \n", - "MAX_NEW_TOKENS = OUTPUT_TOKENS\n", + "CONCURRENCY_LIMIT = 50 \n", + "MAX_NEW_TOKENS = INPUT_TOKENS + OUTPUT_TOKENS\n", "semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)\n", "\n", "# Progress bar\n", @@ -694,14 +744,14 @@ " while retries < MAX_RETRIES:\n", " try:\n", " async with semaphore: # Limit the number of concurrent requests\n", - " response = await async_client.text_generation(\n", - " prompt=prompt,\n", - " max_new_tokens=MAX_NEW_TOKENS,\n", - " adapter_id=lora_id if lora_id else None,\n", - " grammar={\"type\": \"json\", \"value\": pydantic_model.schema()}\n", + " response = await async_client.chat_completion(\n", + " messages=prompt,\n", + " max_tokens=MAX_NEW_TOKENS,\n", + " model=lora_id if lora_id else None,\n", + " response_format={\"type\": \"json\", \"value\": pydantic_model.schema()}\n", " )\n", " progress_bar.update(1) # Update the progress bar when the request is complete\n", - " return response\n", + " return response.choices[0].message.content\n", " except Exception as e:\n", " retries += 1\n", " if retries >= MAX_RETRIES:\n", @@ -743,7 +793,7 @@ " progress_bar.close() # Close the progress bar when done\n", "\n", "# Define parameters and run\n", - "await main(df, experiments, tokenizer, endpoint.async_client)\n" + "await main(df, experiments, tokenizer, endpoint.async_client)" ] }, { @@ -751,22 +801,22 @@ "id": "d562e0b4-96ae-4259-925d-4d62b8c49641", "metadata": {}, "source": [ - "It took `00:10:43`. Not bad! That should be around `$1.14` total at `$0.80/gpu/hr`." + "It took `00:17:02`. Not bad! That should be around `$1.82` total at `$0.80/gpu/hr`." 
] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 28, "id": "8f81466e-80fb-4915-9c68-dfbe168e052b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "InferenceEndpoint(name='prompt-order-experiment', namespace='derek-thomas', repository='mistralai/Mistral-7B-Instruct-v0.3', status='paused', url=None)" + "InferenceEndpoint(name='prompt-order-experiment', namespace='HF-test-lab', repository='tiiuae/Falcon3-7B-Instruct', status='paused', url=None)" ] }, - "execution_count": 19, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -777,7 +827,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "id": "f5a79dad-2475-4324-8a2b-77b33e9c0822", "metadata": {}, "outputs": [], @@ -787,7 +837,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "id": "3473d555-927a-49ab-8e15-9097ed455c48", "metadata": {}, "outputs": [ @@ -816,23 +866,22 @@ " question_text\n", " answer_key\n", " gpt3_5_reasoning\n", - " mistral_reasoning\n", + " falcon_reasoning\n", " answer_choices\n", - " user_prompt\n", " user_prompt_RFA\n", " conversation_RFA_gpt3_5\n", - " conversation_RFA_mistral\n", - " ...\n", + " conversation_RFA_falcon\n", + " user_prompt_FAR\n", " conversation_FAR_gpt3_5\n", - " conversation_FAR_mistral\n", + " conversation_FAR_falcon\n", " user_prompt_FA\n", " conversation_FA\n", - " responses_RFA_mistral\n", - " responses_FAR_mistral\n", + " responses_RFA_falcon\n", + " responses_FAR_falcon\n", " responses_RFA_gpt3_5\n", - " responses_FAR_gpt3_5\n", - " responses_FA\n", " responses_base\n", + " responses_FA\n", + " responses_FAR_gpt3_5\n", " \n", " \n", " \n", @@ -842,23 +891,22 @@ " What are busses used for?\n", " b\n", " a) Protective shelter: This option is incorrec...\n", - " 1. Start by reading the question carefully: \"C...\n", + " (a) Protective shelter - \\nErroneous. 
Busses a...\n", " (a) Protective shelter (b) Transporting humans...\n", - " Question: What are busses used for?\\nAnswer Ch...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " ...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " {\"Reasoning\": \"Busses are primarily used for t...\n", - " {\"Final Answer\": \"b\", \"Reasoning\": \"Busses are...\n", - " {\"Reasoning\": \"Busses are primarily used for t...\n", - " {\"Final Answer\": \"b\", \"Reasoning\": \"Busses are...\n", - " { \"Final Answer\": \"b\" }\n", - " {\"Final Answer\": \"b\"}\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " {\"reasoning\": \"Busses are primarily used for t...\n", + " {\"final_answer\": \"b\", \"reasoning\": \"Busses are...\n", + " {\"reasoning\": \"Busses are vehicles used primar...\n", + " {\"final_answer\": \"b\"}\n", + " {\"final_answer\": \"b\"}\n", + " {\"final_answer\": \"b\", \"reasoning\": \"Busses are...\n", " \n", " \n", " 1\n", @@ -866,23 +914,22 @@ " Which of the following does not contribute to ...\n", " g\n", " a) Nucleus of a cell: This option is not relat...\n", - " To solve this question, let's first understand...\n", + " (a) Nucleus of a cell: This option is incorrec...\n", " (a) Nucleus of a cell (b) Flying in a plane (c...\n", - " 
Question: Which of the following does not cont...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " ...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " {\"Reasoning\": \"Global warming is primarily cau...\n", - " {\"Final Answer\": \"g\", \"Reasoning\": \"Riding a b...\n", - " {\"Reasoning\": \"The nucleus of a cell (option a...\n", - " { \"Final Answer\": \"a\" ,\"Reasoning\": \"The nucle...\n", - " {\"Final Answer\": \"a\"}\n", - " {\"Final Answer\": \"a\"}\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " {\"reasoning\": \"Nucleus of a cell (a) does not ...\n", + " {\"final_answer\": \"a\", \"reasoning\": \"The nucleu...\n", + " {\"reasoning\": \"The question asks which of the ...\n", + " {\"final_answer\": \"g\"}\n", + " {\"final_answer\": \"g\"}\n", + " {\"final_answer\": \"d\", \"reasoning\": \"The questi...\n", " \n", " \n", " 2\n", @@ -890,23 +937,22 @@ " What uses electrical energy converted from che...\n", " b\n", " a) Sunlight: Sunlight is a form of energy that...\n", - " 1. 
Read the question and options carefully: Th...\n", + " (a) Sunlight: Sunlight is a form of energy tha...\n", " (a) Sunlight (b) Cameras (c) Cells (d) Buses (...\n", - " Question: What uses electrical energy converte...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " ...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " {\"Reasoning\": \"The question asks for an organi...\n", - " {\"Final Answer\": \"b\", \"Reasoning\": \"Cameras ar...\n", - " {\"Reasoning\": \"The correct answer is (c) Cells...\n", - " {\"Final Answer\": \"c\", \"Reasoning\": \"Cells are ...\n", - " {\"Final Answer\": \"b\"}\n", - " {\"Final Answer\": \"c\"}\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " {\"reasoning\": \"Cells convert chemical energy s...\n", + " {\"final_answer\": \"c\", \"reasoning\": \"Cells, spe...\n", + " {\"reasoning\": \"Cells use electrical energy for...\n", + " {\"final_answer\": \"c\"}\n", + " {\"final_answer\": \"f\"}\n", + " {\"final_answer\": \"f\", \"reasoning\": \"Cars use e...\n", " \n", " \n", " 3\n", @@ -914,23 +960,22 @@ " Bacteria causes what to be harmed?\n", " a\n", " Now, let's go through each option and explain ...\n", - " To answer this question correctly, let's follo...\n", + " 1. 
**Plants (a) - Correct Answer:**\\n - Bact...\n", " (a) Plants (b) Electronics (c) Fossils (d) Hum...\n", - " Question: Bacteria causes what to be harmed?\\n...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " ...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " {\"Reasoning\": \"The question asks about what is...\n", - " {\"Final Answer\": \"d\", \"Reasoning\": \"Bacteria c...\n", - " {\"Reasoning\": \"Bacteria are microorganisms tha...\n", - " {\"Final Answer\": \"d\", \"Reasoning\": \"Bacteria c...\n", - " {\"Final Answer\": \"e\"}\n", - " {\"Final Answer\": \"d\"}\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " {\"reasoning\": \"Bacteria can cause harm to vari...\n", + " {\"final_answer\": \"d\", \"reasoning\": \"Bacteria c...\n", + " {\"reasoning\": \"Bacteria can harm various livin...\n", + " {\"final_answer\": \"d\"}\n", + " {\"final_answer\": \"d\"}\n", + " {\"final_answer\": \"d\", \"reasoning\": \"The questi...\n", " \n", " \n", " 4\n", @@ -938,23 +983,22 @@ " Plants and snakes live _.?\n", " a\n", " b) Important habitats: This option is incorrec...\n", - " 1. 
Read the question and options carefully: Th...\n", + " **Answer: (a) Almost everywhere**\\n\\n**Explana...\n", " (a) Almost everywhere (b) Important habitats (...\n", - " Question: Plants and snakes live _.?\\nAnswer C...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " ...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " {\"Reasoning\": \"The question asks for the livin...\n", - " {\"Final Answer\": \"a\", \"Reasoning\": \"Plants and...\n", - " {\"Reasoning\": \"The question asks about the liv...\n", - " { \"Final Answer\": \"a\" ,\"Reasoning\": \"Plants an...\n", - " {\"Final Answer\": \"a\"}\n", - " {\"Final Answer\": \"g\"}\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " {\"reasoning\": \"Plants and snakes, as different...\n", + " {\"final_answer\": \"g\", \"reasoning\": \"The correc...\n", + " {\"reasoning\": \"Plants and snakes are both comm...\n", + " {\"final_answer\": \"g\"}\n", + " {\"final_answer\": \"a\"}\n", + " {\"final_answer\": \"f\", \"reasoning\": \"Plants and...\n", " \n", " \n", " ...\n", @@ -978,7 +1022,6 @@ " ...\n", " ...\n", " ...\n", - " ...\n", " \n", " \n", " 1678\n", @@ -986,23 +1029,22 @@ " New resources required for creation can be red...\n", " g\n", " a) Mining: Mining involves extracting 
minerals...\n", - " 1. Start by reading the question and options c...\n", + " (a) Mining: Mining is the process of extractin...\n", " (a) Mining (b) Mutations (c) Fossil fuels (d) ...\n", - " Question: New resources required for creation ...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " ...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " {\"Reasoning\": \"The question asks for a way to ...\n", - " {\"Final Answer\": \"g\", \"Reasoning\": \"Recycling ...\n", - " {\"Reasoning\": \"Mining, fossil fuels, deforesta...\n", - " { \"Final Answer\": \"g\" ,\"Reasoning\": \"Recycling...\n", - " {\"Final Answer\": \"g\"}\n", - " {\"Final Answer\": \"g\"}\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " {\"reasoning\": \"Recycling (g) is the option tha...\n", + " {\"final_answer\": \"g\", \"reasoning\": \"Recycling ...\n", + " {\"reasoning\": \"New resources required for crea...\n", + " {\"final_answer\": \"g\"}\n", + " {\"final_answer\": \"g\"}\n", + " {\"final_answer\": \"g\", \"reasoning\": \"The correc...\n", " \n", " \n", " 1679\n", @@ -1010,23 +1052,22 @@ " A drought dehydrates an entire what?\n", " d\n", " a) Body water: This option is incorrect becaus...\n", - " (a) Watershred - This is not a scientific term...\n", + " The correct 
answer is (d) Environment. \\n\\nNow...\n", " (a) Body water (b) Dried fruit (c) Bodily wate...\n", - " Question: A drought dehydrates an entire what?...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " ...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " {\"Reasoning\": \"The question asks for a term th...\n", - " {\"Final Answer\": \"d\", \"Reasoning\": \"A drought ...\n", - " {\"Reasoning\": \"A drought is a prolonged period...\n", - " { \"Final Answer\": \"d\" ,\"Reasoning\": \"A drought...\n", - " {\"Final Answer\": \"a\"}\n", - " {\"Final Answer\": \"d\"}\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " {\"reasoning\": \"A drought is a period of abnorm...\n", + " {\"final_answer\": \"d\", \"reasoning\": \"A drought ...\n", + " {\"reasoning\": \"Drought is a long-term lack of ...\n", + " {\"final_answer\": \"d\"}\n", + " {\"final_answer\": \"d\"}\n", + " {\"final_answer\": \"d\", \"reasoning\": \"A drought ...\n", " \n", " \n", " 1680\n", @@ -1034,23 +1075,22 @@ " An animal requires ingestion to do what?\n", " e\n", " a) Aerobic capacity: This option is not logica...\n", - " 1. 
Read the question and options carefully: \"W...\n", + " (a) Aerobic capacity: This refers to an animal...\n", " (a) Aerobic capacity (b) Die (c) Water conserv...\n", - " Question: An animal requires ingestion to do w...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " ...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " {\"Reasoning\": \"Ingestion is the process of tak...\n", - " {\"Final Answer\": \"e\", \"Reasoning\": \"Ingestion ...\n", - " {\"Reasoning\": \"Ingestion is the process of tak...\n", - " {\"Final Answer\": \"e\", \"Reasoning\": \"Ingestion ...\n", - " {\"Final Answer\": \"e\"}\n", - " {\"Final Answer\": \"d\"}\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " {\"reasoning\": \"Ingestion is the process of tak...\n", + " {\"final_answer\": \"e\", \"reasoning\": \"Ingestion ...\n", + " {\"reasoning\": \"Ingestion is the process by whi...\n", + " {\"final_answer\": \"e\"}\n", + " {\"final_answer\": \"c\"}\n", + " {\"final_answer\": \"e\", \"reasoning\": \"Animals re...\n", " \n", " \n", " 1681\n", @@ -1058,23 +1098,22 @@ " Ultraviolet light can cause what?\n", " b\n", " a) Ultraviolet light does not cause heat energ...\n", - " 1. 
First, read the question and options carefu...\n", + " Let's examine each option and determine why so...\n", " (a) Heat energy (b) Skin cancer (c) Killing in...\n", - " Question: Ultraviolet light can cause what?\\nA...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " ...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " {\"Reasoning\": \"Ultraviolet (UV) light is a typ...\n", - " {\"Final Answer\": \"b\", \"Reasoning\": \"Ultraviole...\n", - " {\"Reasoning\": \"Ultraviolet (UV) light is a typ...\n", - " { \"Final Answer\": \"b\" ,\"Reasoning\": \"Ultraviol...\n", - " {\"Final Answer\": \"b\"}\n", - " {\"Final Answer\": \"b\"}\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " {\"reasoning\": \"Ultraviolet light is known to h...\n", + " {\"final_answer\": \"b\", \"reasoning\": \"Ultraviole...\n", + " {\"reasoning\": \"Ultraviolet (UV) light is a typ...\n", + " {\"final_answer\": \"b\"}\n", + " {\"final_answer\": \"b\"}\n", + " {\"final_answer\": \"b\", \"reasoning\": \"Ultraviole...\n", " \n", " \n", " 1682\n", @@ -1082,27 +1121,26 @@ " What can increase a body's strength?\n", " c\n", " a) Four limbs: This option is not correct beca...\n", - " (a) Communication: In this context, the questi...\n", + " (a) Four limbs: Having four limbs 
doesn't dire...\n", " (a) Four limbs (b) Disease (c) Running (d) Bic...\n", - " Question: What can increase a body's strength?...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " ...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " {\"Reasoning\": \"The correct answer is 'Exercise...\n", - " {\"Final Answer\": \"c\", \"Reasoning\": \"Running is...\n", - " {\"Reasoning\": \"A body's strength is primarily ...\n", - " {\"Final Answer\": \"c\", \"Reasoning\": \"Running is...\n", - " {\"Final Answer\": \"f\"}\n", - " {\"Final Answer\": \"c\"}\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " {\"reasoning\": \"Among the choices provided, run...\n", + " {\"final_answer\": \"c\", \"reasoning\": \"Running is...\n", + " {\"reasoning\": \"Running involves physical activ...\n", + " {\"final_answer\": \"c\"}\n", + " {\"final_answer\": \"f\"}\n", + " {\"final_answer\": \"c\", \"reasoning\": \"a) Four li...\n", " \n", " \n", "\n", - "

1683 rows × 21 columns

\n", + "

1683 rows × 20 columns

\n", "" ], "text/plain": [ @@ -1132,18 +1170,18 @@ "1681 b a) Ultraviolet light does not cause heat energ... \n", "1682 c a) Four limbs: This option is not correct beca... \n", "\n", - " mistral_reasoning \\\n", - "0 1. Start by reading the question carefully: \"C... \n", - "1 To solve this question, let's first understand... \n", - "2 1. Read the question and options carefully: Th... \n", - "3 To answer this question correctly, let's follo... \n", - "4 1. Read the question and options carefully: Th... \n", + " falcon_reasoning \\\n", + "0 (a) Protective shelter - \\nErroneous. Busses a... \n", + "1 (a) Nucleus of a cell: This option is incorrec... \n", + "2 (a) Sunlight: Sunlight is a form of energy tha... \n", + "3 1. **Plants (a) - Correct Answer:**\\n - Bact... \n", + "4 **Answer: (a) Almost everywhere**\\n\\n**Explana... \n", "... ... \n", - "1678 1. Start by reading the question and options c... \n", - "1679 (a) Watershred - This is not a scientific term... \n", - "1680 1. Read the question and options carefully: \"W... \n", - "1681 1. First, read the question and options carefu... \n", - "1682 (a) Communication: In this context, the questi... \n", + "1678 (a) Mining: Mining is the process of extractin... \n", + "1679 The correct answer is (d) Environment. \\n\\nNow... \n", + "1680 (a) Aerobic capacity: This refers to an animal... \n", + "1681 Let's examine each option and determine why so... \n", + "1682 (a) Four limbs: Having four limbs doesn't dire... \n", "\n", " answer_choices \\\n", "0 (a) Protective shelter (b) Transporting humans... \n", @@ -1158,179 +1196,179 @@ "1681 (a) Heat energy (b) Skin cancer (c) Killing in... \n", "1682 (a) Four limbs (b) Disease (c) Running (d) Bic... \n", "\n", - " user_prompt \\\n", - "0 Question: What are busses used for?\\nAnswer Ch... \n", - "1 Question: Which of the following does not cont... \n", - "2 Question: What uses electrical energy converte... \n", - "3 Question: Bacteria causes what to be harmed?\\n... 
\n", - "4 Question: Plants and snakes live _.?\\nAnswer C... \n", - "... ... \n", - "1678 Question: New resources required for creation ... \n", - "1679 Question: A drought dehydrates an entire what?... \n", - "1680 Question: An animal requires ingestion to do w... \n", - "1681 Question: Ultraviolet light can cause what?\\nA... \n", - "1682 Question: What can increase a body's strength?... \n", - "\n", " user_prompt_RFA \\\n", - "0 [INST] Answer the Question and include your... \n", - "1 [INST] Answer the Question and include your... \n", - "2 [INST] Answer the Question and include your... \n", - "3 [INST] Answer the Question and include your... \n", - "4 [INST] Answer the Question and include your... \n", + "0 [{'content': 'Answer the Question and include ... \n", + "1 [{'content': 'Answer the Question and include ... \n", + "2 [{'content': 'Answer the Question and include ... \n", + "3 [{'content': 'Answer the Question and include ... \n", + "4 [{'content': 'Answer the Question and include ... \n", "... ... \n", - "1678 [INST] Answer the Question and include your... \n", - "1679 [INST] Answer the Question and include your... \n", - "1680 [INST] Answer the Question and include your... \n", - "1681 [INST] Answer the Question and include your... \n", - "1682 [INST] Answer the Question and include your... \n", + "1678 [{'content': 'Answer the Question and include ... \n", + "1679 [{'content': 'Answer the Question and include ... \n", + "1680 [{'content': 'Answer the Question and include ... \n", + "1681 [{'content': 'Answer the Question and include ... \n", + "1682 [{'content': 'Answer the Question and include ... \n", "\n", " conversation_RFA_gpt3_5 \\\n", - "0 [INST] Answer the Question and include your... \n", - "1 [INST] Answer the Question and include your... \n", - "2 [INST] Answer the Question and include your... \n", - "3 [INST] Answer the Question and include your... \n", - "4 [INST] Answer the Question and include your... 
\n", + "0 [{'content': 'Answer the Question and include ... \n", + "1 [{'content': 'Answer the Question and include ... \n", + "2 [{'content': 'Answer the Question and include ... \n", + "3 [{'content': 'Answer the Question and include ... \n", + "4 [{'content': 'Answer the Question and include ... \n", "... ... \n", - "1678 [INST] Answer the Question and include your... \n", - "1679 [INST] Answer the Question and include your... \n", - "1680 [INST] Answer the Question and include your... \n", - "1681 [INST] Answer the Question and include your... \n", - "1682 [INST] Answer the Question and include your... \n", + "1678 [{'content': 'Answer the Question and include ... \n", + "1679 [{'content': 'Answer the Question and include ... \n", + "1680 [{'content': 'Answer the Question and include ... \n", + "1681 [{'content': 'Answer the Question and include ... \n", + "1682 [{'content': 'Answer the Question and include ... \n", "\n", - " conversation_RFA_mistral ... \\\n", - "0 [INST] Answer the Question and include your... ... \n", - "1 [INST] Answer the Question and include your... ... \n", - "2 [INST] Answer the Question and include your... ... \n", - "3 [INST] Answer the Question and include your... ... \n", - "4 [INST] Answer the Question and include your... ... \n", - "... ... ... \n", - "1678 [INST] Answer the Question and include your... ... \n", - "1679 [INST] Answer the Question and include your... ... \n", - "1680 [INST] Answer the Question and include your... ... \n", - "1681 [INST] Answer the Question and include your... ... \n", - "1682 [INST] Answer the Question and include your... ... \n", + " conversation_RFA_falcon \\\n", + "0 [{'content': 'Answer the Question and include ... \n", + "1 [{'content': 'Answer the Question and include ... \n", + "2 [{'content': 'Answer the Question and include ... \n", + "3 [{'content': 'Answer the Question and include ... \n", + "4 [{'content': 'Answer the Question and include ... \n", + "... ... 
\n", + "1678 [{'content': 'Answer the Question and include ... \n", + "1679 [{'content': 'Answer the Question and include ... \n", + "1680 [{'content': 'Answer the Question and include ... \n", + "1681 [{'content': 'Answer the Question and include ... \n", + "1682 [{'content': 'Answer the Question and include ... \n", + "\n", + " user_prompt_FAR \\\n", + "0 [{'content': 'Answer the Question and include ... \n", + "1 [{'content': 'Answer the Question and include ... \n", + "2 [{'content': 'Answer the Question and include ... \n", + "3 [{'content': 'Answer the Question and include ... \n", + "4 [{'content': 'Answer the Question and include ... \n", + "... ... \n", + "1678 [{'content': 'Answer the Question and include ... \n", + "1679 [{'content': 'Answer the Question and include ... \n", + "1680 [{'content': 'Answer the Question and include ... \n", + "1681 [{'content': 'Answer the Question and include ... \n", + "1682 [{'content': 'Answer the Question and include ... \n", "\n", " conversation_FAR_gpt3_5 \\\n", - "0 [INST] Answer the Question and include your... \n", - "1 [INST] Answer the Question and include your... \n", - "2 [INST] Answer the Question and include your... \n", - "3 [INST] Answer the Question and include your... \n", - "4 [INST] Answer the Question and include your... \n", + "0 [{'content': 'Answer the Question and include ... \n", + "1 [{'content': 'Answer the Question and include ... \n", + "2 [{'content': 'Answer the Question and include ... \n", + "3 [{'content': 'Answer the Question and include ... \n", + "4 [{'content': 'Answer the Question and include ... \n", "... ... \n", - "1678 [INST] Answer the Question and include your... \n", - "1679 [INST] Answer the Question and include your... \n", - "1680 [INST] Answer the Question and include your... \n", - "1681 [INST] Answer the Question and include your... \n", - "1682 [INST] Answer the Question and include your... \n", + "1678 [{'content': 'Answer the Question and include ... 
\n", + "1679 [{'content': 'Answer the Question and include ... \n", + "1680 [{'content': 'Answer the Question and include ... \n", + "1681 [{'content': 'Answer the Question and include ... \n", + "1682 [{'content': 'Answer the Question and include ... \n", "\n", - " conversation_FAR_mistral \\\n", - "0 [INST] Answer the Question and include your... \n", - "1 [INST] Answer the Question and include your... \n", - "2 [INST] Answer the Question and include your... \n", - "3 [INST] Answer the Question and include your... \n", - "4 [INST] Answer the Question and include your... \n", + " conversation_FAR_falcon \\\n", + "0 [{'content': 'Answer the Question and include ... \n", + "1 [{'content': 'Answer the Question and include ... \n", + "2 [{'content': 'Answer the Question and include ... \n", + "3 [{'content': 'Answer the Question and include ... \n", + "4 [{'content': 'Answer the Question and include ... \n", "... ... \n", - "1678 [INST] Answer the Question and include your... \n", - "1679 [INST] Answer the Question and include your... \n", - "1680 [INST] Answer the Question and include your... \n", - "1681 [INST] Answer the Question and include your... \n", - "1682 [INST] Answer the Question and include your... \n", + "1678 [{'content': 'Answer the Question and include ... \n", + "1679 [{'content': 'Answer the Question and include ... \n", + "1680 [{'content': 'Answer the Question and include ... \n", + "1681 [{'content': 'Answer the Question and include ... \n", + "1682 [{'content': 'Answer the Question and include ... \n", "\n", " user_prompt_FA \\\n", - "0 [INST] Answer the Question and include your... \n", - "1 [INST] Answer the Question and include your... \n", - "2 [INST] Answer the Question and include your... \n", - "3 [INST] Answer the Question and include your... \n", - "4 [INST] Answer the Question and include your... \n", + "0 [{'content': 'Answer the Question and include ... \n", + "1 [{'content': 'Answer the Question and include ... 
\n", + "2 [{'content': 'Answer the Question and include ... \n", + "3 [{'content': 'Answer the Question and include ... \n", + "4 [{'content': 'Answer the Question and include ... \n", "... ... \n", - "1678 [INST] Answer the Question and include your... \n", - "1679 [INST] Answer the Question and include your... \n", - "1680 [INST] Answer the Question and include your... \n", - "1681 [INST] Answer the Question and include your... \n", - "1682 [INST] Answer the Question and include your... \n", + "1678 [{'content': 'Answer the Question and include ... \n", + "1679 [{'content': 'Answer the Question and include ... \n", + "1680 [{'content': 'Answer the Question and include ... \n", + "1681 [{'content': 'Answer the Question and include ... \n", + "1682 [{'content': 'Answer the Question and include ... \n", "\n", " conversation_FA \\\n", - "0 [INST] Answer the Question and include your... \n", - "1 [INST] Answer the Question and include your... \n", - "2 [INST] Answer the Question and include your... \n", - "3 [INST] Answer the Question and include your... \n", - "4 [INST] Answer the Question and include your... \n", + "0 [{'content': 'Answer the Question and include ... \n", + "1 [{'content': 'Answer the Question and include ... \n", + "2 [{'content': 'Answer the Question and include ... \n", + "3 [{'content': 'Answer the Question and include ... \n", + "4 [{'content': 'Answer the Question and include ... \n", "... ... \n", - "1678 [INST] Answer the Question and include your... \n", - "1679 [INST] Answer the Question and include your... \n", - "1680 [INST] Answer the Question and include your... \n", - "1681 [INST] Answer the Question and include your... \n", - "1682 [INST] Answer the Question and include your... \n", + "1678 [{'content': 'Answer the Question and include ... \n", + "1679 [{'content': 'Answer the Question and include ... \n", + "1680 [{'content': 'Answer the Question and include ... \n", + "1681 [{'content': 'Answer the Question and include ... 
\n", + "1682 [{'content': 'Answer the Question and include ... \n", "\n", - " responses_RFA_mistral \\\n", - "0 {\"Reasoning\": \"Busses are primarily used for t... \n", - "1 {\"Reasoning\": \"Global warming is primarily cau... \n", - "2 {\"Reasoning\": \"The question asks for an organi... \n", - "3 {\"Reasoning\": \"The question asks about what is... \n", - "4 {\"Reasoning\": \"The question asks for the livin... \n", + " responses_RFA_falcon \\\n", + "0 {\"reasoning\": \"Busses are primarily used for t... \n", + "1 {\"reasoning\": \"Nucleus of a cell (a) does not ... \n", + "2 {\"reasoning\": \"Cells convert chemical energy s... \n", + "3 {\"reasoning\": \"Bacteria can cause harm to vari... \n", + "4 {\"reasoning\": \"Plants and snakes, as different... \n", "... ... \n", - "1678 {\"Reasoning\": \"The question asks for a way to ... \n", - "1679 {\"Reasoning\": \"The question asks for a term th... \n", - "1680 {\"Reasoning\": \"Ingestion is the process of tak... \n", - "1681 {\"Reasoning\": \"Ultraviolet (UV) light is a typ... \n", - "1682 {\"Reasoning\": \"The correct answer is 'Exercise... \n", + "1678 {\"reasoning\": \"Recycling (g) is the option tha... \n", + "1679 {\"reasoning\": \"A drought is a period of abnorm... \n", + "1680 {\"reasoning\": \"Ingestion is the process of tak... \n", + "1681 {\"reasoning\": \"Ultraviolet light is known to h... \n", + "1682 {\"reasoning\": \"Among the choices provided, run... \n", "\n", - " responses_FAR_mistral \\\n", - "0 {\"Final Answer\": \"b\", \"Reasoning\": \"Busses are... \n", - "1 {\"Final Answer\": \"g\", \"Reasoning\": \"Riding a b... \n", - "2 {\"Final Answer\": \"b\", \"Reasoning\": \"Cameras ar... \n", - "3 {\"Final Answer\": \"d\", \"Reasoning\": \"Bacteria c... \n", - "4 {\"Final Answer\": \"a\", \"Reasoning\": \"Plants and... \n", + " responses_FAR_falcon \\\n", + "0 {\"final_answer\": \"b\", \"reasoning\": \"Busses are... \n", + "1 {\"final_answer\": \"a\", \"reasoning\": \"The nucleu... 
\n", + "2 {\"final_answer\": \"c\", \"reasoning\": \"Cells, spe... \n", + "3 {\"final_answer\": \"d\", \"reasoning\": \"Bacteria c... \n", + "4 {\"final_answer\": \"g\", \"reasoning\": \"The correc... \n", "... ... \n", - "1678 {\"Final Answer\": \"g\", \"Reasoning\": \"Recycling ... \n", - "1679 {\"Final Answer\": \"d\", \"Reasoning\": \"A drought ... \n", - "1680 {\"Final Answer\": \"e\", \"Reasoning\": \"Ingestion ... \n", - "1681 {\"Final Answer\": \"b\", \"Reasoning\": \"Ultraviole... \n", - "1682 {\"Final Answer\": \"c\", \"Reasoning\": \"Running is... \n", + "1678 {\"final_answer\": \"g\", \"reasoning\": \"Recycling ... \n", + "1679 {\"final_answer\": \"d\", \"reasoning\": \"A drought ... \n", + "1680 {\"final_answer\": \"e\", \"reasoning\": \"Ingestion ... \n", + "1681 {\"final_answer\": \"b\", \"reasoning\": \"Ultraviole... \n", + "1682 {\"final_answer\": \"c\", \"reasoning\": \"Running is... \n", "\n", " responses_RFA_gpt3_5 \\\n", - "0 {\"Reasoning\": \"Busses are primarily used for t... \n", - "1 {\"Reasoning\": \"The nucleus of a cell (option a... \n", - "2 {\"Reasoning\": \"The correct answer is (c) Cells... \n", - "3 {\"Reasoning\": \"Bacteria are microorganisms tha... \n", - "4 {\"Reasoning\": \"The question asks about the liv... \n", + "0 {\"reasoning\": \"Busses are vehicles used primar... \n", + "1 {\"reasoning\": \"The question asks which of the ... \n", + "2 {\"reasoning\": \"Cells use electrical energy for... \n", + "3 {\"reasoning\": \"Bacteria can harm various livin... \n", + "4 {\"reasoning\": \"Plants and snakes are both comm... \n", "... ... \n", - "1678 {\"Reasoning\": \"Mining, fossil fuels, deforesta... \n", - "1679 {\"Reasoning\": \"A drought is a prolonged period... \n", - "1680 {\"Reasoning\": \"Ingestion is the process of tak... \n", - "1681 {\"Reasoning\": \"Ultraviolet (UV) light is a typ... \n", - "1682 {\"Reasoning\": \"A body's strength is primarily ... \n", + "1678 {\"reasoning\": \"New resources required for crea... 
\n", + "1679 {\"reasoning\": \"Drought is a long-term lack of ... \n", + "1680 {\"reasoning\": \"Ingestion is the process by whi... \n", + "1681 {\"reasoning\": \"Ultraviolet (UV) light is a typ... \n", + "1682 {\"reasoning\": \"Running involves physical activ... \n", "\n", - " responses_FAR_gpt3_5 \\\n", - "0 {\"Final Answer\": \"b\", \"Reasoning\": \"Busses are... \n", - "1 { \"Final Answer\": \"a\" ,\"Reasoning\": \"The nucle... \n", - "2 {\"Final Answer\": \"c\", \"Reasoning\": \"Cells are ... \n", - "3 {\"Final Answer\": \"d\", \"Reasoning\": \"Bacteria c... \n", - "4 { \"Final Answer\": \"a\" ,\"Reasoning\": \"Plants an... \n", - "... ... \n", - "1678 { \"Final Answer\": \"g\" ,\"Reasoning\": \"Recycling... \n", - "1679 { \"Final Answer\": \"d\" ,\"Reasoning\": \"A drought... \n", - "1680 {\"Final Answer\": \"e\", \"Reasoning\": \"Ingestion ... \n", - "1681 { \"Final Answer\": \"b\" ,\"Reasoning\": \"Ultraviol... \n", - "1682 {\"Final Answer\": \"c\", \"Reasoning\": \"Running is... \n", + " responses_base responses_FA \\\n", + "0 {\"final_answer\": \"b\"} {\"final_answer\": \"b\"} \n", + "1 {\"final_answer\": \"g\"} {\"final_answer\": \"g\"} \n", + "2 {\"final_answer\": \"c\"} {\"final_answer\": \"f\"} \n", + "3 {\"final_answer\": \"d\"} {\"final_answer\": \"d\"} \n", + "4 {\"final_answer\": \"g\"} {\"final_answer\": \"a\"} \n", + "... ... ... 
\n", + "1678 {\"final_answer\": \"g\"} {\"final_answer\": \"g\"} \n", + "1679 {\"final_answer\": \"d\"} {\"final_answer\": \"d\"} \n", + "1680 {\"final_answer\": \"e\"} {\"final_answer\": \"c\"} \n", + "1681 {\"final_answer\": \"b\"} {\"final_answer\": \"b\"} \n", + "1682 {\"final_answer\": \"c\"} {\"final_answer\": \"f\"} \n", "\n", - " responses_FA responses_base \n", - "0 { \"Final Answer\": \"b\" } {\"Final Answer\": \"b\"} \n", - "1 {\"Final Answer\": \"a\"} {\"Final Answer\": \"a\"} \n", - "2 {\"Final Answer\": \"b\"} {\"Final Answer\": \"c\"} \n", - "3 {\"Final Answer\": \"e\"} {\"Final Answer\": \"d\"} \n", - "4 {\"Final Answer\": \"a\"} {\"Final Answer\": \"g\"} \n", - "... ... ... \n", - "1678 {\"Final Answer\": \"g\"} {\"Final Answer\": \"g\"} \n", - "1679 {\"Final Answer\": \"a\"} {\"Final Answer\": \"d\"} \n", - "1680 {\"Final Answer\": \"e\"} {\"Final Answer\": \"d\"} \n", - "1681 {\"Final Answer\": \"b\"} {\"Final Answer\": \"b\"} \n", - "1682 {\"Final Answer\": \"f\"} {\"Final Answer\": \"c\"} \n", + " responses_FAR_gpt3_5 \n", + "0 {\"final_answer\": \"b\", \"reasoning\": \"Busses are... \n", + "1 {\"final_answer\": \"d\", \"reasoning\": \"The questi... \n", + "2 {\"final_answer\": \"f\", \"reasoning\": \"Cars use e... \n", + "3 {\"final_answer\": \"d\", \"reasoning\": \"The questi... \n", + "4 {\"final_answer\": \"f\", \"reasoning\": \"Plants and... \n", + "... ... \n", + "1678 {\"final_answer\": \"g\", \"reasoning\": \"The correc... \n", + "1679 {\"final_answer\": \"d\", \"reasoning\": \"A drought ... \n", + "1680 {\"final_answer\": \"e\", \"reasoning\": \"Animals re... \n", + "1681 {\"final_answer\": \"b\", \"reasoning\": \"Ultraviole... \n", + "1682 {\"final_answer\": \"c\", \"reasoning\": \"a) Four li... 
\n", "\n", - "[1683 rows x 21 columns]" + "[1683 rows x 20 columns]" ] }, - "execution_count": 21, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1341,14 +1379,14 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 21, "id": "8619f9f5-9fe4-433e-b524-51c2b12e8d12", "metadata": {}, "outputs": [], "source": [ "def extract_final_answer(response):\n", " try:\n", - " answer = json.loads(response).get(\"Final Answer\")\n", + " answer = json.loads(response).get(\"final_answer\")\n", " except:\n", " answer = 'x'\n", " return answer\n", @@ -1356,15 +1394,51 @@ "# Create new columns for predictions\n", "df['predictions_base'] = df['responses_base'].apply(extract_final_answer)\n", "df['predictions_FA'] = df['responses_FA'].apply(extract_final_answer)\n", - "df['predictions_RFA_mistral'] = df['responses_RFA_mistral'].apply(extract_final_answer)\n", - "df['predictions_FAR_mistral'] = df['responses_FAR_mistral'].apply(extract_final_answer)\n", + "df['predictions_RFA_falcon'] = df['responses_RFA_falcon'].apply(extract_final_answer)\n", + "df['predictions_FAR_falcon'] = df['responses_FAR_falcon'].apply(extract_final_answer)\n", "df['predictions_RFA_gpt3_5'] = df['responses_RFA_gpt3_5'].apply(extract_final_answer)\n", "df['predictions_FAR_gpt3_5'] = df['responses_FAR_gpt3_5'].apply(extract_final_answer)\n" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 22, + "id": "271b402f-6696-4a9c-94ba-a6071c0c2252", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'predictions_base': 0.0,\n", + " 'predictions_FA': 0.0,\n", + " 'predictions_RFA_falcon': 0.0,\n", + " 'predictions_FAR_falcon': 0.0,\n", + " 'predictions_RFA_gpt3_5': 0.0,\n", + " 'predictions_FAR_gpt3_5': 0.0}" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prediction_cols = ['predictions_base',\n", + " 'predictions_FA',\n", + " 'predictions_RFA_falcon',\n", + " 
'predictions_FAR_falcon',\n", + " 'predictions_RFA_gpt3_5',\n", + " 'predictions_FAR_gpt3_5']\n", + "percentages = {\n", + " col: (df[col] == 'x').mean() * 100\n", + " for col in prediction_cols\n", + "}\n", + "percentages" + ] + }, + { + "cell_type": "code", + "execution_count": 25, "id": "938cf2a3-2fed-42a3-82ec-a56cb0ea9f37", "metadata": {}, "outputs": [ @@ -1372,12 +1446,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "Base: \t\t\t\t\t\t45.22%\n", - "Final Answer: \t\t\t\t\t64.53%\n", - "Reasoning and then the Final Answer (Mistral): \t55.02%\n", - "Final Answer and then the Reasoning (Mistral): \t61.79%\n", - "Reasoning and then the Final Answer (GPT-3.5): \t57.28%\n", - "Final Answer and then the Reasoning (GPT-3.5): \t61.62%\n" + "Base: \t\t\t\t\t\t55.02%\n", + "Final Answer: \t\t\t\t\t53.71%\n", + "Reasoning and then the Final Answer (Falcon): \t56.98%\n", + "Final Answer and then the Reasoning (Falcon): \t54.37%\n", + "Reasoning and then the Final Answer (GPT-3.5): \t57.52%\n", + "Final Answer and then the Reasoning (GPT-3.5): \t56.21%\n" ] } ], @@ -1386,15 +1460,15 @@ "\n", "print(f\"Base: \\t\\t\\t\\t\\t\\t{round(accuracy_score(y_true=df['answer_key'], y_pred=df['predictions_base']) * 100, 2)}%\")\n", "print(f\"Final Answer: \\t\\t\\t\\t\\t{round(accuracy_score(y_true=df['answer_key'], y_pred=df['predictions_FA']) * 100, 2)}%\")\n", - "print(f\"Reasoning and then the Final Answer (Mistral): \\t{round(accuracy_score(y_true=df['answer_key'], y_pred=df['predictions_RFA_mistral']) * 100, 2)}%\")\n", - "print(f\"Final Answer and then the Reasoning (Mistral): \\t{round(accuracy_score(y_true=df['answer_key'], y_pred=df['predictions_FAR_mistral']) * 100, 2)}%\")\n", + "print(f\"Reasoning and then the Final Answer (Falcon): \\t{round(accuracy_score(y_true=df['answer_key'], y_pred=df['predictions_RFA_falcon']) * 100, 2)}%\")\n", + "print(f\"Final Answer and then the Reasoning (Falcon): \\t{round(accuracy_score(y_true=df['answer_key'], 
y_pred=df['predictions_FAR_falcon']) * 100, 2)}%\")\n", "print(f\"Reasoning and then the Final Answer (GPT-3.5): \\t{round(accuracy_score(y_true=df['answer_key'], y_pred=df['predictions_RFA_gpt3_5']) * 100, 2)}%\")\n", "print(f\"Final Answer and then the Reasoning (GPT-3.5): \\t{round(accuracy_score(y_true=df['answer_key'], y_pred=df['predictions_FAR_gpt3_5']) * 100, 2)}%\")" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 26, "id": "83aae472-513b-43c3-9ee8-64d4cda775e0", "metadata": {}, "outputs": [ @@ -1423,23 +1497,23 @@ " question_text\n", " answer_key\n", " gpt3_5_reasoning\n", - " mistral_reasoning\n", + " falcon_reasoning\n", " answer_choices\n", - " user_prompt\n", " user_prompt_RFA\n", " conversation_RFA_gpt3_5\n", - " conversation_RFA_mistral\n", + " conversation_RFA_falcon\n", + " user_prompt_FAR\n", " ...\n", " responses_RFA_gpt3_5\n", - " responses_FAR_gpt3_5\n", - " responses_FA\n", " responses_base\n", + " responses_FA\n", + " responses_FAR_gpt3_5\n", " predictions_base\n", " predictions_FA\n", - " predictions_FAR_mistral\n", + " predictions_RFA_falcon\n", + " predictions_FAR_falcon\n", " predictions_RFA_gpt3_5\n", " predictions_FAR_gpt3_5\n", - " predictions_RFA_mistral\n", " \n", " \n", " \n", @@ -1449,17 +1523,17 @@ " What are busses used for?\n", " b\n", " a) Protective shelter: This option is incorrec...\n", - " 1. Start by reading the question carefully: \"C...\n", + " (a) Protective shelter - \\nErroneous. 
Busses a...\n", " (a) Protective shelter (b) Transporting humans...\n", - " Question: What are busses used for?\\nAnswer Ch...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", " ...\n", - " {\"Reasoning\": \"Busses are primarily used for t...\n", - " {\"Final Answer\": \"b\", \"Reasoning\": \"Busses are...\n", - " { \"Final Answer\": \"b\" }\n", - " {\"Final Answer\": \"b\"}\n", + " {\"reasoning\": \"Busses are vehicles used primar...\n", + " {\"final_answer\": \"b\"}\n", + " {\"final_answer\": \"b\"}\n", + " {\"final_answer\": \"b\", \"reasoning\": \"Busses are...\n", " b\n", " b\n", " b\n", @@ -1473,23 +1547,23 @@ " Which of the following does not contribute to ...\n", " g\n", " a) Nucleus of a cell: This option is not relat...\n", - " To solve this question, let's first understand...\n", + " (a) Nucleus of a cell: This option is incorrec...\n", " (a) Nucleus of a cell (b) Flying in a plane (c...\n", - " Question: Which of the following does not cont...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", " ...\n", - " {\"Reasoning\": \"The nucleus of a cell (option a...\n", - " { \"Final Answer\": \"a\" ,\"Reasoning\": \"The nucle...\n", - " {\"Final Answer\": \"a\"}\n", - " {\"Final Answer\": \"a\"}\n", - " a\n", - " a\n", + " {\"reasoning\": \"The question 
asks which of the ...\n", + " {\"final_answer\": \"g\"}\n", + " {\"final_answer\": \"g\"}\n", + " {\"final_answer\": \"d\", \"reasoning\": \"The questi...\n", + " g\n", " g\n", " a\n", " a\n", " g\n", + " d\n", " \n", " \n", " 2\n", @@ -1497,23 +1571,23 @@ " What uses electrical energy converted from che...\n", " b\n", " a) Sunlight: Sunlight is a form of energy that...\n", - " 1. Read the question and options carefully: Th...\n", + " (a) Sunlight: Sunlight is a form of energy tha...\n", " (a) Sunlight (b) Cameras (c) Cells (d) Buses (...\n", - " Question: What uses electrical energy converte...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", " ...\n", - " {\"Reasoning\": \"The correct answer is (c) Cells...\n", - " {\"Final Answer\": \"c\", \"Reasoning\": \"Cells are ...\n", - " {\"Final Answer\": \"b\"}\n", - " {\"Final Answer\": \"c\"}\n", + " {\"reasoning\": \"Cells use electrical energy for...\n", + " {\"final_answer\": \"c\"}\n", + " {\"final_answer\": \"f\"}\n", + " {\"final_answer\": \"f\", \"reasoning\": \"Cars use e...\n", " c\n", - " b\n", - " b\n", + " f\n", " c\n", " c\n", - " e\n", + " c\n", + " f\n", " \n", " \n", " 3\n", @@ -1521,23 +1595,23 @@ " Bacteria causes what to be harmed?\n", " a\n", " Now, let's go through each option and explain ...\n", - " To answer this question correctly, let's follo...\n", + " 1. 
**Plants (a) - Correct Answer:**\\n - Bact...\n", " (a) Plants (b) Electronics (c) Fossils (d) Hum...\n", - " Question: Bacteria causes what to be harmed?\\n...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", " ...\n", - " {\"Reasoning\": \"Bacteria are microorganisms tha...\n", - " {\"Final Answer\": \"d\", \"Reasoning\": \"Bacteria c...\n", - " {\"Final Answer\": \"e\"}\n", - " {\"Final Answer\": \"d\"}\n", + " {\"reasoning\": \"Bacteria can harm various livin...\n", + " {\"final_answer\": \"d\"}\n", + " {\"final_answer\": \"d\"}\n", + " {\"final_answer\": \"d\", \"reasoning\": \"The questi...\n", + " d\n", + " d\n", " d\n", - " e\n", " d\n", " d\n", " d\n", - " e\n", " \n", " \n", " 4\n", @@ -1545,23 +1619,23 @@ " Plants and snakes live _.?\n", " a\n", " b) Important habitats: This option is incorrec...\n", - " 1. 
Read the question and options carefully: Th...\n", + " **Answer: (a) Almost everywhere**\\n\\n**Explana...\n", " (a) Almost everywhere (b) Important habitats (...\n", - " Question: Plants and snakes live _.?\\nAnswer C...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", " ...\n", - " {\"Reasoning\": \"The question asks about the liv...\n", - " { \"Final Answer\": \"a\" ,\"Reasoning\": \"Plants an...\n", - " {\"Final Answer\": \"a\"}\n", - " {\"Final Answer\": \"g\"}\n", + " {\"reasoning\": \"Plants and snakes are both comm...\n", + " {\"final_answer\": \"g\"}\n", + " {\"final_answer\": \"a\"}\n", + " {\"final_answer\": \"f\", \"reasoning\": \"Plants and...\n", " g\n", " a\n", " a\n", - " f\n", + " g\n", " a\n", - " b\n", + " f\n", " \n", " \n", " ...\n", @@ -1593,17 +1667,17 @@ " New resources required for creation can be red...\n", " g\n", " a) Mining: Mining involves extracting minerals...\n", - " 1. 
Start by reading the question and options c...\n", + " (a) Mining: Mining is the process of extractin...\n", " (a) Mining (b) Mutations (c) Fossil fuels (d) ...\n", - " Question: New resources required for creation ...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", " ...\n", - " {\"Reasoning\": \"Mining, fossil fuels, deforesta...\n", - " { \"Final Answer\": \"g\" ,\"Reasoning\": \"Recycling...\n", - " {\"Final Answer\": \"g\"}\n", - " {\"Final Answer\": \"g\"}\n", + " {\"reasoning\": \"New resources required for crea...\n", + " {\"final_answer\": \"g\"}\n", + " {\"final_answer\": \"g\"}\n", + " {\"final_answer\": \"g\", \"reasoning\": \"The correc...\n", " g\n", " g\n", " g\n", @@ -1617,19 +1691,19 @@ " A drought dehydrates an entire what?\n", " d\n", " a) Body water: This option is incorrect becaus...\n", - " (a) Watershred - This is not a scientific term...\n", + " The correct answer is (d) Environment. 
\\n\\nNow...\n", " (a) Body water (b) Dried fruit (c) Bodily wate...\n", - " Question: A drought dehydrates an entire what?...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", " ...\n", - " {\"Reasoning\": \"A drought is a prolonged period...\n", - " { \"Final Answer\": \"d\" ,\"Reasoning\": \"A drought...\n", - " {\"Final Answer\": \"a\"}\n", - " {\"Final Answer\": \"d\"}\n", + " {\"reasoning\": \"Drought is a long-term lack of ...\n", + " {\"final_answer\": \"d\"}\n", + " {\"final_answer\": \"d\"}\n", + " {\"final_answer\": \"d\", \"reasoning\": \"A drought ...\n", + " d\n", " d\n", - " a\n", " d\n", " d\n", " d\n", @@ -1641,23 +1715,23 @@ " An animal requires ingestion to do what?\n", " e\n", " a) Aerobic capacity: This option is not logica...\n", - " 1. 
Read the question and options carefully: \"W...\n", + " (a) Aerobic capacity: This refers to an animal...\n", " (a) Aerobic capacity (b) Die (c) Water conserv...\n", - " Question: An animal requires ingestion to do w...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", " ...\n", - " {\"Reasoning\": \"Ingestion is the process of tak...\n", - " {\"Final Answer\": \"e\", \"Reasoning\": \"Ingestion ...\n", - " {\"Final Answer\": \"e\"}\n", - " {\"Final Answer\": \"d\"}\n", - " d\n", + " {\"reasoning\": \"Ingestion is the process by whi...\n", + " {\"final_answer\": \"e\"}\n", + " {\"final_answer\": \"c\"}\n", + " {\"final_answer\": \"e\", \"reasoning\": \"Animals re...\n", + " e\n", + " c\n", " e\n", " e\n", " e\n", " e\n", - " c\n", " \n", " \n", " 1681\n", @@ -1665,22 +1739,22 @@ " Ultraviolet light can cause what?\n", " b\n", " a) Ultraviolet light does not cause heat energ...\n", - " 1. 
First, read the question and options carefu...\n", + " Let's examine each option and determine why so...\n", " (a) Heat energy (b) Skin cancer (c) Killing in...\n", - " Question: Ultraviolet light can cause what?\\nA...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", " ...\n", - " {\"Reasoning\": \"Ultraviolet (UV) light is a typ...\n", - " { \"Final Answer\": \"b\" ,\"Reasoning\": \"Ultraviol...\n", - " {\"Final Answer\": \"b\"}\n", - " {\"Final Answer\": \"b\"}\n", - " b\n", - " b\n", + " {\"reasoning\": \"Ultraviolet (UV) light is a typ...\n", + " {\"final_answer\": \"b\"}\n", + " {\"final_answer\": \"b\"}\n", + " {\"final_answer\": \"b\", \"reasoning\": \"Ultraviole...\n", " b\n", " b\n", + " g\n", " b\n", + " f\n", " b\n", " \n", " \n", @@ -1689,27 +1763,27 @@ " What can increase a body's strength?\n", " c\n", " a) Four limbs: This option is not correct beca...\n", - " (a) Communication: In this context, the questi...\n", + " (a) Four limbs: Having four limbs doesn't dire...\n", " (a) Four limbs (b) Disease (c) Running (d) Bic...\n", - " Question: What can increase a body's strength?...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", - " <s>[INST] Answer the Question and include your...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", + " [{'content': 'Answer the Question and include ...\n", " ...\n", - " {\"Reasoning\": \"A body's strength is primarily ...\n", - " {\"Final Answer\": \"c\", \"Reasoning\": 
\"Running is...\n", - " {\"Final Answer\": \"f\"}\n", - " {\"Final Answer\": \"c\"}\n", + " {\"reasoning\": \"Running involves physical activ...\n", + " {\"final_answer\": \"c\"}\n", + " {\"final_answer\": \"f\"}\n", + " {\"final_answer\": \"c\", \"reasoning\": \"a) Four li...\n", " c\n", " f\n", " c\n", - " d\n", + " c\n", " c\n", " c\n", " \n", " \n", "\n", - "

1683 rows × 27 columns

\n", + "

1683 rows × 26 columns

\n", "" ], "text/plain": [ @@ -1739,18 +1813,18 @@ "1681 b a) Ultraviolet light does not cause heat energ... \n", "1682 c a) Four limbs: This option is not correct beca... \n", "\n", - " mistral_reasoning \\\n", - "0 1. Start by reading the question carefully: \"C... \n", - "1 To solve this question, let's first understand... \n", - "2 1. Read the question and options carefully: Th... \n", - "3 To answer this question correctly, let's follo... \n", - "4 1. Read the question and options carefully: Th... \n", + " falcon_reasoning \\\n", + "0 (a) Protective shelter - \\nErroneous. Busses a... \n", + "1 (a) Nucleus of a cell: This option is incorrec... \n", + "2 (a) Sunlight: Sunlight is a form of energy tha... \n", + "3 1. **Plants (a) - Correct Answer:**\\n - Bact... \n", + "4 **Answer: (a) Almost everywhere**\\n\\n**Explana... \n", "... ... \n", - "1678 1. Start by reading the question and options c... \n", - "1679 (a) Watershred - This is not a scientific term... \n", - "1680 1. Read the question and options carefully: \"W... \n", - "1681 1. First, read the question and options carefu... \n", - "1682 (a) Communication: In this context, the questi... \n", + "1678 (a) Mining: Mining is the process of extractin... \n", + "1679 The correct answer is (d) Environment. \\n\\nNow... \n", + "1680 (a) Aerobic capacity: This refers to an animal... \n", + "1681 Let's examine each option and determine why so... \n", + "1682 (a) Four limbs: Having four limbs doesn't dire... \n", "\n", " answer_choices \\\n", "0 (a) Protective shelter (b) Transporting humans... \n", @@ -1765,127 +1839,127 @@ "1681 (a) Heat energy (b) Skin cancer (c) Killing in... \n", "1682 (a) Four limbs (b) Disease (c) Running (d) Bic... \n", "\n", - " user_prompt \\\n", - "0 Question: What are busses used for?\\nAnswer Ch... \n", - "1 Question: Which of the following does not cont... \n", - "2 Question: What uses electrical energy converte... \n", - "3 Question: Bacteria causes what to be harmed?\\n... 
\n", - "4 Question: Plants and snakes live _.?\\nAnswer C... \n", - "... ... \n", - "1678 Question: New resources required for creation ... \n", - "1679 Question: A drought dehydrates an entire what?... \n", - "1680 Question: An animal requires ingestion to do w... \n", - "1681 Question: Ultraviolet light can cause what?\\nA... \n", - "1682 Question: What can increase a body's strength?... \n", - "\n", " user_prompt_RFA \\\n", - "0 [INST] Answer the Question and include your... \n", - "1 [INST] Answer the Question and include your... \n", - "2 [INST] Answer the Question and include your... \n", - "3 [INST] Answer the Question and include your... \n", - "4 [INST] Answer the Question and include your... \n", + "0 [{'content': 'Answer the Question and include ... \n", + "1 [{'content': 'Answer the Question and include ... \n", + "2 [{'content': 'Answer the Question and include ... \n", + "3 [{'content': 'Answer the Question and include ... \n", + "4 [{'content': 'Answer the Question and include ... \n", "... ... \n", - "1678 [INST] Answer the Question and include your... \n", - "1679 [INST] Answer the Question and include your... \n", - "1680 [INST] Answer the Question and include your... \n", - "1681 [INST] Answer the Question and include your... \n", - "1682 [INST] Answer the Question and include your... \n", + "1678 [{'content': 'Answer the Question and include ... \n", + "1679 [{'content': 'Answer the Question and include ... \n", + "1680 [{'content': 'Answer the Question and include ... \n", + "1681 [{'content': 'Answer the Question and include ... \n", + "1682 [{'content': 'Answer the Question and include ... \n", "\n", " conversation_RFA_gpt3_5 \\\n", - "0 [INST] Answer the Question and include your... \n", - "1 [INST] Answer the Question and include your... \n", - "2 [INST] Answer the Question and include your... \n", - "3 [INST] Answer the Question and include your... \n", - "4 [INST] Answer the Question and include your... 
\n", + "0 [{'content': 'Answer the Question and include ... \n", + "1 [{'content': 'Answer the Question and include ... \n", + "2 [{'content': 'Answer the Question and include ... \n", + "3 [{'content': 'Answer the Question and include ... \n", + "4 [{'content': 'Answer the Question and include ... \n", "... ... \n", - "1678 [INST] Answer the Question and include your... \n", - "1679 [INST] Answer the Question and include your... \n", - "1680 [INST] Answer the Question and include your... \n", - "1681 [INST] Answer the Question and include your... \n", - "1682 [INST] Answer the Question and include your... \n", + "1678 [{'content': 'Answer the Question and include ... \n", + "1679 [{'content': 'Answer the Question and include ... \n", + "1680 [{'content': 'Answer the Question and include ... \n", + "1681 [{'content': 'Answer the Question and include ... \n", + "1682 [{'content': 'Answer the Question and include ... \n", "\n", - " conversation_RFA_mistral ... \\\n", - "0 [INST] Answer the Question and include your... ... \n", - "1 [INST] Answer the Question and include your... ... \n", - "2 [INST] Answer the Question and include your... ... \n", - "3 [INST] Answer the Question and include your... ... \n", - "4 [INST] Answer the Question and include your... ... \n", + " conversation_RFA_falcon \\\n", + "0 [{'content': 'Answer the Question and include ... \n", + "1 [{'content': 'Answer the Question and include ... \n", + "2 [{'content': 'Answer the Question and include ... \n", + "3 [{'content': 'Answer the Question and include ... \n", + "4 [{'content': 'Answer the Question and include ... \n", + "... ... \n", + "1678 [{'content': 'Answer the Question and include ... \n", + "1679 [{'content': 'Answer the Question and include ... \n", + "1680 [{'content': 'Answer the Question and include ... \n", + "1681 [{'content': 'Answer the Question and include ... \n", + "1682 [{'content': 'Answer the Question and include ... \n", + "\n", + " user_prompt_FAR ... 
\\\n", + "0 [{'content': 'Answer the Question and include ... ... \n", + "1 [{'content': 'Answer the Question and include ... ... \n", + "2 [{'content': 'Answer the Question and include ... ... \n", + "3 [{'content': 'Answer the Question and include ... ... \n", + "4 [{'content': 'Answer the Question and include ... ... \n", "... ... ... \n", - "1678 [INST] Answer the Question and include your... ... \n", - "1679 [INST] Answer the Question and include your... ... \n", - "1680 [INST] Answer the Question and include your... ... \n", - "1681 [INST] Answer the Question and include your... ... \n", - "1682 [INST] Answer the Question and include your... ... \n", + "1678 [{'content': 'Answer the Question and include ... ... \n", + "1679 [{'content': 'Answer the Question and include ... ... \n", + "1680 [{'content': 'Answer the Question and include ... ... \n", + "1681 [{'content': 'Answer the Question and include ... ... \n", + "1682 [{'content': 'Answer the Question and include ... ... \n", "\n", " responses_RFA_gpt3_5 \\\n", - "0 {\"Reasoning\": \"Busses are primarily used for t... \n", - "1 {\"Reasoning\": \"The nucleus of a cell (option a... \n", - "2 {\"Reasoning\": \"The correct answer is (c) Cells... \n", - "3 {\"Reasoning\": \"Bacteria are microorganisms tha... \n", - "4 {\"Reasoning\": \"The question asks about the liv... \n", + "0 {\"reasoning\": \"Busses are vehicles used primar... \n", + "1 {\"reasoning\": \"The question asks which of the ... \n", + "2 {\"reasoning\": \"Cells use electrical energy for... \n", + "3 {\"reasoning\": \"Bacteria can harm various livin... \n", + "4 {\"reasoning\": \"Plants and snakes are both comm... \n", "... ... \n", - "1678 {\"Reasoning\": \"Mining, fossil fuels, deforesta... \n", - "1679 {\"Reasoning\": \"A drought is a prolonged period... \n", - "1680 {\"Reasoning\": \"Ingestion is the process of tak... \n", - "1681 {\"Reasoning\": \"Ultraviolet (UV) light is a typ... 
\n", - "1682 {\"Reasoning\": \"A body's strength is primarily ... \n", + "1678 {\"reasoning\": \"New resources required for crea... \n", + "1679 {\"reasoning\": \"Drought is a long-term lack of ... \n", + "1680 {\"reasoning\": \"Ingestion is the process by whi... \n", + "1681 {\"reasoning\": \"Ultraviolet (UV) light is a typ... \n", + "1682 {\"reasoning\": \"Running involves physical activ... \n", "\n", - " responses_FAR_gpt3_5 \\\n", - "0 {\"Final Answer\": \"b\", \"Reasoning\": \"Busses are... \n", - "1 { \"Final Answer\": \"a\" ,\"Reasoning\": \"The nucle... \n", - "2 {\"Final Answer\": \"c\", \"Reasoning\": \"Cells are ... \n", - "3 {\"Final Answer\": \"d\", \"Reasoning\": \"Bacteria c... \n", - "4 { \"Final Answer\": \"a\" ,\"Reasoning\": \"Plants an... \n", - "... ... \n", - "1678 { \"Final Answer\": \"g\" ,\"Reasoning\": \"Recycling... \n", - "1679 { \"Final Answer\": \"d\" ,\"Reasoning\": \"A drought... \n", - "1680 {\"Final Answer\": \"e\", \"Reasoning\": \"Ingestion ... \n", - "1681 { \"Final Answer\": \"b\" ,\"Reasoning\": \"Ultraviol... \n", - "1682 {\"Final Answer\": \"c\", \"Reasoning\": \"Running is... \n", + " responses_base responses_FA \\\n", + "0 {\"final_answer\": \"b\"} {\"final_answer\": \"b\"} \n", + "1 {\"final_answer\": \"g\"} {\"final_answer\": \"g\"} \n", + "2 {\"final_answer\": \"c\"} {\"final_answer\": \"f\"} \n", + "3 {\"final_answer\": \"d\"} {\"final_answer\": \"d\"} \n", + "4 {\"final_answer\": \"g\"} {\"final_answer\": \"a\"} \n", + "... ... ... 
\n", + "1678 {\"final_answer\": \"g\"} {\"final_answer\": \"g\"} \n", + "1679 {\"final_answer\": \"d\"} {\"final_answer\": \"d\"} \n", + "1680 {\"final_answer\": \"e\"} {\"final_answer\": \"c\"} \n", + "1681 {\"final_answer\": \"b\"} {\"final_answer\": \"b\"} \n", + "1682 {\"final_answer\": \"c\"} {\"final_answer\": \"f\"} \n", "\n", - " responses_FA responses_base predictions_base \\\n", - "0 { \"Final Answer\": \"b\" } {\"Final Answer\": \"b\"} b \n", - "1 {\"Final Answer\": \"a\"} {\"Final Answer\": \"a\"} a \n", - "2 {\"Final Answer\": \"b\"} {\"Final Answer\": \"c\"} c \n", - "3 {\"Final Answer\": \"e\"} {\"Final Answer\": \"d\"} d \n", - "4 {\"Final Answer\": \"a\"} {\"Final Answer\": \"g\"} g \n", - "... ... ... ... \n", - "1678 {\"Final Answer\": \"g\"} {\"Final Answer\": \"g\"} g \n", - "1679 {\"Final Answer\": \"a\"} {\"Final Answer\": \"d\"} d \n", - "1680 {\"Final Answer\": \"e\"} {\"Final Answer\": \"d\"} d \n", - "1681 {\"Final Answer\": \"b\"} {\"Final Answer\": \"b\"} b \n", - "1682 {\"Final Answer\": \"f\"} {\"Final Answer\": \"c\"} c \n", + " responses_FAR_gpt3_5 predictions_base \\\n", + "0 {\"final_answer\": \"b\", \"reasoning\": \"Busses are... b \n", + "1 {\"final_answer\": \"d\", \"reasoning\": \"The questi... g \n", + "2 {\"final_answer\": \"f\", \"reasoning\": \"Cars use e... c \n", + "3 {\"final_answer\": \"d\", \"reasoning\": \"The questi... d \n", + "4 {\"final_answer\": \"f\", \"reasoning\": \"Plants and... g \n", + "... ... ... \n", + "1678 {\"final_answer\": \"g\", \"reasoning\": \"The correc... g \n", + "1679 {\"final_answer\": \"d\", \"reasoning\": \"A drought ... d \n", + "1680 {\"final_answer\": \"e\", \"reasoning\": \"Animals re... e \n", + "1681 {\"final_answer\": \"b\", \"reasoning\": \"Ultraviole... b \n", + "1682 {\"final_answer\": \"c\", \"reasoning\": \"a) Four li... 
c \n", "\n", - " predictions_FA predictions_FAR_mistral predictions_RFA_gpt3_5 \\\n", - "0 b b b \n", - "1 a g a \n", - "2 b b c \n", - "3 e d d \n", - "4 a a f \n", - "... ... ... ... \n", - "1678 g g g \n", - "1679 a d d \n", - "1680 e e e \n", - "1681 b b b \n", - "1682 f c d \n", + " predictions_FA predictions_RFA_falcon predictions_FAR_falcon \\\n", + "0 b b b \n", + "1 g a a \n", + "2 f c c \n", + "3 d d d \n", + "4 a a g \n", + "... ... ... ... \n", + "1678 g g g \n", + "1679 d d d \n", + "1680 c e e \n", + "1681 b g b \n", + "1682 f c c \n", "\n", - " predictions_FAR_gpt3_5 predictions_RFA_mistral \n", - "0 b b \n", - "1 a g \n", - "2 c e \n", - "3 d e \n", - "4 a b \n", - "... ... ... \n", - "1678 g g \n", - "1679 d d \n", - "1680 e c \n", - "1681 b b \n", - "1682 c c \n", + " predictions_RFA_gpt3_5 predictions_FAR_gpt3_5 \n", + "0 b b \n", + "1 g d \n", + "2 c f \n", + "3 d d \n", + "4 a f \n", + "... ... ... \n", + "1678 g g \n", + "1679 d d \n", + "1680 e e \n", + "1681 f b \n", + "1682 c c \n", "\n", - "[1683 rows x 27 columns]" + "[1683 rows x 26 columns]" ] }, - "execution_count": 29, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1896,14 +1970,14 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 27, "id": "45c08dd4-0b98-4e0f-b487-549f60518a4e", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "23d5dbd0a91d436fb9920dfe81e4803a", + "model_id": "4a4a20a26dc649a3a6cebdb7856fbd4b", "version_major": 2, "version_minor": 0 }, @@ -1917,7 +1991,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "0b25bcb277574e8792b14e838a32fe25", + "model_id": "b2b7dd4e0f3f46b08f6c8ca21e7d6363", "version_major": 2, "version_minor": 0 }, @@ -1931,10 +2005,10 @@ { "data": { "text/plain": [ - 
"CommitInfo(commit_url='https://huggingface.co/datasets/derek-thomas/labeled-multiple-choice-explained-mistral-results/commit/796d0867b715f2fad05d6e54ad1e0e0504ca670c', commit_message='Upload dataset', commit_description='', oid='796d0867b715f2fad05d6e54ad1e0e0504ca670c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/derek-thomas/labeled-multiple-choice-explained-mistral-results', endpoint='https://huggingface.co', repo_type='dataset', repo_id='derek-thomas/labeled-multiple-choice-explained-mistral-results'), pr_revision=None, pr_num=None)" + "CommitInfo(commit_url='https://huggingface.co/datasets/derek-thomas/labeled-multiple-choice-explained-falcon-results/commit/c457e5db8fd2474a04c1338e5f03a0410e1dd10b', commit_message='Upload dataset', commit_description='', oid='c457e5db8fd2474a04c1338e5f03a0410e1dd10b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/derek-thomas/labeled-multiple-choice-explained-falcon-results', endpoint='https://huggingface.co', repo_type='dataset', repo_id='derek-thomas/labeled-multiple-choice-explained-falcon-results'), pr_revision=None, pr_num=None)" ] }, - "execution_count": 30, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1975,7 +2049,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.10" + "version": "3.11.11" } }, "nbformat": 4,