{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import pandas as pd \n", "import requests \n", "import datetime as dt\n", "import re\n", "import json\n", "from tqdm import tqdm\n", "import os\n", "\n", "from openai import OpenAI" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Calculate" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "if \"OPENAI_API_KEY\" not in os.environ:\n", " with open('secrets/keys.txt', 'r') as f:\n", " keys = json.loads(f.read())\n", "else : \n", " keys=os.environ" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "save_path = 'save'\n", "content_path = 'extract_sciences_po'\n", "\n", "\n", "def retrieve_classifications(name, mapping_prompt):\n", "\n", " df = pd.read_csv('extract_sciences_po.csv')\n", "\n", "\n", " if os.path.exists(f\"{save_path}/output_{name}.txt\"):\n", " with open(f\"{save_path}/output_{name}.txt\", 'r') as f : \n", " out_dict = json.loads(f.read())\n", " out_df = pd.DataFrame.from_dict(out_dict)\n", " out = out_dict\n", " else : \n", " out_df = pd.DataFrame(columns = ['item_id', 'categorie_principale', 'categorie_secondaire'])\n", " out = []\n", "\n", " df_to_process = df.loc[~df.item_id.isin(out_df.item_id)]\n", "\n", " if mapping_prompt[name]['client']=='deepseek':\n", " client = OpenAI(api_key=keys[\"DEEPSEEK_API_KEY\"], base_url=\"https://api.deepseek.com\")\n", " model=\"deepseek-chat\"\n", " else:\n", " client=OpenAI(api_key=keys['OPENAI_API_KEY'])\n", " model=\"gpt-4o\"\n", "\n", " df_to_process = df.loc[~df.item_id.isin(out_df.item_id)]\n", "\n", "\n", " with open(mapping_prompt[name]['path_prompt'], 'r') as f:\n", " prompt = f.read()\n", "\n", " with tqdm(total=df_to_process.shape[0]) as pbar:\n", " for i, row in df_to_process.iterrows():\n", " titre_brut = f\"{row.item_id}_\"+row.titre.lower().strip().replace(f\"\\xa0\", ' ').replace(' : ', ':').replace(' ', '_').replace('/', '')\n", " \n", " with open(f'{content_path}/{titre_brut}.txt', 'r') as f:\n", " text = f.read()\n", "\n", " messages = [{\"role\": \"system\", \"content\": prompt},\n", " {\"role\": \"user\", \"content\": text}]\n", "\n", " response = client.chat.completions.create(\n", " model=model,\n", " messages=messages,\n", " response_format={\n", " 'type': 'json_object'\n", " }\n", " )\n", " try : \n", " cat_json = json.loads(response.choices[0].message.content)\n", "\n", " out.append({\n", " 'item_id':row.item_id, \n", " 'categorie_principale': cat_json['categorie_principale'],\n", " 'categorie_secondaire': cat_json['categorie_secondaire'],\n", " })\n", " \n", " with open(f'{save_path}/output_{name}.txt', 'w+') as f : \n", " f.write(json.dumps(out))\n", "\n", " except Exception as e : \n", " print(f'Error with article {row.item_id}')\n", " pass\n", "\n", " \n", " pbar.update(1)\n", "\n" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sans_titre_1\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "0it [00:00, ?it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "favarel_et_al\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 1%| | 6/516 [00:39<55:50, 6.57s/it]" ] } ], "source": [ "with open('mapping_prompts.txt', 'r') as f : \n", " mapping = json.loads(f.read())\n", "\n", "for name in mapping.keys():\n", " print(name)\n", " retrieve_classifications(name, mapping)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Ajouter images" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.11" } }, "nbformat": 4, "nbformat_minor": 2 }