Petr Tsvetkov committed
Commit 9513395 • 0 parents
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,282 @@
+ # Created by https://www.toptal.com/developers/gitignore/api/pycharm+all,venv,python
+ # Edit at https://www.toptal.com/developers/gitignore?templates=pycharm+all,venv,python
+
+ ### PyCharm+all ###
+ # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+ # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+ # User-specific stuff
+ .idea/**/workspace.xml
+ .idea/**/tasks.xml
+ .idea/**/usage.statistics.xml
+ .idea/**/dictionaries
+ .idea/**/shelf
+
+ # AWS User-specific
+ .idea/**/aws.xml
+
+ # Generated files
+ .idea/**/contentModel.xml
+
+ # Sensitive or high-churn files
+ .idea/**/dataSources/
+ .idea/**/dataSources.ids
+ .idea/**/dataSources.local.xml
+ .idea/**/sqlDataSources.xml
+ .idea/**/dynamic.xml
+ .idea/**/uiDesigner.xml
+ .idea/**/dbnavigator.xml
+
+ # Gradle
+ .idea/**/gradle.xml
+ .idea/**/libraries
+
+ # Gradle and Maven with auto-import
+ # When using Gradle or Maven with auto-import, you should exclude module files,
+ # since they will be recreated, and may cause churn. Uncomment if using
+ # auto-import.
+ # .idea/artifacts
+ # .idea/compiler.xml
+ # .idea/jarRepositories.xml
+ # .idea/modules.xml
+ # .idea/*.iml
+ # .idea/modules
+ # *.iml
+ # *.ipr
+
+ # CMake
+ cmake-build-*/
+
+ # Mongo Explorer plugin
+ .idea/**/mongoSettings.xml
+
+ # File-based project format
+ *.iws
+
+ # IntelliJ
+ out/
+
+ # mpeltonen/sbt-idea plugin
+ .idea_modules/
+
+ # JIRA plugin
+ atlassian-ide-plugin.xml
+
+ # Cursive Clojure plugin
+ .idea/replstate.xml
+
+ # SonarLint plugin
+ .idea/sonarlint/
+
+ # Crashlytics plugin (for Android Studio and IntelliJ)
+ com_crashlytics_export_strings.xml
+ crashlytics.properties
+ crashlytics-build.properties
+ fabric.properties
+
+ # Editor-based Rest Client
+ .idea/httpRequests
+
+ # Android studio 3.1+ serialized cache file
+ .idea/caches/build_file_checksums.ser
+
+ ### PyCharm+all Patch ###
+ # Ignore everything but code style settings and run configurations
+ # that are supposed to be shared within teams.
+
+ .idea/*
+
+ !.idea/codeStyles
+ !.idea/runConfigurations
+
+ ### Python ###
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ ### Python Patch ###
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+ poetry.toml
+
+ # ruff
+ .ruff_cache/
+
+ # LSP config files
+ pyrightconfig.json
+
+ ### venv ###
+ # Virtualenv
+ # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
+ [Bb]in
+ [Ii]nclude
+ [Ll]ib
+ [Ll]ib64
+ [Ll]ocal
+ [Ss]cripts
+ pyvenv.cfg
+ pip-selfcheck.json
+
+ # End of https://www.toptal.com/developers/gitignore/api/pycharm+all,venv,python
+
+ .idea
+
+ cache
+ output
+ data
README.md ADDED
@@ -0,0 +1,26 @@
+ ---
+ title: Commit Message Editing Visualization
+ emoji: 📈
+ sdk: gradio
+ sdk_version: 4.37.2
+ app_file: change_visualizer.py
+ ---
+
+ # Commit Message Editing Visualization ✍️🔍📊
+
+ This space provides a visualization app for exploring the commit message edits datasets (🤗 [expert-labeled](https://huggingface.co/datasets/JetBrains-Research/commit-msg-edits) and 🤗 [synthetic](https://huggingface.co/datasets/JetBrains-Research/synthetic-commit-msg-edits))
+ from the 📜 [Towards Realistic Evaluation of Commit Message Generation by Matching Online and Offline Settings](https://arxiv.org/abs/2410.12046) paper, and it also hosts important artifacts from our work.
+
+ ## Artifacts
+
+ * 📊 [`metrics_analysis.ipynb`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/metrics_analysis.ipynb) contains the code for metrics calculation and analysis;
+ * 📈 [`chart.ipynb`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/chart.ipynb) contains the code for Figure 4 with the edit distance distribution;
+ * 🗃️ [`data_stats.ipynb`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/data_stats.ipynb) contains the code for obtaining the dataset statistics from Table 1;
+ * ⬅️ [`generation_steps/synthetic_backward.py`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/generation_steps/synthetic_backward.py) contains the code for the *Synthetic Backward* generation proposed in our paper;
+ * ➡️ [`generation_steps/synthetic_forward.py`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/generation_steps/synthetic_forward.py) contains the code for the *Synthetic Forward* generation proposed in our paper.
+
+ ## Visualization
+
+ * 🔍 Click on the `Examples Exploration` tab to browse nicely formatted examples from our dataset.
+ * 📈 Click on the `Dataset Statistics` tab to see the major statistics for our dataset.
+ * 📊 Click on the `Experimental Results` tab to see additional metrics tested as target online metrics alongside our main edit distance results.
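For quick programmatic access, both datasets can also be pulled with 🤗 `datasets`. A minimal sketch; the `all_pairs` configuration name is taken from the notebooks in this repo, and the expert-labeled dataset may expose other configurations:

```python
from datasets import load_dataset

# Expert-labeled commit message edits
expert_df = load_dataset("JetBrains-Research/commit-msg-edits", split="train").to_pandas()

# Synthetic edits (configuration name as used in data_stats.ipynb)
synthetic_df = load_dataset("JetBrains-Research/synthetic-commit-msg-edits", "all_pairs", split="train").to_pandas()
```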
api_wrappers/__init__.py ADDED
File without changes
api_wrappers/grazie_wrapper.py ADDED
@@ -0,0 +1,64 @@
+ import pickle
+ import time
+
+ from grazie.api.client.chat.prompt import ChatPrompt
+ from grazie.api.client.endpoints import GrazieApiGatewayUrls
+ from grazie.api.client.gateway import AuthType, GrazieAgent, GrazieApiGatewayClient
+ from grazie.api.client.profiles import LLMProfile
+
+ import config
+
+ client = GrazieApiGatewayClient(
+     grazie_agent=GrazieAgent("grazie-toolformers", "v1.0"),
+     url=GrazieApiGatewayUrls.STAGING,
+     auth_type=AuthType.APPLICATION,
+     grazie_jwt_token=config.GRAZIE_API_JWT_TOKEN,
+ )
+
+ # On-disk cache: maps a prompt to the list of responses generated for it so far.
+ LLM_CACHE_FILE = config.CACHE_DIR / f"{config.LLM_MODEL}.cache.pkl"
+ LLM_CACHE = {}
+ LLM_CACHE_USED = {}
+
+ if not LLM_CACHE_FILE.exists():
+     with open(LLM_CACHE_FILE, "wb") as file:
+         pickle.dump(obj=LLM_CACHE, file=file)
+
+ with open(LLM_CACHE_FILE, "rb") as file:
+     LLM_CACHE = pickle.load(file=file)
+
+
+ def llm_request(prompt):
+     # Retry until the gateway returns a response, sleeping between failed attempts.
+     output = None
+
+     while output is None:
+         try:
+             output = client.chat(
+                 chat=ChatPrompt().add_system("You are a helpful assistant.").add_user(prompt),
+                 profile=LLMProfile(config.LLM_MODEL),
+             ).content
+         except Exception:
+             time.sleep(config.GRAZIE_TIMEOUT_SEC)
+
+     assert output is not None
+
+     return output
+
+
+ def generate_for_prompt(prompt):
+     if prompt not in LLM_CACHE:
+         LLM_CACHE[prompt] = []
+
+     if prompt not in LLM_CACHE_USED:
+         LLM_CACHE_USED[prompt] = 0
+
+     # Generate (and persist) new responses until an unused cached one is available.
+     while LLM_CACHE_USED[prompt] >= len(LLM_CACHE[prompt]):
+         new_response = llm_request(prompt)
+         LLM_CACHE[prompt].append(new_response)
+
+         with open(LLM_CACHE_FILE, "wb") as file:
+             pickle.dump(obj=LLM_CACHE, file=file)
+
+     result = LLM_CACHE[prompt][LLM_CACHE_USED[prompt]]
+     LLM_CACHE_USED[prompt] += 1
+
+     return result
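A minimal usage sketch, assuming the Grazie gateway client is installed and `GRAZIE_JWT_TOKEN` is exported so that `config.GRAZIE_API_JWT_TOKEN` is populated. Repeated calls with the same prompt first replay responses cached in `cache/<model>.cache.pkl` and only then request fresh ones, so each call yields an independent sample:

```python
from api_wrappers.grazie_wrapper import generate_for_prompt

prompt = "Rewrite this commit message to be more concise: Fix bug"  # hypothetical prompt
first = generate_for_prompt(prompt)   # response #1 (replayed from cache if present)
second = generate_for_prompt(prompt)  # response #2 -- a distinct sample for the same prompt
```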
api_wrappers/hf_data_loader.py ADDED
@@ -0,0 +1,120 @@
+ import json
+ import os
+ from datetime import datetime, timedelta
+
+ import pandas as pd
+ from datasets import load_dataset
+ from huggingface_hub import hf_hub_download, list_repo_tree
+
+ import config
+
+
+ def load_raw_rewriting_as_pandas():
+     return load_dataset(
+         config.HF_RAW_DATASET_NAME, split=config.HF_RAW_DATASET_SPLIT, token=config.HF_TOKEN, cache_dir=config.CACHE_DIR
+     ).to_pandas()
+
+
+ def load_full_commit_as_pandas():
+     return (
+         load_dataset(
+             path=config.HF_FULL_COMMITS_DATASET_NAME,
+             name=config.HF_FULL_COMMITS_DATASET_SUBNAME,
+             split=config.HF_FULL_COMMITS_DATASET_SPLIT,
+             cache_dir=config.CACHE_DIR,
+         )
+         .to_pandas()
+         .rename(columns={"message": "reference"})
+     )
+
+
+ def edit_time_from_history(history_str):
+     history = json.loads(history_str)
+
+     if len(history) == 0:
+         return 0
+
+     timestamps = list(map(lambda e: datetime.fromisoformat(e["ts"]), history))
+     delta = max(timestamps) - min(timestamps)
+
+     # Edit time in milliseconds.
+     return delta // timedelta(milliseconds=1)
+
+
+ def edit_time_from_timestamps(row):
+     loaded_ts = datetime.fromisoformat(row["loaded_ts"])
+     submitted_ts = datetime.fromisoformat(row["submitted_ts"])
+
+     delta = submitted_ts - loaded_ts
+
+     result = delta // timedelta(milliseconds=1)
+
+     return result if result >= 0 else None
+
+
+ def load_processed_rewriting_as_pandas():
+     manual_rewriting = load_raw_rewriting_as_pandas()[
+         [
+             "hash",
+             "repo",
+             "commit_msg_start",
+             "commit_msg_end",
+             "session",
+             "commit_msg_history",
+             "loaded_ts",
+             "submitted_ts",
+         ]
+     ]
+
+     manual_rewriting["edit_time_hist"] = manual_rewriting["commit_msg_history"].apply(edit_time_from_history)
+     manual_rewriting["edit_time"] = manual_rewriting.apply(edit_time_from_timestamps, axis=1)
+
+     # Drop the raw history and timestamp columns once the edit times are computed.
+     manual_rewriting.drop(columns=["commit_msg_history", "loaded_ts", "submitted_ts"], inplace=True)
+
+     manual_rewriting.set_index(["hash", "repo"], inplace=True)
+
+     mods_dataset = load_full_commit_as_pandas()[["hash", "repo", "mods"]]
+     mods_dataset.set_index(["hash", "repo"], inplace=True)
+
+     return manual_rewriting.join(other=mods_dataset, how="left").reset_index()
+
+
+ def load_synthetic_as_pandas():
+     return load_dataset(
+         config.HF_SYNTHETIC_DATASET_NAME,
+         "all_pairs_with_metrics",
+         split=config.HF_SYNTHETIC_DATASET_SPLIT,
+         token=config.HF_TOKEN,
+         cache_dir=config.CACHE_DIR,
+     ).to_pandas()
+
+
+ def load_full_commit_with_predictions_as_pandas():
+     full_dataset = load_full_commit_as_pandas()
+
+     predictions_paths = []
+     for prediction_file in list_repo_tree(
+         repo_id=config.HF_PREDICTIONS_DATASET_NAME,
+         path=os.path.join("commit_message_generation/predictions", config.HF_PREDICTIONS_MODEL),
+         repo_type="dataset",
+     ):
+         predictions_paths.append(
+             hf_hub_download(
+                 repo_id=config.HF_PREDICTIONS_DATASET_NAME,
+                 filename=prediction_file.path,
+                 repo_type="dataset",
+                 cache_dir=config.CACHE_DIR,
+             )
+         )
+
+     dfs = []
+     for path in predictions_paths:
+         dfs.append(pd.read_json(path, orient="records", lines=True))
+     predictions_dataset = pd.concat(dfs, axis=0, ignore_index=True)
+     predictions_dataset = predictions_dataset.sample(frac=1, random_state=config.RANDOM_STATE).set_index(
+         ["hash", "repo"]
+     )[["prediction"]]
+     predictions_dataset = predictions_dataset[~predictions_dataset.index.duplicated(keep="first")]
+
+     dataset = full_dataset.join(other=predictions_dataset, on=("hash", "repo"))
+
+     return dataset.reset_index()
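For reference, a sketch of how the loaders compose (column names follow the code above; network access and, where the datasets are gated, a valid `HF_TOKEN` are assumed):

```python
from api_wrappers import hf_data_loader

df = hf_data_loader.load_processed_rewriting_as_pandas()
# One row per rewriting session: the message before/after editing,
# edit times in milliseconds, and the commit mods joined in from CommitChronicle.
print(df[["hash", "repo", "commit_msg_start", "commit_msg_end", "edit_time", "edit_time_hist"]].head())
```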
change_visualizer.py ADDED
@@ -0,0 +1,353 @@
+ import gradio as gr
+ import numpy as np
+ import pandas as pd
+ import plotly.graph_objects as go
+ from datasets import load_dataset
+ from evaluate.utils import parse_readme
+ from scipy.stats import gaussian_kde, spearmanr
+
+ import generate_annotated_diffs
+ from api_wrappers import hf_data_loader
+ from generation_steps.metrics_analysis import AGGR_METRICS, edit_distance_fn
+
+ colors = {
+     "Expert-labeled": "#C19C0B",
+     "Synthetic Backward": "#913632",
+     "Synthetic Forward": "#58136a",
+     "Full": "#000000",
+ }
+
+ METRICS = {
+     "Edit Distance": "editdist",
+     "Edit Similarity": "editsim",
+     "BLEU": "bleu",
+     "METEOR": "meteor",
+     "ROUGE-1": "rouge1",
+     "ROUGE-2": "rouge2",
+     "ROUGE-L": "rougeL",
+     "BERTScore": "bertscore",
+     "ChrF": "chrF",
+ }
+
+
+ df_related = generate_annotated_diffs.data_with_annotated_diffs()
+
+
+ def golden():
+     return df_related.loc[(df_related["G_type"] == "initial") & (df_related["E_type"] == "expert_labeled")].reset_index(
+         drop=True
+     )
+
+
+ def backward():
+     return df_related.loc[
+         (df_related["G_type"] == "synthetic_backward") & (df_related["E_type"] == "expert_labeled")
+     ].reset_index(drop=True)
+
+
+ def forward():
+     return df_related.loc[
+         (df_related["G_type"] == "initial") & (df_related["E_type"] == "synthetic_forward")
+     ].reset_index(drop=True)
+
+
+ def forward_from_backward():
+     return df_related.loc[
+         (df_related.G_type == "synthetic_backward")
+         & (df_related.E_type.isin(["synthetic_forward", "synthetic_forward_from_backward"]))
+     ].reset_index(drop=True)
+
+
+ n_diffs_manual = len(golden())
+ n_diffs_synthetic_backward = len(backward())
+ n_diffs_synthetic_forward = len(forward())
+ n_diffs_synthetic_forward_backward = len(forward_from_backward())
+
+
+ def update_dataset_view(diff_idx, df):
+     diff_idx -= 1
+     return (
+         df.iloc[diff_idx]["annotated_diff"],
+         df.iloc[diff_idx]["commit_msg_start"] if "commit_msg_start" in df.columns else df.iloc[diff_idx]["G_text"],
+         df.iloc[diff_idx]["commit_msg_end"] if "commit_msg_end" in df.columns else df.iloc[diff_idx]["E_text"],
+         f"https://github.com/{df.iloc[diff_idx]['repo']}/commit/{df.iloc[diff_idx]['hash']}",
+     )
+
+
+ def update_dataset_view_manual(diff_idx):
+     return update_dataset_view(diff_idx, golden())
+
+
+ def update_dataset_view_synthetic_backward(diff_idx):
+     return update_dataset_view(diff_idx, backward())
+
+
+ def update_dataset_view_synthetic_forward(diff_idx):
+     return update_dataset_view(diff_idx, forward())
+
+
+ def update_dataset_view_synthetic_forward_backward(diff_idx):
+     return update_dataset_view(diff_idx, forward_from_backward())
+
+
+ def number_of_pairs_plot():
+     related_plot_dict = {
+         "Full": df_related,
+         "Synthetic Backward": backward(),
+         "Synthetic Forward": pd.concat([forward(), forward_from_backward()], axis=0, ignore_index=True),
+         "Expert-labeled": golden(),
+     }
+
+     df_unrelated = hf_data_loader.load_synthetic_as_pandas()
+     df_unrelated = df_unrelated.loc[~df_unrelated.is_related].copy()
+     unrelated_plot_dict = {
+         "Full": df_unrelated,
+         "Synthetic Backward": df_unrelated.loc[
+             (df_unrelated["G_type"] == "synthetic_backward")
+             & (~df_unrelated.E_type.isin(["synthetic_forward", "synthetic_forward_from_backward"]))
+         ],
+         "Synthetic Forward": df_unrelated.loc[
+             ((df_unrelated["G_type"] == "initial") & (df_unrelated["E_type"] == "synthetic_forward"))
+             | (
+                 (df_unrelated["G_type"] == "synthetic_backward")
+                 & (df_unrelated["E_type"].isin(["synthetic_forward", "synthetic_forward_from_backward"]))
+             )
+         ],
+         "Expert-labeled": df_unrelated.loc[
+             (df_unrelated.G_type == "initial") & (df_unrelated.E_type == "expert_labeled")
+         ],
+     }
+
+     traces = []
+
+     for split in related_plot_dict.keys():
+         related_count = len(related_plot_dict[split])
+         unrelated_count = len(unrelated_plot_dict[split])
+
+         traces.append(
+             go.Bar(
+                 name=f"{split} - Related pairs",
+                 x=[split],
+                 y=[related_count],
+                 marker=dict(
+                     color=colors[split],
+                 ),
+             )
+         )
+
+         traces.append(
+             go.Bar(
+                 name=f"{split} - Conditionally independent pairs",
+                 x=[split],
+                 y=[unrelated_count],
+                 marker=dict(
+                     color=colors[split],
+                     pattern=dict(
+                         shape="/",  # Diagonal hatching
+                         fillmode="overlay",
+                         solidity=0.5,
+                     ),
+                 ),
+             )
+         )
+
+     fig = go.Figure(data=traces)
+
+     fig.update_layout(
+         barmode="stack",
+         bargap=0.2,
+         xaxis=dict(title="Split", showgrid=True, gridcolor="lightgrey"),
+         yaxis=dict(title="Number of Examples", showgrid=True, gridcolor="lightgrey"),
+         legend=dict(title="Pair Type", orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
+         plot_bgcolor="rgba(0,0,0,0)",
+         paper_bgcolor="rgba(0,0,0,0)",
+         width=1100,
+     )
+     return fig
+
+
+ def edit_distance_plot():
+     df_edit_distance = {
+         "Full": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for _, row in df_related.iterrows()],
+         "Synthetic Backward": [
+             edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for _, row in backward().iterrows()
+         ],
+         "Synthetic Forward": [
+             edit_distance_fn(pred=row["G_text"], ref=row["E_text"])
+             for _, row in pd.concat([forward(), forward_from_backward()], axis=0, ignore_index=True).iterrows()
+         ],
+         "Expert-labeled": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for _, row in golden().iterrows()],
+     }
+     traces = []
+
+     for key in df_edit_distance:
+         kde_x = np.linspace(0, 1200, 1000)
+         kde = gaussian_kde(df_edit_distance[key])
+         kde_line = go.Scatter(x=kde_x, y=kde(kde_x), mode="lines", name=key, line=dict(color=colors[key], width=5))
+         traces.append(kde_line)
+
+     fig = go.Figure(data=traces)
+
+     fig.update_layout(
+         bargap=0.1,
+         xaxis=dict(title=dict(text="Edit Distance"), range=[0, 1200], showgrid=True, gridcolor="lightgrey"),
+         yaxis=dict(
+             title=dict(text="Probability Density"),
+             range=[0, 0.004],
+             showgrid=True,
+             gridcolor="lightgrey",
+             tickvals=[0.0005, 0.001, 0.0015, 0.002, 0.0025, 0.003, 0.0035, 0.004],
+             tickformat=".4f",
+         ),
+         plot_bgcolor="rgba(0,0,0,0)",
+         paper_bgcolor="rgba(0,0,0,0)",
+         width=1100,
+     )
+     return fig
+
+
+ def get_correlations_table(online_metric_name: str) -> pd.DataFrame:
+     df = load_dataset(
+         "JetBrains-Research/synthetic-commit-msg-edits", "all_pairs_with_metrics_other_online_metrics", split="train"
+     ).to_pandas()
+     corr_df = (
+         df.loc[~df.is_related]
+         .groupby(["G_text", "G_type", "hash", "repo"] + [f"online_{online_metric_name}"])
+         .apply(lambda g: g.to_dict(orient="records"), include_groups=False)
+         .reset_index(name="unrelated_pairs")
+         .copy()
+     )
+     _ = corr_df.copy()
+     for metric in AGGR_METRICS:
+         # Edit distance is a "lower is better" metric, so aggregate with min; the rest with max.
+         if metric in ["editdist"]:
+             _[metric] = _.unrelated_pairs.apply(lambda pairs: min(pair[metric] for pair in pairs))
+         else:
+             _[metric] = _.unrelated_pairs.apply(lambda pairs: max(pair[metric] for pair in pairs))
+
+     results = []
+
+     for metric in AGGR_METRICS:
+         x = _[metric].to_numpy()
+         y = _[f"online_{online_metric_name}"].to_numpy()
+         corr, p_value = spearmanr(x, y)
+         results.append({"metric": metric, "corr": corr, "p_value": p_value})
+
+     __ = pd.DataFrame(results)
+     __["p_value"] = ["< 0.05" if p < 0.05 else p for p in __.p_value]
+     __["corr_abs"] = abs(__["corr"])
+     __["corr"] = __["corr"].round(2)
+     __["metric"] = __["metric"].map({v: k for k, v in METRICS.items()})
+     return (
+         __.sort_values(by=["corr_abs"], ascending=False)
+         .drop(columns=["corr_abs"])
+         .rename(columns={"metric": "Metric m", "corr": "Correlation Q(m, m*)", "p_value": "p-value"})
+     )
+
+
+ force_light_theme_js_func = """
+ function refresh() {
+     const url = new URL(window.location);
+
+     if (url.searchParams.get('__theme') !== 'light') {
+         url.searchParams.set('__theme', 'light');
+         window.location.href = url.href;
+     }
+ }
+ """
+
+ if __name__ == "__main__":
+     with gr.Blocks(theme=gr.themes.Soft(), js=force_light_theme_js_func) as application:
+         gr.Markdown(parse_readme("README.md"))
+
+         def dataset_view_tab(n_items):
+             slider = gr.Slider(minimum=1, maximum=n_items, step=1, value=1, label=f"Sample number (total: {n_items})")
+
+             diff_view = gr.Highlightedtext(combine_adjacent=True, color_map={"+": "green", "-": "red"})
+             start_view = gr.Textbox(interactive=False, label="Initial message G", container=True)
+             end_view = gr.Textbox(interactive=False, label="Edited message E", container=True)
+             link_view = gr.Markdown()
+
+             view = [diff_view, start_view, end_view, link_view]
+
+             return slider, view
+
+         with gr.Tab("Examples Exploration"):
+             with gr.Tab("Manual"):
+                 slider_manual, view_manual = dataset_view_tab(n_diffs_manual)
+
+                 slider_manual.change(update_dataset_view_manual, inputs=slider_manual, outputs=view_manual)
+
+             with gr.Tab("Synthetic Backward"):
+                 slider_synthetic_backward, view_synthetic_backward = dataset_view_tab(n_diffs_synthetic_backward)
+
+                 slider_synthetic_backward.change(
+                     update_dataset_view_synthetic_backward,
+                     inputs=slider_synthetic_backward,
+                     outputs=view_synthetic_backward,
+                 )
+
+             with gr.Tab("Synthetic Forward (from initial)"):
+                 slider_synthetic_forward, view_synthetic_forward = dataset_view_tab(n_diffs_synthetic_forward)
+
+                 slider_synthetic_forward.change(
+                     update_dataset_view_synthetic_forward,
+                     inputs=slider_synthetic_forward,
+                     outputs=view_synthetic_forward,
+                 )
+
+             with gr.Tab("Synthetic Forward (from backward)"):
+                 slider_synthetic_forward_backward, view_synthetic_forward_backward = dataset_view_tab(
+                     n_diffs_synthetic_forward_backward
+                 )
+
+                 slider_synthetic_forward_backward.change(
+                     update_dataset_view_synthetic_forward_backward,
+                     inputs=slider_synthetic_forward_backward,
+                     outputs=view_synthetic_forward_backward,
+                 )
+
+         with gr.Tab("Dataset Statistics"):
+             gr.Markdown("## Number of examples per split")
+
+             number_of_pairs_gr_plot = gr.Plot(number_of_pairs_plot, label=None)
+
+             gr.Markdown("## Edit Distance Distribution (w/o PyCharm Logs)")
+
+             edit_distance_gr_plot = gr.Plot(edit_distance_plot(), label=None)
+
+         with gr.Tab("Experimental Results"):
+             gr.Markdown(
+                 "Here, we provide the additional experimental results with different text similarity metrics used as the target online metric, "
+                 "in addition to edit distance between generated messages G and their edited counterparts E."
+             )
+
+             gr.Markdown(
+                 "Please select one of the available metrics **m*** below to see the correlations **Q(m, m\*)** of offline text similarity metrics with **m*** as an online metric."
+             )
+
+             for metric in METRICS:
+                 with gr.Tab(metric):
+                     gr.Markdown(
+                         f"The table below presents the correlation coefficients **Q(m, m\*)** where {metric} is used as an online metric **m***."
+                     )
+
+                     result_df = get_correlations_table(METRICS[metric])
+                     gr.DataFrame(result_df)
+
+         application.load(update_dataset_view_manual, inputs=slider_manual, outputs=view_manual)
+
+         application.load(
+             update_dataset_view_synthetic_backward, inputs=slider_synthetic_backward, outputs=view_synthetic_backward
+         )
+
+         application.load(
+             update_dataset_view_synthetic_forward, inputs=slider_synthetic_forward, outputs=view_synthetic_forward
+         )
+
+         application.load(
+             update_dataset_view_synthetic_forward_backward,
+             inputs=slider_synthetic_forward_backward,
+             outputs=view_synthetic_forward_backward,
+         )
+
+     application.launch()
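Locally, the app starts as a plain Gradio script with `python change_visualizer.py` (a sketch of the assumptions: the datasets above are reachable and the `generation_steps` module from this repo is importable). The correlation tables can also be computed headlessly; note that importing the module loads the annotated-diff dataset at import time:

```python
from change_visualizer import get_correlations_table

# Spearman correlations of offline metrics with BLEU used as the online metric m*
print(get_correlations_table("bleu"))
```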
chart.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
config.py ADDED
@@ -0,0 +1,39 @@
+ import os
+ from pathlib import Path
+
+ RANDOM_STATE = 42
+
+ GRAZIE_API_JWT_TOKEN = os.environ.get("GRAZIE_JWT_TOKEN")
+ GRAZIE_TIMEOUT_SEC = 1.0
+
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+
+ HF_RAW_DATASET_NAME = "JetBrains-Research/commit-msg-rewriting"
+ HF_RAW_DATASET_SPLIT = "train"
+
+ HF_FULL_COMMITS_DATASET_NAME = "JetBrains-Research/lca-commit-message-generation"
+ HF_FULL_COMMITS_DATASET_SUBNAME = "commitchronicle-py-long"
+ HF_FULL_COMMITS_DATASET_SPLIT = "test"
+
+ HF_PREDICTIONS_DATASET_NAME = "JetBrains-Research/lca-results"
+ HF_PREDICTIONS_MODEL = "gpt_4_0613"
+
+ HF_SYNTHETIC_DATASET_NAME = "JetBrains-Research/synthetic-commit-msg-rewriting"
+ HF_SYNTHETIC_DATASET_SPLIT = "train"
+
+ LLM_MODEL = "gpt-4-1106-preview"
+
+ CACHE_DIR = Path("cache")
+ CACHE_DIR.mkdir(exist_ok=True)
+
+ OUTPUT_DIR = Path("output")
+ OUTPUT_DIR.mkdir(exist_ok=True)
+
+ END_TO_START_ARTIFACT = OUTPUT_DIR / "end_to_start.csv"
+ START_TO_END_ARTIFACT = OUTPUT_DIR / "start_to_end.csv"
+ SYNTHETIC_DATASET_ARTIFACT = OUTPUT_DIR / "synthetic.csv"
+ METRICS_CORRELATIONS_ARTIFACT = OUTPUT_DIR / "metrics_correlations.csv"
+ DATA_FOR_LABELING_ARTIFACT = OUTPUT_DIR / "data_for_labeling.csv"
+
+ OUTPUT_CHARTS_DIR = OUTPUT_DIR / "charts"
+ OUTPUT_CHARTS_DIR.mkdir(exist_ok=True)
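Note that importing `config` has side effects: it reads the tokens from the environment and creates the `cache/` and `output/` directories. A minimal sketch of the expected environment (token values are placeholders):

```python
import os

# Placeholders -- substitute real tokens before importing config.
os.environ.setdefault("GRAZIE_JWT_TOKEN", "<your Grazie JWT token>")
os.environ.setdefault("HF_TOKEN", "<your Hugging Face token>")

import config

print(config.CACHE_DIR.resolve(), config.OUTPUT_DIR.resolve())
```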
data_stats.ipynb ADDED
@@ -0,0 +1,759 @@
# Data Stats

In [3]:
from datasets import load_dataset


df = load_dataset("JetBrains-Research/synthetic-commit-msg-edits", "all_pairs", split="train").to_pandas()
df.head()

Out[3]:
                                       hash              repo                                             G_text                                             E_text              G_type                           E_type  is_related
0  2febb99eee8ed71c9122db88ca58dd33be0b9550  mesonbuild/meson  Enhance OptionOverrideProxy and simplify optio...  Enhance OptionOverrideProxy for multiple optio...  synthetic_backward                   expert_labeled        True
1  2febb99eee8ed71c9122db88ca58dd33be0b9550  mesonbuild/meson  Enhance OptionOverrideProxy and simplify optio...  Refactor OptionOverrideProxy and Backend class...  synthetic_backward                synthetic_forward        True
2  2febb99eee8ed71c9122db88ca58dd33be0b9550  mesonbuild/meson  Enhance OptionOverrideProxy and simplify optio...  Refactor OptionOverrideProxy and backend optio...  synthetic_backward                synthetic_forward        True
3  2febb99eee8ed71c9122db88ca58dd33be0b9550  mesonbuild/meson  Enhance OptionOverrideProxy and simplify optio...  Refactor: Enhance OptionOverrideProxy for mult...  synthetic_backward                synthetic_forward        True
4  2febb99eee8ed71c9122db88ca58dd33be0b9550  mesonbuild/meson  Enhance OptionOverrideProxy and simplify optio...  Refactor OptionOverrideProxy and add target-sp...  synthetic_backward  synthetic_forward_from_backward       False

## Full

In [4]: len(df.loc[df.is_related])
Out[4]: 656

In [5]: df.loc[df.is_related].groupby(["hash", "repo"]).G_text.count().mean()
Out[5]: 43.733333333333334

In [6]: len(df.loc[~df.is_related])
Out[6]: 5140

In [7]: df.loc[~df.is_related].groupby(["hash", "repo"]).G_text.count().mean()
Out[7]: 342.6666666666667

## Expert-labeled

In [36]: _ = df.loc[(df.G_type == "initial") & (df.E_type == "expert_labeled")]

In [37]: len(_.loc[_.is_related])
Out[37]: 57

In [38]: _.loc[_.is_related].groupby(["hash", "repo"]).G_text.count().mean()
Out[38]: 3.8

In [39]: len(_.loc[~_.is_related])
Out[39]: 0

In [40]: _.loc[~_.is_related].groupby(["hash", "repo"]).G_text.count().mean()
Out[40]: nan

## Backward

In [20]: _ = df.loc[(df.G_type == "synthetic_backward") & (~df.E_type.isin(["synthetic_forward", "synthetic_forward_from_backward"]))]

In [21]: len(_.loc[_.is_related])
Out[21]: 104

In [22]: _.loc[_.is_related].groupby(["hash", "repo"]).G_text.count().mean()
Out[22]: 7.428571428571429

In [23]: len(_.loc[~_.is_related])
Out[23]: 1048

In [24]: _.loc[~_.is_related].groupby(["hash", "repo"]).G_text.count().mean()
Out[24]: 74.85714285714286

## Forward

### From human

In [41]: _ = df.loc[(df.G_type == "initial") & (df.E_type == "synthetic_forward")]

In [42]: len(_.loc[_.is_related])
Out[42]: 177

In [43]: _.loc[_.is_related].groupby(["hash", "repo"]).G_text.count().mean()
Out[43]: 11.8

In [44]: len(_.loc[~_.is_related])
Out[44]: 0

In [45]: _.loc[~_.is_related].groupby(["hash", "repo"]).G_text.count().mean()
Out[45]: nan

### From backward

In [53]: _ = df.loc[(df.G_type == "synthetic_backward") & (df.E_type.isin(["synthetic_forward", "synthetic_forward_from_backward"]))]

In [56]: len(_.loc[_.is_related])
Out[56]: 318

In [57]: _.loc[_.is_related].groupby(["hash", "repo"]).G_text.count().mean()
Out[57]: 22.714285714285715

In [58]: len(_.loc[~_.is_related])
Out[58]: 3753

In [59]: _.loc[~_.is_related].groupby(["hash", "repo"]).G_text.count().mean()
Out[59]: 268.07142857142856
dataset_statistics.py ADDED
@@ -0,0 +1,71 @@
+ import pickle
+
+ import Levenshtein
+ import numpy as np
+ import pandas as pd
+ import plotly.figure_factory as ff
+
+ import config
+
+
+ def get_statistics_for_sample(start_msg, end_msg, row=None):
+     edit_ops = Levenshtein.editops(start_msg, end_msg)
+     n_deletes = sum([1 if op == "delete" else 0 for op, _, _ in edit_ops])
+     n_inserts = sum([1 if op == "insert" else 0 for op, _, _ in edit_ops])
+     n_replaces = sum([1 if op == "replace" else 0 for op, _, _ in edit_ops])
+
+     # A replace counts as both a deletion and an insertion.
+     n_changes = n_deletes + n_inserts + n_replaces
+     n_deletes += n_replaces
+     n_inserts += n_replaces
+
+     return {
+         "deletions": n_deletes,
+         "insertions": n_inserts,
+         "changes": n_changes,
+         "deletions_norm": n_deletes / len(start_msg),
+         "insertions_norm": n_inserts / len(end_msg),
+         "changes_norm": n_changes / len(end_msg),
+         "lendiff": abs(len(start_msg) - len(end_msg)),
+         "editdist": row["editdist"] if row is not None else Levenshtein.distance(start_msg, end_msg),
+     }
+
+
+ def get_statistics_for_row(row):
+     if "commit_msg_start" in row:
+         start = row["commit_msg_start"]
+     else:
+         start = row["G_text"]
+     if "commit_msg_end" in row:
+         end = row["commit_msg_end"]
+     else:
+         end = row["E_text"]
+     return get_statistics_for_sample(start, end, row=row)
+
+
+ def get_statistics_for_df(df: pd.DataFrame):
+     stats = [get_statistics_for_row(row) for _, row in df.iterrows()]
+
+     assert len(stats) > 0
+
+     return {stat_name: np.asarray([e[stat_name] for e in stats]) for stat_name in stats[0]}
+
+
+ def build_plotly_chart(stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, stat_name):
+     hist_data = [
+         stat_golden,
+         stat_e2s,
+         stat_s2e,
+         stat_e2s_s2e,
+         np.concatenate((stat_e2s, stat_s2e, stat_e2s_s2e), axis=0),
+     ]
+
+     group_labels = ["Golden", "e2s", "s2e", "e2s+s2e", "Synthetic"]
+
+     fig = ff.create_distplot(hist_data, group_labels, bin_size=0.05, show_rug=False, show_hist=False)
+
+     fig.update_layout(title_text=stat_name)
+
+     with open(config.OUTPUT_CHARTS_DIR / f"{stat_name}_data.pkl", "wb") as f:
+         pickle.dump(hist_data, f)
+
+     return fig
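A worked example of the per-sample statistics (hypothetical messages; recall that replace operations are counted into both deletions and insertions):

```python
from dataset_statistics import get_statistics_for_sample

stats = get_statistics_for_sample("Fix bug", "Fix parser bug")
# 7 inserted characters ("parser "), nothing deleted or replaced, so roughly:
# {'deletions': 0, 'insertions': 7, 'changes': 7, 'deletions_norm': 0.0,
#  'insertions_norm': 0.5, 'changes_norm': 0.5, 'lendiff': 7, 'editdist': 7}
print(stats)
```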
generate_annotated_diffs.py ADDED
@@ -0,0 +1,38 @@
1
+ import diff_match_patch as dmp_module
2
+ from tqdm import tqdm
3
+
4
+ from api_wrappers import hf_data_loader
5
+
6
+
7
+ def get_annotated_diff(start_text, end_text):
8
+ dmp = dmp_module.diff_match_patch()
9
+ dmp_mapping = {-1: "-", 0: None, 1: "+"}
10
+
11
+ diff = dmp.diff_main(start_text, end_text)
12
+ dmp.diff_cleanupSemantic(diff)
13
+
14
+ result = [[w, dmp_mapping[t]] for t, w in diff]
15
+
16
+ return result
17
+
18
+
19
+ def annotated_diff_for_row(row):
20
+ if "commit_msg_start" in row:
21
+ start = row["commit_msg_start"]
22
+ else:
23
+ start = row["G_text"]
24
+ if "commit_msg_end" in row:
25
+ end = row["commit_msg_end"]
26
+ else:
27
+ end = row["E_text"]
28
+ return get_annotated_diff(start, end)
29
+
30
+
31
+ def data_with_annotated_diffs():
32
+ tqdm.pandas()
33
+
34
+ df = hf_data_loader.load_synthetic_as_pandas()
35
+ df = df.loc[df.is_related].copy()
36
+ annotated = df.progress_apply(annotated_diff_for_row, axis=1)
37
+ df["annotated_diff"] = annotated
38
+ return df
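A quick sketch of the `[text, tag]` pairs `get_annotated_diff` produces, using `diff-match-patch` directly on toy messages:

import diff_match_patch as dmp_module

dmp = dmp_module.diff_match_patch()
diff = dmp.diff_main("Fix typo in README", "Fix typos in README and docs")
dmp.diff_cleanupSemantic(diff)

# diff_main yields (op, text) tuples with op in {-1, 0, 1};
# get_annotated_diff remaps these tags to "-", None and "+".
print(diff)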
generated_message_length_comparison.ipynb ADDED
@@ -0,0 +1,314 @@
+ {
+  "cells": [
+   {
+    "metadata": {},
+    "cell_type": "markdown",
+    "source": [
+     "### How to run\n",
+     "\n",
+     "* Install libraries using the cell below (for grazie-api-gateway-client you will have to add a custom JB repository)\n",
+     "* Put the production prompt into the file `data/prod_prompt.txt`\n",
+     "* Environment variables:\n",
+     "  - `GRAZIE_API_JWT_TOKEN` -- JWT token for grazie (check `api_wrappers/grazie_wrapper.py` to adjust the client initialization if necessary)\n",
+     "  - `HF_TOKEN` -- should _not_ be required; however, if it is, set it to a valid Hugging Face token"
+    ],
+    "id": "77d51d55b41735cf"
+   },
+   {
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2024-06-20T16:09:07.968406Z",
+      "start_time": "2024-06-20T16:09:07.955405Z"
+     }
+    },
+    "cell_type": "code",
+    "source": [
+     "# !pip install grazie-api-gateway-client\n",
+     "# !pip install tqdm\n",
+     "# !pip install pandas\n",
+     "# !pip install datasets"
+    ],
+    "id": "91fa273e8987f6f6",
+    "outputs": [],
+    "execution_count": 1
+   },
+   {
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2024-06-20T16:09:10.353479Z",
+      "start_time": "2024-06-20T16:09:07.970405Z"
+     }
+    },
+    "cell_type": "code",
+    "source": [
+     "from api_wrappers.grazie_wrapper import generate_for_prompt\n",
+     "from api_wrappers.hf_data_loader import load_full_commit_with_predictions_as_pandas\n",
+     "from tqdm import tqdm\n",
+     "\n",
+     "tqdm.pandas()"
+    ],
+    "id": "ce11a4c781c152e",
+    "outputs": [],
+    "execution_count": 2
+   },
+   {
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2024-06-20T16:09:10.368996Z",
+      "start_time": "2024-06-20T16:09:10.354434Z"
+     }
+    },
+    "cell_type": "code",
+    "source": [
+     "with open(\"data/prod_prompt.txt\") as f:\n",
+     "\tPROD_PROMPT = f.read().strip()\n",
+     "\n",
+     "def prod_prompt(diff):\n",
+     "\treturn PROD_PROMPT.replace(\"$diff\", diff).replace(\"$text\", \"\")\n",
+     "\n",
+     "def generate_commit_message_prod(diff):\n",
+     "\treturn generate_for_prompt(prod_prompt(diff))"
+    ],
+    "id": "84a769c8765a7b64",
+    "outputs": [],
+    "execution_count": 3
+   },
+   {
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2024-06-20T16:09:10.384590Z",
+      "start_time": "2024-06-20T16:09:10.371410Z"
+     }
+    },
+    "cell_type": "code",
+    "source": "generate_commit_message_prod(\"TEST\")",
+    "id": "af2f20def94b0490",
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "\"Certainly! I'll need to see the specific code differences (diffs) you would like to have summarized into a commit message. Please provide the diffs so I can assist you properly.\""
+       ]
+      },
+      "execution_count": 4,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "execution_count": 4
+   },
+   {
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2024-06-20T16:09:22.224167Z",
+      "start_time": "2024-06-20T16:09:10.388409Z"
+     }
+    },
+    "cell_type": "code",
+    "source": [
+     "DATA = load_full_commit_with_predictions_as_pandas()[[\"mods\", \"prediction\"]].rename(columns={\"mods\": \"diff\", \"prediction\": \"prediction_current\"})\n",
+     "DATA.head()"
+    ],
+    "id": "a49cabf576c9d692",
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "Using the latest cached version of the dataset since JetBrains-Research/lca-commit-message-generation couldn't be found on the Hugging Face Hub\n",
+       "Found the latest cached dataset configuration 'commitchronicle-py-long' at cache\\JetBrains-Research___lca-commit-message-generation\\commitchronicle-py-long\\0.0.0\\58dcef83a63cccebacd3e786afd73181cc9175e5 (last modified on Sun Apr 7 11:16:22 2024).\n",
+       "Using the latest cached version of the dataset since JetBrains-Research/lca-results couldn't be found on the Hugging Face Hub\n",
+       "Found the latest cached dataset configuration 'cmg_gpt_4_0613' at cache\\JetBrains-Research___lca-results\\cmg_gpt_4_0613\\0.0.0\\4b56bbf7243da371b3e0a42a0c9db1f37af98c39 (last modified on Fri May 31 16:00:33 2024).\n"
+      ]
+     },
+     {
+      "data": {
+       "text/plain": [
+        " diff \\\n",
+        "0 [{'change_type': 'MODIFY', 'old_path': 'cupy/c... \n",
+        "1 [{'change_type': 'MODIFY', 'old_path': 'tests/... \n",
+        "2 [{'change_type': 'MODIFY', 'old_path': 'numpy/... \n",
+        "3 [{'change_type': 'MODIFY', 'old_path': 'numpy/... \n",
+        "4 [{'change_type': 'MODIFY', 'old_path': 'numpy/... \n",
+        "\n",
+        " prediction_current \n",
+        "0 Extend memory management to consider CUDA stre... \n",
+        "1 Implement utility methods for parameterized te... \n",
+        "2 Update numpy function imports to use numpy as ... \n",
+        "3 Switch to using internal implementation method... \n",
+        "4 Add type hints and refine array API wrappers\\n... "
+       ],
+       "text/html": [
+        "<div>\n",
+        "<style scoped>\n",
+        " .dataframe tbody tr th:only-of-type {\n",
+        " vertical-align: middle;\n",
+        " }\n",
+        "\n",
+        " .dataframe tbody tr th {\n",
+        " vertical-align: top;\n",
+        " }\n",
+        "\n",
+        " .dataframe thead th {\n",
+        " text-align: right;\n",
+        " }\n",
+        "</style>\n",
+        "<table border=\"1\" class=\"dataframe\">\n",
+        " <thead>\n",
+        " <tr style=\"text-align: right;\">\n",
+        " <th></th>\n",
+        " <th>diff</th>\n",
+        " <th>prediction_current</th>\n",
+        " </tr>\n",
+        " </thead>\n",
+        " <tbody>\n",
+        " <tr>\n",
+        " <th>0</th>\n",
+        " <td>[{'change_type': 'MODIFY', 'old_path': 'cupy/c...</td>\n",
+        " <td>Extend memory management to consider CUDA stre...</td>\n",
+        " </tr>\n",
+        " <tr>\n",
+        " <th>1</th>\n",
+        " <td>[{'change_type': 'MODIFY', 'old_path': 'tests/...</td>\n",
+        " <td>Implement utility methods for parameterized te...</td>\n",
+        " </tr>\n",
+        " <tr>\n",
+        " <th>2</th>\n",
+        " <td>[{'change_type': 'MODIFY', 'old_path': 'numpy/...</td>\n",
+        " <td>Update numpy function imports to use numpy as ...</td>\n",
+        " </tr>\n",
+        " <tr>\n",
+        " <th>3</th>\n",
+        " <td>[{'change_type': 'MODIFY', 'old_path': 'numpy/...</td>\n",
+        " <td>Switch to using internal implementation method...</td>\n",
+        " </tr>\n",
+        " <tr>\n",
+        " <th>4</th>\n",
+        " <td>[{'change_type': 'MODIFY', 'old_path': 'numpy/...</td>\n",
+        " <td>Add type hints and refine array API wrappers\\n...</td>\n",
+        " </tr>\n",
+        " </tbody>\n",
+        "</table>\n",
+        "</div>"
+       ]
+      },
+      "execution_count": 5,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "execution_count": 5
+   },
+   {
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2024-06-20T16:21:20.410778Z",
+      "start_time": "2024-06-20T16:09:22.227258Z"
+     }
+    },
+    "cell_type": "code",
+    "source": "DATA[\"prediction_prod\"] = DATA.progress_apply(lambda row: generate_commit_message_prod(str(row[\"diff\"])), axis=1)",
+    "id": "9ded493e087f991d",
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "100%|██████████| 163/163 [11:58<00:00, 4.41s/it]\n"
+      ]
+     }
+    ],
+    "execution_count": 6
+   },
+   {
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2024-06-20T16:21:20.426781Z",
+      "start_time": "2024-06-20T16:21:20.414781Z"
+     }
+    },
+    "cell_type": "code",
+    "source": [
+     "current_avg_length = DATA[\"prediction_current\"].str.len().mean()\n",
+     "print(f\"Current average length: {current_avg_length}\")"
+    ],
+    "id": "ad38c2dce387f26d",
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Current average length: 625.5644171779142\n"
+      ]
+     }
+    ],
+    "execution_count": 7
+   },
+   {
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2024-06-20T16:21:20.442017Z",
+      "start_time": "2024-06-20T16:21:20.429913Z"
+     }
+    },
+    "cell_type": "code",
+    "source": [
+     "prod_avg_length = DATA[\"prediction_prod\"].str.len().mean()\n",
+     "print(f\"Prod average length: {prod_avg_length}\")"
+    ],
+    "id": "ec8b4412410794a4",
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Prod average length: 352.88957055214723\n"
+      ]
+     }
+    ],
+    "execution_count": 8
+   },
+   {
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2024-06-20T16:21:20.457884Z",
+      "start_time": "2024-06-20T16:21:20.444852Z"
+     }
+    },
+    "cell_type": "code",
+    "source": "print(f\"Length ratio (current / prod): {current_avg_length / prod_avg_length}\")",
+    "id": "10f087784896eca3",
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Length ratio (current / prod): 1.772691712591923\n"
+      ]
+     }
+    ],
+    "execution_count": 9
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 2
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython2",
+    "version": "2.7.6"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
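The comparison in the last three cells reduces to mean character lengths; a self-contained toy equivalent, with made-up strings standing in for the model outputs:

import pandas as pd

toy = pd.DataFrame({
    "prediction_current": ["Add feature X with extensive tests", "Refactor module Y for clarity"],
    "prediction_prod": ["Add feature X", "Refactor Y"],
})

current_avg = toy["prediction_current"].str.len().mean()
prod_avg = toy["prediction_prod"].str.len().mean()
print(f"Length ratio (current / prod): {current_avg / prod_avg}")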
generation_steps/__init__.py ADDED
File without changes
generation_steps/examples.py ADDED
@@ -0,0 +1,51 @@
+ import config
+ from api_wrappers import hf_data_loader
+
+ N_EXAMPLES = 15
+
+
+ def get_example_prompt_end_to_start(start_msg, end_msg):
+     return f"""START OF THE EXAMPLE
+
+ For the following edited commit message:
+ START OF THE EDITED COMMIT MESSAGE
+ {end_msg}
+ END OF THE EDITED COMMIT MESSAGE
+
+ You would output the following initial commit message:
+ START OF THE INITIAL COMMIT MESSAGE
+ {start_msg}
+ END OF THE INITIAL COMMIT MESSAGE
+
+ END OF THE EXAMPLE"""
+
+
+ def get_example_prompt_start_to_end(start_msg, end_msg):
+     return f"""START OF THE EXAMPLE
+
+ For the following LLM-generated commit message:
+ START OF THE GENERATED COMMIT MESSAGE
+ {start_msg}
+ END OF THE GENERATED COMMIT MESSAGE
+
+ You would output the following improved commit message:
+ START OF THE IMPROVED COMMIT MESSAGE
+ {end_msg}
+ END OF THE IMPROVED COMMIT MESSAGE
+
+ END OF THE EXAMPLE"""
+
+
+ manual_df = hf_data_loader.load_raw_rewriting_as_pandas()[["commit_msg_start", "commit_msg_end"]]
+ manual_df = manual_df.sample(n=N_EXAMPLES, random_state=config.RANDOM_STATE)
+
+
+ def generate_examples(end_to_start):
+     prompt_fn = get_example_prompt_end_to_start if end_to_start else get_example_prompt_start_to_end
+     examples = [prompt_fn(row["commit_msg_start"], row["commit_msg_end"]) for _, row in manual_df.iterrows()]
+
+     return "\n".join(examples)
+
+
+ EXAMPLES_END_TO_START = generate_examples(end_to_start=True)
+ EXAMPLES_START_TO_END = generate_examples(end_to_start=False)
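Assuming the module is importable (note that it samples its few-shot pairs from the rewriting dataset at import time, so the data must be reachable), a single example block can be previewed with a hypothetical start/end pair:

from generation_steps import examples

# Hypothetical messages, for illustration only.
print(examples.get_example_prompt_start_to_end(
    "fix bug",
    "fix: handle empty diff in the commit parser",
))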
generation_steps/for_labeling.py ADDED
@@ -0,0 +1,58 @@
+ import json
+
+ from tqdm import tqdm
+
+ import config
+ from api_wrappers import hf_data_loader
+ from generation_steps import synthetic_forward
+
+
+ def transform(df):
+     print("Generating data for labeling:")
+     synthetic_forward.print_config()
+     tqdm.pandas()
+
+     manual_df = hf_data_loader.load_raw_rewriting_as_pandas()
+
+     manual_df = manual_df.sample(frac=1, random_state=config.RANDOM_STATE).set_index(["hash", "repo"])[
+         ["commit_msg_start", "commit_msg_end"]
+     ]
+
+     manual_df = manual_df[~manual_df.index.duplicated(keep="first")]
+
+     def get_is_manually_rewritten(row):
+         commit_id = (row["hash"], row["repo"])
+         return commit_id in manual_df.index
+
+     result = df
+     result["manual_sample"] = result.progress_apply(get_is_manually_rewritten, axis=1)
+
+     def get_prediction_message(row):
+         commit_id = (row["hash"], row["repo"])
+         if row["manual_sample"]:
+             return manual_df.loc[commit_id]["commit_msg_start"]
+         return row["prediction"]
+
+     def get_enhanced_message(row):
+         commit_id = (row["hash"], row["repo"])
+         if row["manual_sample"]:
+             return manual_df.loc[commit_id]["commit_msg_end"]
+         return synthetic_forward.generate_end_msg(start_msg=row["prediction"], diff=row["mods"])
+
+     result["enhanced"] = result.progress_apply(get_enhanced_message, axis=1)
+     result["prediction"] = result.progress_apply(get_prediction_message, axis=1)
+     result["mods"] = result["mods"].progress_apply(json.dumps)
+
+     result.to_csv(config.DATA_FOR_LABELING_ARTIFACT)
+     print("Done")
+     return result
+
+
+ def main():
+     synthetic_forward.GENERATION_ATTEMPTS = 3
+     df = hf_data_loader.load_full_commit_with_predictions_as_pandas()
+     transform(df)
+
+
+ if __name__ == "__main__":
+     main()
generation_steps/metrics_analysis.py ADDED
@@ -0,0 +1,94 @@
+ import evaluate
+ from rapidfuzz.distance.Levenshtein import distance, normalized_similarity
+
+ import config
+
+ BLEU = evaluate.load("saridormi/b_norm", cache_dir=config.CACHE_DIR)
+
+
+ def bleu_fn(pred, ref, **kwargs):
+     if "refs" in kwargs:
+         return BLEU.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["b_norm"]
+     return BLEU.compute(predictions=[pred], references=[ref])["b_norm"]
+
+
+ METEOR = evaluate.load("meteor", cache_dir=config.CACHE_DIR)
+
+
+ def meteor_fn(pred, ref, **kwargs):
+     if "refs" in kwargs:
+         return METEOR.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["meteor"]
+     return METEOR.compute(predictions=[pred], references=[ref])["meteor"]
+
+
+ ROUGE = evaluate.load("rouge", cache_dir=config.CACHE_DIR)
+
+
+ def rouge1_fn(pred, ref, **kwargs):
+     if "refs" in kwargs:
+         return ROUGE.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["rouge1"]
+     return ROUGE.compute(predictions=[pred], references=[ref])["rouge1"]
+
+
+ def rouge2_fn(pred, ref, **kwargs):
+     if "refs" in kwargs:
+         return ROUGE.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["rouge2"]
+     return ROUGE.compute(predictions=[pred], references=[ref])["rouge2"]
+
+
+ def rougeL_fn(pred, ref, **kwargs):
+     if "refs" in kwargs:
+         return ROUGE.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["rougeL"]
+     return ROUGE.compute(predictions=[pred], references=[ref])["rougeL"]
+
+
+ BERTSCORE = evaluate.load("bertscore", cache_dir=config.CACHE_DIR)
+
+
+ def bertscore_fn(pred, ref, **kwargs):
+     if "refs" in kwargs:
+         return BERTSCORE.compute(predictions=[pred], references=[kwargs["refs"]], model_type="distilbert-base-uncased")[
+             "f1"
+         ][0]
+     return BERTSCORE.compute(predictions=[pred], references=[ref], model_type="distilbert-base-uncased")["f1"][0]
+
+
+ CHRF = evaluate.load("chrf", cache_dir=config.CACHE_DIR)
+
+
+ def chrf_fn(pred, ref, **kwargs):
+     if "refs" in kwargs:
+         return CHRF.compute(predictions=[pred], references=[kwargs["refs"]])["score"]
+     return CHRF.compute(predictions=[pred], references=[[ref]])["score"]
+
+
+ def edit_distance_fn(pred, ref, **kwargs):
+     if "refs" in kwargs:
+         scores = [distance(pred, ref) for ref in kwargs["refs"]]
+         return sum(scores) / len(scores)
+     return distance(pred, ref)
+
+
+ def edit_distance_norm_fn(pred, ref, **kwargs):
+     if "refs" in kwargs:
+         scores = [normalized_similarity(pred, ref) for ref in kwargs["refs"]]
+         return sum(scores) / len(scores)
+     return normalized_similarity(pred, ref)
+
+
+ AGGR_METRICS = {
+     "editdist": edit_distance_fn,
+     "editsim": edit_distance_norm_fn,
+     "bleu": bleu_fn,
+     "meteor": meteor_fn,
+     "rouge1": rouge1_fn,
+     "rouge2": rouge2_fn,
+     "rougeL": rougeL_fn,
+     "bertscore": bertscore_fn,
+     "chrF": chrf_fn,
+ }
+
+
+ REL_METRICS = {
+     "editdist": edit_distance_fn,
+ }
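All of these metric functions share one calling convention: `fn(pred, ref)` for a single reference, or `fn(pred, ref, refs=[...])` to aggregate over several. A sketch using the rapidfuzz-based pair, which needs no model downloads:

from rapidfuzz.distance.Levenshtein import distance, normalized_similarity

pred = "fix: handle empty diff"
ref = "fix: handle empty diffs"

print(distance(pred, ref))               # raw edit distance
print(normalized_similarity(pred, ref))  # in [0, 1], higher is more similar

# Multi-reference behaviour mirrors edit_distance_fn: average over refs.
refs = ["fix: handle empty diff in parser", "fix empty diff handling"]
print(sum(distance(pred, r) for r in refs) / len(refs))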
generation_steps/synthetic_backward.py ADDED
@@ -0,0 +1,111 @@
+ from itertools import chain
+
+ import pandas as pd
+ from tqdm import tqdm
+
+ import config
+ import dataset_statistics
+ from api_wrappers import grazie_wrapper, hf_data_loader
+ from generation_steps import examples
+
+ GENERATION_MULTIPLIER = 3
+ REL_INSERTIONS_THRESHOLD = 0.5
+ GENERATION_ATTEMPTS = 3
+
+
+ def build_prompt(reference, diff):
+     return f"""A software developer uses an LLM to generate commit messages.
+
+ They generated a commit message for the following source code changes:
+ START OF THE SOURCE CODE CHANGES
+ {diff}
+ END OF THE SOURCE CODE CHANGES
+
+ After generating the commit message, the developer understands that it is not perfect. After making some changes,
+ they come up with an edited version of the message. Here is this edited message:
+ START OF THE COMMIT MESSAGE
+ {reference}
+ END OF THE COMMIT MESSAGE
+
+ Your task is to print the initial, LLM-generated commit message.
+ The message you print must share some fragments with the edited message.
+ Here are some examples of what you should output:
+ START OF THE EXAMPLES LIST
+ {examples.EXAMPLES_END_TO_START}
+ END OF THE EXAMPLES LIST
+
+
+ Print only the initial commit message's text after the
+ token "OUTPUT".
+
+ OUTPUT"""
+
+
+ def generate_start_msg(end_msg, diff):
+     prompt = build_prompt(reference=end_msg, diff=diff)
+     results = []
+
+     for i in range(GENERATION_ATTEMPTS):
+         start_msg_pred = grazie_wrapper.generate_for_prompt(prompt)
+
+         stats = dataset_statistics.get_statistics_for_sample(
+             start_msg=start_msg_pred,
+             end_msg=end_msg,
+         )
+
+         # REL_INSERTIONS_THRESHOLD is a fraction, so compare the normalized
+         # insertion rate rather than the raw insertion count.
+         if stats["insertions_norm"] < REL_INSERTIONS_THRESHOLD:
+             return start_msg_pred
+         else:
+             results.append((stats["insertions_norm"], start_msg_pred))
+
+     results.sort()
+     return results[0][1]
+
+
+ COLS_TO_KEEP = ["hash", "repo", "commit_msg_end", "mods", "session"]
+
+ COLS_TO_DEFAULT = {"edit_time": None}
+
+
+ def transform(df):
+     print("End -> start synthesis:")
+     print(f"NUMBER OF EXAMPLES PER PROMPT = {examples.N_EXAMPLES}")
+     print(f"GENERATION_MULTIPLIER = {GENERATION_MULTIPLIER}")
+     print(f"REL_INSERTIONS_THRESHOLD = {REL_INSERTIONS_THRESHOLD}")
+     print(f"GENERATION_ATTEMPTS = {GENERATION_ATTEMPTS}")
+
+     df["end_to_start"] = False
+
+     generated_data = {"commit_msg_start": []}
+
+     for col in chain(COLS_TO_KEEP, COLS_TO_DEFAULT):
+         generated_data[col] = []
+
+     for _, row in tqdm(df.iterrows(), total=len(df)):
+         for i in range(GENERATION_MULTIPLIER):
+             commit_msg_start_pred = generate_start_msg(end_msg=row["commit_msg_end"], diff=row["mods"])
+
+             generated_data["commit_msg_start"].append(commit_msg_start_pred)
+             for col in COLS_TO_KEEP:
+                 generated_data[col].append(row[col])
+
+             for col in COLS_TO_DEFAULT:
+                 generated_data[col].append(COLS_TO_DEFAULT[col])
+
+     generated_df = pd.DataFrame.from_dict(generated_data)
+     generated_df["end_to_start"] = True
+
+     result = pd.concat([df, generated_df], ignore_index=True)
+     result.to_csv(config.END_TO_START_ARTIFACT)
+
+     print("Done")
+     return result
+
+
+ def main():
+     df = hf_data_loader.load_processed_rewriting_as_pandas()
+     transform(df)
+
+
+ if __name__ == "__main__":
+     main()
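`generate_start_msg` above and `generate_end_msg` in synthetic_forward.py below share one accept-or-retry pattern; a distilled sketch with the Grazie call stubbed out (`generate` and `score` are hypothetical placeholders, not repository functions):

def pick_candidate(generate, score, threshold, attempts=3):
    scored = []
    for _ in range(attempts):
        candidate = generate()
        s = score(candidate)
        if s < threshold:
            return candidate  # accept the first candidate under the threshold
        scored.append((s, candidate))
    scored.sort()  # otherwise fall back to the least-bad attempt
    return scored[0][1]

Returning early keeps API usage at one call in the common case, while the final sort gives a deterministic fallback once all attempts are exhausted.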
generation_steps/synthetic_forward.py ADDED
@@ -0,0 +1,107 @@
+ import pandas as pd
+ from tqdm import tqdm
+
+ import config
+ import dataset_statistics
+ from api_wrappers import grazie_wrapper
+ from generation_steps import examples
+
+ GENERATION_MULTIPLIER = 3
+ REL_DELETIONS_THRESHOLD = 0.75
+ GENERATION_ATTEMPTS = 3
+
+
+ def build_prompt(prediction, diff):
+     return f"""An LLM generated a commit message for the following source code changes:
+ START OF THE SOURCE CODE CHANGES
+ {diff}
+ END OF THE SOURCE CODE CHANGES
+
+ Here is the message the LLM generated:
+ START OF THE COMMIT MESSAGE
+ {prediction}
+ END OF THE COMMIT MESSAGE
+
+ This generated message is not perfect. Your task is to rewrite and improve it.
+ You have to simulate a human software developer who manually rewrites the LLM-generated commit message,
+ so the message you print must share some fragments with the generated message.
+ Your message should be concise.
+ Follow the Conventional Commits guidelines.
+ Here are some examples of what you should output:
+ START OF THE EXAMPLES LIST
+ {examples.EXAMPLES_START_TO_END}
+ END OF THE EXAMPLES LIST
+
+
+ Print only the improved commit message's text after the
+ token "OUTPUT".
+
+ OUTPUT"""
+
+
+ def generate_end_msg(start_msg, diff):
+     prompt = build_prompt(prediction=start_msg, diff=diff)
+     results = []
+
+     for i in range(GENERATION_ATTEMPTS):
+         end_msg_pred = grazie_wrapper.generate_for_prompt(prompt)
+
+         stats = dataset_statistics.get_statistics_for_sample(
+             start_msg=start_msg,
+             end_msg=end_msg_pred,
+         )
+         # REL_DELETIONS_THRESHOLD is a fraction, so compare the normalized
+         # deletion rate rather than the raw deletion count.
+         if stats["deletions_norm"] < REL_DELETIONS_THRESHOLD:
+             return end_msg_pred
+         else:
+             results.append((stats["deletions_norm"], end_msg_pred))
+
+     results.sort()
+     return results[0][1]
+
+
+ COLS_TO_KEEP = ["hash", "repo", "commit_msg_start", "mods", "session", "end_to_start"]
+
+
+ def print_config():
+     print(f"NUMBER OF EXAMPLES PER PROMPT = {examples.N_EXAMPLES}")
+     print(f"GENERATION_MULTIPLIER = {GENERATION_MULTIPLIER}")
+     print(f"REL_DELETIONS_THRESHOLD = {REL_DELETIONS_THRESHOLD}")
+     print(f"GENERATION_ATTEMPTS = {GENERATION_ATTEMPTS}")
+
+
+ def transform(df):
+     print("Start -> end synthesis:")
+     print_config()
+
+     df["start_to_end"] = False
+
+     generated_data = {"commit_msg_end": []}
+
+     for col in COLS_TO_KEEP:
+         generated_data[col] = []
+
+     for _, row in tqdm(df.iterrows(), total=len(df)):
+         for i in range(GENERATION_MULTIPLIER):
+             commit_msg_end_pred = generate_end_msg(start_msg=row["commit_msg_start"], diff=row["mods"])
+
+             generated_data["commit_msg_end"].append(commit_msg_end_pred)
+             for col in COLS_TO_KEEP:
+                 generated_data[col].append(row[col])
+
+     generated_df = pd.DataFrame.from_dict(generated_data)
+     generated_df["start_to_end"] = True
+
+     result = pd.concat([df, generated_df], ignore_index=True)
+     result.to_csv(config.START_TO_END_ARTIFACT)
+
+     print("Done")
+     return result
+
+
+ def main():
+     df = pd.read_csv(config.END_TO_START_ARTIFACT, index_col=[0])
+     transform(df)
+
+
+ if __name__ == "__main__":
+     main()
metrics_analysis.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,199 @@
+ [tool.poetry]
+ name = "commit-message-editing-visualization"
+ version = "0.1.0"
+ description = "Utilities for synthetic data generation, metrics analysis and visualization space for CMG Evaluation."
+ authors = ["Your Name <[email protected]>"]
+ license = "MIT"
+
+ [tool.poetry.dependencies]
+ python = "^3.9"
+ absl-py = "2.1.0"
+ aiofiles = "23.2.1"
+ aiohttp = "3.9.3"
+ aiosignal = "1.3.1"
+ altair = "5.3.0"
+ annotated-types = "0.6.0"
+ anyio = "4.3.0"
+ argon2-cffi = "23.1.0"
+ argon2-cffi-bindings = "21.2.0"
+ arrow = "1.3.0"
+ asttokens = "2.4.1"
+ async-lru = "2.0.4"
+ async-timeout = "4.0.3"
+ attrs = "23.2.0"
+ Babel = "2.14.0"
+ beautifulsoup4 = "4.12.3"
+ bert-score = "0.3.13"
+ bleach = "6.1.0"
+ cbor2 = "5.6.2"
+ certifi = "2024.2.2"
+ cffi = "1.16.0"
+ charset-normalizer = "3.3.2"
+ click = "8.1.7"
+ colorama = "0.4.6"
+ comm = "0.2.2"
+ contourpy = "1.2.1"
+ cycler = "0.12.1"
+ datasets = "2.18.0"
+ debugpy = "1.8.1"
+ decorator = "5.1.1"
+ defusedxml = "0.7.1"
+ diff-match-patch = "20230430"
+ dill = "0.3.8"
+ evaluate = "0.4.1"
+ exceptiongroup = "1.2.0"
+ executing = "2.0.1"
+ fastapi = "0.110.1"
+ fastjsonschema = "2.19.1"
+ ffmpy = "0.3.2"
+ filelock = "3.13.3"
+ fonttools = "4.50.0"
+ fqdn = "1.5.1"
+ frozenlist = "1.4.1"
+ fsspec = "2024.2.0"
+ gradio = "4.25.0"
+ gradio_client = "0.15.0"
+ h11 = "0.14.0"
+ httpcore = "1.0.5"
+ httpx = "0.27.0"
+ huggingface-hub = "0.22.2"
+ idna = "3.6"
+ importlib_metadata = "7.1.0"
+ importlib_resources = "6.4.0"
+ ipykernel = "6.29.4"
+ ipython = "8.18.1"
+ ipywidgets = "8.1.2"
+ isoduration = "20.11.0"
+ jedi = "0.19.1"
+ Jinja2 = "3.1.3"
+ joblib = "1.4.0"
+ json5 = "0.9.25"
+ jsonpointer = "2.4"
+ jsonschema = "4.21.1"
+ jsonschema-specifications = "2023.12.1"
+ kiwisolver = "1.4.5"
+ lxml = "5.2.1"
+ markdown-it-py = "3.0.0"
+ MarkupSafe = "2.1.5"
+ matplotlib = "3.8.4"
+ matplotlib-inline = "0.1.7"
+ mdurl = "0.1.2"
+ mistune = "3.0.2"
+ mpmath = "1.3.0"
+ multidict = "6.0.5"
+ multiprocess = "0.70.16"
+ nbclient = "0.10.0"
+ nbconvert = "7.16.4"
+ nbformat = "5.10.4"
+ nest-asyncio = "1.6.0"
+ networkx = "3.2.1"
+ nltk = "3.8.1"
+ numpy = "1.26.4"
+ orjson = "3.10.0"
+ overrides = "7.7.0"
+ packaging = "24.0"
+ pandas = "2.2.1"
+ pandocfilters = "1.5.1"
+ parso = "0.8.4"
+ pillow = "10.3.0"
+ platformdirs = "4.2.1"
+ portalocker = "2.8.2"
+ prometheus_client = "0.20.0"
+ prompt-toolkit = "3.0.43"
+ psutil = "5.9.8"
+ pure-eval = "0.2.2"
+ pyarrow = "15.0.2"
+ pyarrow-hotfix = "0.6"
+ pycparser = "2.22"
+ pydantic = "2.6.4"
+ pydantic_core = "2.16.3"
+ pydub = "0.25.1"
+ Pygments = "2.17.2"
+ pyparsing = "3.1.2"
+ python-dateutil = "2.9.0.post0"
+ python-json-logger = "2.0.7"
+ python-multipart = "0.0.9"
+ pytz = "2024.1"
+ PyYAML = "6.0.1"
+ pyzmq = "26.0.2"
+ rapidfuzz = "3.8.1"
+ referencing = "0.34.0"
+ regex = "2023.12.25"
+ requests = "2.31.0"
+ responses = "0.18.0"
+ rfc3339-validator = "0.1.4"
+ rfc3986-validator = "0.1.1"
+ rich = "13.7.1"
+ rouge-score = "0.1.2"
+ rpds-py = "0.18.0"
+ ruff = "0.3.5"
+ sacrebleu = "2.4.2"
+ safetensors = "0.4.2"
+ scikit-learn = "1.4.2"
+ scipy = "1.13.0"
+ semantic-version = "2.10.0"
+ Send2Trash = "1.8.3"
+ shellingham = "1.5.4"
+ six = "1.16.0"
+ sniffio = "1.3.1"
+ soupsieve = "2.5"
+ stack-data = "0.6.3"
+ starlette = "0.37.2"
+ sympy = "1.12"
+ tabulate = "0.9.0"
+ terminado = "0.18.1"
+ threadpoolctl = "3.4.0"
+ tinycss2 = "1.3.0"
+ tokenizers = "0.15.2"
+ tomli = "2.0.1"
+ tomlkit = "0.12.0"
+ toolz = "0.12.1"
+ torch = "2.2.2"
+ tornado = "6.4"
+ tqdm = "4.66.2"
+ traitlets = "5.14.3"
+ transformers = "4.39.3"
+ typer = "0.12.1"
+ types-python-dateutil = "2.9.0.20240316"
+ typing_extensions = "4.10.0"
+ tzdata = "2024.1"
+ uri-template = "1.3.0"
+ urllib3 = "2.2.1"
+ uvicorn = "0.29.0"
+ wcwidth = "0.2.13"
+ webcolors = "1.13"
+ webencodings = "0.5.1"
+ websocket-client = "1.8.0"
+ websockets = "11.0.3"
+ widgetsnbextension = "4.0.10"
+ xxhash = "3.4.1"
+ yarl = "1.9.4"
+ zipp = "3.18.1"
+ plotly = "5.22.0"
+ tenacity = "8.2.3"
+ Levenshtein = "0.25.1"
+ kaleido = "0.2.1"
+ jupyter = "^1.0.0"
+ grazie-api-gateway-client = {version = "^0.1.3", source = "space-grazie-ml"}
+ seaborn = "^0.13.2"
+
+ [tool.ruff]
+ line-length = 120
+ target-version = "py310"
+
+ [tool.ruff.lint]
+ extend-select = ["I"]
+
+ [tool.isort]
+ profile = "black"
+ force_sort_within_sections = true
+ order_by_type = true
+
+ [[tool.poetry.source]]
+ name = "space-grazie-ml"
+ url = "https://packages.jetbrains.team/pypi/p/grazi/grazie-ml/simple"
+ priority = "supplemental"
+
+ [build-system]
+ requires = ["poetry-core"]
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,167 @@
+ absl-py==2.1.0
+ aiofiles==23.2.1
+ aiohttp==3.9.3
+ aiosignal==1.3.1
+ altair==5.3.0
+ annotated-types==0.6.0
+ anyio==4.3.0
+ argon2-cffi==23.1.0
+ argon2-cffi-bindings==21.2.0
+ arrow==1.3.0
+ asttokens==2.4.1
+ async-lru==2.0.4
+ async-timeout==4.0.3
+ attrs==23.2.0
+ Babel==2.14.0
+ beautifulsoup4==4.12.3
+ bert-score==0.3.13
+ bleach==6.1.0
+ cbor2==5.6.2
+ certifi==2024.2.2
+ cffi==1.16.0
+ charset-normalizer==3.3.2
+ click==8.1.7
+ colorama==0.4.6
+ comm==0.2.2
+ contourpy==1.2.1
+ cycler==0.12.1
+ datasets==2.18.0
+ debugpy==1.8.1
+ decorator==5.1.1
+ defusedxml==0.7.1
+ diff-match-patch==20230430
+ dill==0.3.8
+ evaluate==0.4.1
+ exceptiongroup==1.2.0
+ executing==2.0.1
+ fastapi==0.110.1
+ fastjsonschema==2.19.1
+ ffmpy==0.3.2
+ filelock==3.13.3
+ fonttools==4.50.0
+ fqdn==1.5.1
+ frozenlist==1.4.1
+ fsspec==2024.2.0
+ gradio==4.25.0
+ gradio_client==0.15.0
+ h11==0.14.0
+ httpcore==1.0.5
+ httpx==0.27.0
+ huggingface-hub==0.22.2
+ idna==3.6
+ importlib_metadata==7.1.0
+ importlib_resources==6.4.0
+ ipykernel==6.29.4
+ ipython==8.18.1
+ ipywidgets==8.1.2
+ isoduration==20.11.0
+ jedi==0.19.1
+ Jinja2==3.1.3
+ joblib==1.4.0
+ json5==0.9.25
+ jsonpointer==2.4
+ jsonschema==4.21.1
+ jsonschema-specifications==2023.12.1
+ kiwisolver==1.4.5
+ lxml==5.2.1
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.8.4
+ matplotlib-inline==0.1.7
+ mdurl==0.1.2
+ mistune==3.0.2
+ mpmath==1.3.0
+ multidict==6.0.5
+ multiprocess==0.70.16
+ nbclient==0.10.0
+ nbconvert==7.16.4
+ nbformat==5.10.4
+ nest-asyncio==1.6.0
+ networkx==3.2.1
+ nltk==3.8.1
+ numpy==1.26.4
+ orjson==3.10.0
+ overrides==7.7.0
+ packaging==24.0
+ pandas==2.2.1
+ pandocfilters==1.5.1
+ parso==0.8.4
+ pillow==10.3.0
+ platformdirs==4.2.1
+ portalocker==2.8.2
+ prometheus_client==0.20.0
+ prompt-toolkit==3.0.43
+ psutil==5.9.8
+ pure-eval==0.2.2
+ pyarrow==15.0.2
+ pyarrow-hotfix==0.6
+ pycparser==2.22
+ pydantic==2.6.4
+ pydantic_core==2.16.3
+ pydub==0.25.1
+ Pygments==2.17.2
+ pyparsing==3.1.2
+ python-dateutil==2.9.0.post0
+ python-json-logger==2.0.7
+ python-multipart==0.0.9
+ pytz==2024.1
+ PyYAML==6.0.1
+ pyzmq==26.0.2
+ rapidfuzz==3.8.1
+ referencing==0.34.0
+ regex==2023.12.25
+ requests==2.31.0
+ responses==0.18.0
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ rich==13.7.1
+ rouge-score==0.1.2
+ rpds-py==0.18.0
+ ruff==0.3.5
+ sacrebleu==2.4.2
+ safetensors==0.4.2
+ scikit-learn==1.4.2
+ scipy==1.13.0
+ semantic-version==2.10.0
+ Send2Trash==1.8.3
+ shellingham==1.5.4
+ six==1.16.0
+ sniffio==1.3.1
+ soupsieve==2.5
+ stack-data==0.6.3
+ starlette==0.37.2
+ sympy==1.12
+ tabulate==0.9.0
+ terminado==0.18.1
+ threadpoolctl==3.4.0
+ tinycss2==1.3.0
+ tokenizers==0.15.2
+ tomli==2.0.1
+ tomlkit==0.12.0
+ toolz==0.12.1
+ torch==2.2.2
+ tornado==6.4
+ tqdm==4.66.2
+ traitlets==5.14.3
+ transformers==4.39.3
+ typer==0.12.1
+ types-python-dateutil==2.9.0.20240316
+ typing_extensions==4.10.0
+ tzdata==2024.1
+ uri-template==1.3.0
+ urllib3==2.2.1
+ uvicorn==0.29.0
+ wcwidth==0.2.13
+ webcolors==1.13
+ webencodings==0.5.1
+ websocket-client==1.8.0
+ websockets==11.0.3
+ widgetsnbextension==4.0.10
+ xxhash==3.4.1
+ yarl==1.9.4
+ zipp==3.18.1
+
+ plotly==5.22.0
+ tenacity==8.2.3
+ Levenshtein==0.25.1
+ kaleido==0.2.1
run_pipeline.py ADDED
@@ -0,0 +1,17 @@
+ import config
+ from api_wrappers import hf_data_loader
+ from generation_steps import metrics_analysis, synthetic_backward, synthetic_forward
+
+
+ def run():
+     df = hf_data_loader.load_processed_rewriting_as_pandas()
+
+     df = synthetic_backward.transform(df)  # end -> start synthesis
+     df = synthetic_forward.transform(df)  # start -> end synthesis
+     df = metrics_analysis.transform(df)
+
+     df.to_csv(config.SYNTHETIC_DATASET_ARTIFACT)
+
+
+ if __name__ == "__main__":
+     run()
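Since each step persists an intermediate CSV artifact, a run can in principle be resumed partway instead of recomputing the earlier steps; a sketch mirroring synthetic_forward.main(), assuming the end -> start artifact already exists on disk:

import pandas as pd

import config
from generation_steps import synthetic_forward

# Resume from the end -> start artifact and redo only the forward step.
df = pd.read_csv(config.END_TO_START_ARTIFACT, index_col=[0])
df = synthetic_forward.transform(df)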