Petr Tsvetkov committed
Commit 9513395 • 0 parents
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,282 @@
+ # Created by https://www.toptal.com/developers/gitignore/api/pycharm+all,venv,python
+ # Edit at https://www.toptal.com/developers/gitignore?templates=pycharm+all,venv,python
+
+ ### PyCharm+all ###
+ # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+ # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+ # User-specific stuff
+ .idea/**/workspace.xml
+ .idea/**/tasks.xml
+ .idea/**/usage.statistics.xml
+ .idea/**/dictionaries
+ .idea/**/shelf
+
+ # AWS User-specific
+ .idea/**/aws.xml
+
+ # Generated files
+ .idea/**/contentModel.xml
+
+ # Sensitive or high-churn files
+ .idea/**/dataSources/
+ .idea/**/dataSources.ids
+ .idea/**/dataSources.local.xml
+ .idea/**/sqlDataSources.xml
+ .idea/**/dynamic.xml
+ .idea/**/uiDesigner.xml
+ .idea/**/dbnavigator.xml
+
+ # Gradle
+ .idea/**/gradle.xml
+ .idea/**/libraries
+
+ # Gradle and Maven with auto-import
+ # When using Gradle or Maven with auto-import, you should exclude module files,
+ # since they will be recreated, and may cause churn. Uncomment if using
+ # auto-import.
+ # .idea/artifacts
+ # .idea/compiler.xml
+ # .idea/jarRepositories.xml
+ # .idea/modules.xml
+ # .idea/*.iml
+ # .idea/modules
+ # *.iml
+ # *.ipr
+
+ # CMake
+ cmake-build-*/
+
+ # Mongo Explorer plugin
+ .idea/**/mongoSettings.xml
+
+ # File-based project format
+ *.iws
+
+ # IntelliJ
+ out/
+
+ # mpeltonen/sbt-idea plugin
+ .idea_modules/
+
+ # JIRA plugin
+ atlassian-ide-plugin.xml
+
+ # Cursive Clojure plugin
+ .idea/replstate.xml
+
+ # SonarLint plugin
+ .idea/sonarlint/
+
+ # Crashlytics plugin (for Android Studio and IntelliJ)
+ com_crashlytics_export_strings.xml
+ crashlytics.properties
+ crashlytics-build.properties
+ fabric.properties
+
+ # Editor-based Rest Client
+ .idea/httpRequests
+
+ # Android studio 3.1+ serialized cache file
+ .idea/caches/build_file_checksums.ser
+
+ ### PyCharm+all Patch ###
+ # Ignore everything but code style settings and run configurations
+ # that are supposed to be shared within teams.
+
+ .idea/*
+
+ !.idea/codeStyles
+ !.idea/runConfigurations
+
+ ### Python ###
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ ### Python Patch ###
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+ poetry.toml
+
+ # ruff
+ .ruff_cache/
+
+ # LSP config files
+ pyrightconfig.json
+
+ ### venv ###
+ # Virtualenv
+ # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
+ [Bb]in
+ [Ii]nclude
+ [Ll]ib
+ [Ll]ib64
+ [Ll]ocal
+ [Ss]cripts
+ pyvenv.cfg
+ pip-selfcheck.json
+
+ # End of https://www.toptal.com/developers/gitignore/api/pycharm+all,venv,python
+
+ .idea
+
+ cache
+ output
+ data
README.md ADDED
@@ -0,0 +1,26 @@
+ ---
+ title: Commit Message Editing Visualization
+ emoji: 📈
+ sdk: gradio
+ sdk_version: 4.37.2
+ app_file: change_visualizer.py
+ ---
+
+ # Commit Message Editing Visualization ✍️🔍📊
+
+ This space provides a visualization app for exploring the commit message edits datasets (🤗 [expert-labeled](https://huggingface.co/datasets/JetBrains-Research/commit-msg-edits) and 🤗 [synthetic](https://huggingface.co/datasets/JetBrains-Research/synthetic-commit-msg-edits))
+ from the 📜 [Towards Realistic Evaluation of Commit Message Generation by Matching Online and Offline Settings](https://arxiv.org/abs/2410.12046) paper, and it also hosts important artifacts from our work.
+
+ ## Artifacts
+
+ * 📊 [`metrics_analysis.ipynb`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/metrics_analysis.ipynb) contains the code for metrics calculation and analysis;
+ * 📈 [`chart.ipynb`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/chart.ipynb) contains the code for Figure 4 with the edit distance distribution;
+ * 🗃️ [`data_stats.ipynb`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/data_stats.ipynb) contains the code for obtaining the dataset statistics from Table 1;
+ * ⬅️ [`generation_steps/synthetic_backward.py`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/generation_steps/synthetic_backward.py) contains the code for the *Synthetic Backward* generation proposed in our paper;
+ * ➡️ [`generation_steps/synthetic_forward.py`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/generation_steps/synthetic_forward.py) contains the code for the *Synthetic Forward* generation proposed in our paper.
+
+ ## Visualization
+
+ * 🔍 Click on the `Examples Exploration` tab to browse nicely formatted examples from our dataset.
+ * 📈 Click on the `Dataset Statistics` tab to see the major statistics for our dataset.
+ * 📊 Click on the `Experimental Results` tab to see additional metrics tested as target online metrics alongside our main edit distance results.
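For quick programmatic access, both datasets can also be pulled with 🤗 `datasets`. A minimal sketch; the `all_pairs` configuration name is taken from the notebooks in this repo, and the expert-labeled dataset may expose other configurations:

```python
from datasets import load_dataset

# Expert-labeled commit message edits
expert_df = load_dataset("JetBrains-Research/commit-msg-edits", split="train").to_pandas()

# Synthetic edits (configuration name as used in data_stats.ipynb)
synthetic_df = load_dataset("JetBrains-Research/synthetic-commit-msg-edits", "all_pairs", split="train").to_pandas()
```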
api_wrappers/__init__.py ADDED
File without changes
api_wrappers/grazie_wrapper.py ADDED
@@ -0,0 +1,64 @@
+ import pickle
+ import time
+
+ from grazie.api.client.chat.prompt import ChatPrompt
+ from grazie.api.client.endpoints import GrazieApiGatewayUrls
+ from grazie.api.client.gateway import AuthType, GrazieAgent, GrazieApiGatewayClient
+ from grazie.api.client.profiles import LLMProfile
+
+ import config
+
+ client = GrazieApiGatewayClient(
+     grazie_agent=GrazieAgent("grazie-toolformers", "v1.0"),
+     url=GrazieApiGatewayUrls.STAGING,
+     auth_type=AuthType.APPLICATION,
+     grazie_jwt_token=config.GRAZIE_API_JWT_TOKEN,
+ )
+
+ # On-disk cache: maps a prompt to the list of responses generated for it so far.
+ LLM_CACHE_FILE = config.CACHE_DIR / f"{config.LLM_MODEL}.cache.pkl"
+ LLM_CACHE = {}
+ LLM_CACHE_USED = {}
+
+ if not LLM_CACHE_FILE.exists():
+     with open(LLM_CACHE_FILE, "wb") as file:
+         pickle.dump(obj=LLM_CACHE, file=file)
+
+ with open(LLM_CACHE_FILE, "rb") as file:
+     LLM_CACHE = pickle.load(file=file)
+
+
+ def llm_request(prompt):
+     # Retry until the gateway returns a response, sleeping between failed attempts.
+     output = None
+
+     while output is None:
+         try:
+             output = client.chat(
+                 chat=ChatPrompt().add_system("You are a helpful assistant.").add_user(prompt),
+                 profile=LLMProfile(config.LLM_MODEL),
+             ).content
+         except Exception:
+             time.sleep(config.GRAZIE_TIMEOUT_SEC)
+
+     assert output is not None
+
+     return output
+
+
+ def generate_for_prompt(prompt):
+     if prompt not in LLM_CACHE:
+         LLM_CACHE[prompt] = []
+
+     if prompt not in LLM_CACHE_USED:
+         LLM_CACHE_USED[prompt] = 0
+
+     # Generate (and persist) new responses until an unused cached one is available.
+     while LLM_CACHE_USED[prompt] >= len(LLM_CACHE[prompt]):
+         new_response = llm_request(prompt)
+         LLM_CACHE[prompt].append(new_response)
+
+         with open(LLM_CACHE_FILE, "wb") as file:
+             pickle.dump(obj=LLM_CACHE, file=file)
+
+     result = LLM_CACHE[prompt][LLM_CACHE_USED[prompt]]
+     LLM_CACHE_USED[prompt] += 1
+
+     return result
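A minimal usage sketch, assuming the Grazie gateway client is installed and `GRAZIE_JWT_TOKEN` is exported so that `config.GRAZIE_API_JWT_TOKEN` is populated. Repeated calls with the same prompt first replay responses cached in `cache/<model>.cache.pkl` and only then request fresh ones, so each call yields an independent sample:

```python
from api_wrappers.grazie_wrapper import generate_for_prompt

prompt = "Rewrite this commit message to be more concise: Fix bug"  # hypothetical prompt
first = generate_for_prompt(prompt)   # response #1 (replayed from cache if present)
second = generate_for_prompt(prompt)  # response #2 -- a distinct sample for the same prompt
```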
api_wrappers/hf_data_loader.py ADDED
@@ -0,0 +1,120 @@
+ import json
+ import os
+ from datetime import datetime, timedelta
+
+ import pandas as pd
+ from datasets import load_dataset
+ from huggingface_hub import hf_hub_download, list_repo_tree
+
+ import config
+
+
+ def load_raw_rewriting_as_pandas():
+     return load_dataset(
+         config.HF_RAW_DATASET_NAME, split=config.HF_RAW_DATASET_SPLIT, token=config.HF_TOKEN, cache_dir=config.CACHE_DIR
+     ).to_pandas()
+
+
+ def load_full_commit_as_pandas():
+     return (
+         load_dataset(
+             path=config.HF_FULL_COMMITS_DATASET_NAME,
+             name=config.HF_FULL_COMMITS_DATASET_SUBNAME,
+             split=config.HF_FULL_COMMITS_DATASET_SPLIT,
+             cache_dir=config.CACHE_DIR,
+         )
+         .to_pandas()
+         .rename(columns={"message": "reference"})
+     )
+
+
+ def edit_time_from_history(history_str):
+     history = json.loads(history_str)
+
+     if len(history) == 0:
+         return 0
+
+     timestamps = list(map(lambda e: datetime.fromisoformat(e["ts"]), history))
+     delta = max(timestamps) - min(timestamps)
+
+     # Edit time in milliseconds.
+     return delta // timedelta(milliseconds=1)
+
+
+ def edit_time_from_timestamps(row):
+     loaded_ts = datetime.fromisoformat(row["loaded_ts"])
+     submitted_ts = datetime.fromisoformat(row["submitted_ts"])
+
+     delta = submitted_ts - loaded_ts
+
+     result = delta // timedelta(milliseconds=1)
+
+     return result if result >= 0 else None
+
+
+ def load_processed_rewriting_as_pandas():
+     manual_rewriting = load_raw_rewriting_as_pandas()[
+         [
+             "hash",
+             "repo",
+             "commit_msg_start",
+             "commit_msg_end",
+             "session",
+             "commit_msg_history",
+             "loaded_ts",
+             "submitted_ts",
+         ]
+     ]
+
+     manual_rewriting["edit_time_hist"] = manual_rewriting["commit_msg_history"].apply(edit_time_from_history)
+     manual_rewriting["edit_time"] = manual_rewriting.apply(edit_time_from_timestamps, axis=1)
+
+     # Drop the raw history and timestamp columns once the edit times are computed.
+     manual_rewriting.drop(columns=["commit_msg_history", "loaded_ts", "submitted_ts"], inplace=True)
+
+     manual_rewriting.set_index(["hash", "repo"], inplace=True)
+
+     mods_dataset = load_full_commit_as_pandas()[["hash", "repo", "mods"]]
+     mods_dataset.set_index(["hash", "repo"], inplace=True)
+
+     return manual_rewriting.join(other=mods_dataset, how="left").reset_index()
+
+
+ def load_synthetic_as_pandas():
+     return load_dataset(
+         config.HF_SYNTHETIC_DATASET_NAME,
+         "all_pairs_with_metrics",
+         split=config.HF_SYNTHETIC_DATASET_SPLIT,
+         token=config.HF_TOKEN,
+         cache_dir=config.CACHE_DIR,
+     ).to_pandas()
+
+
+ def load_full_commit_with_predictions_as_pandas():
+     full_dataset = load_full_commit_as_pandas()
+
+     predictions_paths = []
+     for prediction_file in list_repo_tree(
+         repo_id=config.HF_PREDICTIONS_DATASET_NAME,
+         path=os.path.join("commit_message_generation/predictions", config.HF_PREDICTIONS_MODEL),
+         repo_type="dataset",
+     ):
+         predictions_paths.append(
+             hf_hub_download(
+                 repo_id=config.HF_PREDICTIONS_DATASET_NAME,
+                 filename=prediction_file.path,
+                 repo_type="dataset",
+                 cache_dir=config.CACHE_DIR,
+             )
+         )
+
+     dfs = []
+     for path in predictions_paths:
+         dfs.append(pd.read_json(path, orient="records", lines=True))
+     predictions_dataset = pd.concat(dfs, axis=0, ignore_index=True)
+     predictions_dataset = predictions_dataset.sample(frac=1, random_state=config.RANDOM_STATE).set_index(
+         ["hash", "repo"]
+     )[["prediction"]]
+     predictions_dataset = predictions_dataset[~predictions_dataset.index.duplicated(keep="first")]
+
+     dataset = full_dataset.join(other=predictions_dataset, on=("hash", "repo"))
+
+     return dataset.reset_index()
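For reference, a sketch of how the loaders compose (column names follow the code above; network access and, where the datasets are gated, a valid `HF_TOKEN` are assumed):

```python
from api_wrappers import hf_data_loader

df = hf_data_loader.load_processed_rewriting_as_pandas()
# One row per rewriting session: the message before/after editing,
# edit times in milliseconds, and the commit mods joined in from CommitChronicle.
print(df[["hash", "repo", "commit_msg_start", "commit_msg_end", "edit_time", "edit_time_hist"]].head())
```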
change_visualizer.py ADDED
@@ -0,0 +1,353 @@
+ import gradio as gr
+ import numpy as np
+ import pandas as pd
+ import plotly.graph_objects as go
+ from datasets import load_dataset
+ from evaluate.utils import parse_readme
+ from scipy.stats import gaussian_kde, spearmanr
+
+ import generate_annotated_diffs
+ from api_wrappers import hf_data_loader
+ from generation_steps.metrics_analysis import AGGR_METRICS, edit_distance_fn
+
+ colors = {
+     "Expert-labeled": "#C19C0B",
+     "Synthetic Backward": "#913632",
+     "Synthetic Forward": "#58136a",
+     "Full": "#000000",
+ }
+
+ METRICS = {
+     "Edit Distance": "editdist",
+     "Edit Similarity": "editsim",
+     "BLEU": "bleu",
+     "METEOR": "meteor",
+     "ROUGE-1": "rouge1",
+     "ROUGE-2": "rouge2",
+     "ROUGE-L": "rougeL",
+     "BERTScore": "bertscore",
+     "ChrF": "chrF",
+ }
+
+
+ df_related = generate_annotated_diffs.data_with_annotated_diffs()
+
+
+ def golden():
+     return df_related.loc[(df_related["G_type"] == "initial") & (df_related["E_type"] == "expert_labeled")].reset_index(
+         drop=True
+     )
+
+
+ def backward():
+     return df_related.loc[
+         (df_related["G_type"] == "synthetic_backward") & (df_related["E_type"] == "expert_labeled")
+     ].reset_index(drop=True)
+
+
+ def forward():
+     return df_related.loc[
+         (df_related["G_type"] == "initial") & (df_related["E_type"] == "synthetic_forward")
+     ].reset_index(drop=True)
+
+
+ def forward_from_backward():
+     return df_related.loc[
+         (df_related.G_type == "synthetic_backward")
+         & (df_related.E_type.isin(["synthetic_forward", "synthetic_forward_from_backward"]))
+     ].reset_index(drop=True)
+
+
+ n_diffs_manual = len(golden())
+ n_diffs_synthetic_backward = len(backward())
+ n_diffs_synthetic_forward = len(forward())
+ n_diffs_synthetic_forward_backward = len(forward_from_backward())
+
+
+ def update_dataset_view(diff_idx, df):
+     diff_idx -= 1
+     return (
+         df.iloc[diff_idx]["annotated_diff"],
+         df.iloc[diff_idx]["commit_msg_start"] if "commit_msg_start" in df.columns else df.iloc[diff_idx]["G_text"],
+         df.iloc[diff_idx]["commit_msg_end"] if "commit_msg_end" in df.columns else df.iloc[diff_idx]["E_text"],
+         f"https://github.com/{df.iloc[diff_idx]['repo']}/commit/{df.iloc[diff_idx]['hash']}",
+     )
+
+
+ def update_dataset_view_manual(diff_idx):
+     return update_dataset_view(diff_idx, golden())
+
+
+ def update_dataset_view_synthetic_backward(diff_idx):
+     return update_dataset_view(diff_idx, backward())
+
+
+ def update_dataset_view_synthetic_forward(diff_idx):
+     return update_dataset_view(diff_idx, forward())
+
+
+ def update_dataset_view_synthetic_forward_backward(diff_idx):
+     return update_dataset_view(diff_idx, forward_from_backward())
+
+
+ def number_of_pairs_plot():
+     related_plot_dict = {
+         "Full": df_related,
+         "Synthetic Backward": backward(),
+         "Synthetic Forward": pd.concat([forward(), forward_from_backward()], axis=0, ignore_index=True),
+         "Expert-labeled": golden(),
+     }
+
+     df_unrelated = hf_data_loader.load_synthetic_as_pandas()
+     df_unrelated = df_unrelated.loc[~df_unrelated.is_related].copy()
+     unrelated_plot_dict = {
+         "Full": df_unrelated,
+         "Synthetic Backward": df_unrelated.loc[
+             (df_unrelated["G_type"] == "synthetic_backward")
+             & (~df_unrelated.E_type.isin(["synthetic_forward", "synthetic_forward_from_backward"]))
+         ],
+         "Synthetic Forward": df_unrelated.loc[
+             ((df_unrelated["G_type"] == "initial") & (df_unrelated["E_type"] == "synthetic_forward"))
+             | (
+                 (df_unrelated["G_type"] == "synthetic_backward")
+                 & (df_unrelated["E_type"].isin(["synthetic_forward", "synthetic_forward_from_backward"]))
+             )
+         ],
+         "Expert-labeled": df_unrelated.loc[
+             (df_unrelated.G_type == "initial") & (df_unrelated.E_type == "expert_labeled")
+         ],
+     }
+
+     traces = []
+
+     for split in related_plot_dict.keys():
+         related_count = len(related_plot_dict[split])
+         unrelated_count = len(unrelated_plot_dict[split])
+
+         traces.append(
+             go.Bar(
+                 name=f"{split} - Related pairs",
+                 x=[split],
+                 y=[related_count],
+                 marker=dict(
+                     color=colors[split],
+                 ),
+             )
+         )
+
+         traces.append(
+             go.Bar(
+                 name=f"{split} - Conditionally independent pairs",
+                 x=[split],
+                 y=[unrelated_count],
+                 marker=dict(
+                     color=colors[split],
+                     pattern=dict(
+                         shape="/",  # Diagonal hatching
+                         fillmode="overlay",
+                         solidity=0.5,
+                     ),
+                 ),
+             )
+         )
+
+     fig = go.Figure(data=traces)
+
+     fig.update_layout(
+         barmode="stack",
+         bargap=0.2,
+         xaxis=dict(title="Split", showgrid=True, gridcolor="lightgrey"),
+         yaxis=dict(title="Number of Examples", showgrid=True, gridcolor="lightgrey"),
+         legend=dict(title="Pair Type", orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
+         plot_bgcolor="rgba(0,0,0,0)",
+         paper_bgcolor="rgba(0,0,0,0)",
+         width=1100,
+     )
+     return fig
+
+
+ def edit_distance_plot():
+     df_edit_distance = {
+         "Full": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for _, row in df_related.iterrows()],
+         "Synthetic Backward": [
+             edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for _, row in backward().iterrows()
+         ],
+         "Synthetic Forward": [
+             edit_distance_fn(pred=row["G_text"], ref=row["E_text"])
+             for _, row in pd.concat([forward(), forward_from_backward()], axis=0, ignore_index=True).iterrows()
+         ],
+         "Expert-labeled": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for _, row in golden().iterrows()],
+     }
+     traces = []
+
+     for key in df_edit_distance:
+         kde_x = np.linspace(0, 1200, 1000)
+         kde = gaussian_kde(df_edit_distance[key])
+         kde_line = go.Scatter(x=kde_x, y=kde(kde_x), mode="lines", name=key, line=dict(color=colors[key], width=5))
+         traces.append(kde_line)
+
+     fig = go.Figure(data=traces)
+
+     fig.update_layout(
+         bargap=0.1,
+         xaxis=dict(title=dict(text="Edit Distance"), range=[0, 1200], showgrid=True, gridcolor="lightgrey"),
+         yaxis=dict(
+             title=dict(text="Probability Density"),
+             range=[0, 0.004],
+             showgrid=True,
+             gridcolor="lightgrey",
+             tickvals=[0.0005, 0.001, 0.0015, 0.002, 0.0025, 0.003, 0.0035, 0.004],
+             tickformat=".4f",
+         ),
+         plot_bgcolor="rgba(0,0,0,0)",
+         paper_bgcolor="rgba(0,0,0,0)",
+         width=1100,
+     )
+     return fig
+
+
+ def get_correlations_table(online_metric_name: str) -> pd.DataFrame:
+     df = load_dataset(
+         "JetBrains-Research/synthetic-commit-msg-edits", "all_pairs_with_metrics_other_online_metrics", split="train"
+     ).to_pandas()
+     corr_df = (
+         df.loc[~df.is_related]
+         .groupby(["G_text", "G_type", "hash", "repo"] + [f"online_{online_metric_name}"])
+         .apply(lambda g: g.to_dict(orient="records"), include_groups=False)
+         .reset_index(name="unrelated_pairs")
+         .copy()
+     )
+     _ = corr_df.copy()
+     for metric in AGGR_METRICS:
+         # Edit distance is a "lower is better" metric, so aggregate with min; the rest with max.
+         if metric in ["editdist"]:
+             _[metric] = _.unrelated_pairs.apply(lambda pairs: min(pair[metric] for pair in pairs))
+         else:
+             _[metric] = _.unrelated_pairs.apply(lambda pairs: max(pair[metric] for pair in pairs))
+
+     results = []
+
+     for metric in AGGR_METRICS:
+         x = _[metric].to_numpy()
+         y = _[f"online_{online_metric_name}"].to_numpy()
+         corr, p_value = spearmanr(x, y)
+         results.append({"metric": metric, "corr": corr, "p_value": p_value})
+
+     __ = pd.DataFrame(results)
+     __["p_value"] = ["< 0.05" if p < 0.05 else p for p in __.p_value]
+     __["corr_abs"] = abs(__["corr"])
+     __["corr"] = __["corr"].round(2)
+     __["metric"] = __["metric"].map({v: k for k, v in METRICS.items()})
+     return (
+         __.sort_values(by=["corr_abs"], ascending=False)
+         .drop(columns=["corr_abs"])
+         .rename(columns={"metric": "Metric m", "corr": "Correlation Q(m, m*)", "p_value": "p-value"})
+     )
+
+
+ force_light_theme_js_func = """
+ function refresh() {
+     const url = new URL(window.location);
+
+     if (url.searchParams.get('__theme') !== 'light') {
+         url.searchParams.set('__theme', 'light');
+         window.location.href = url.href;
+     }
+ }
+ """
+
+ if __name__ == "__main__":
+     with gr.Blocks(theme=gr.themes.Soft(), js=force_light_theme_js_func) as application:
+         gr.Markdown(parse_readme("README.md"))
+
+         def dataset_view_tab(n_items):
+             slider = gr.Slider(minimum=1, maximum=n_items, step=1, value=1, label=f"Sample number (total: {n_items})")
+
+             diff_view = gr.Highlightedtext(combine_adjacent=True, color_map={"+": "green", "-": "red"})
+             start_view = gr.Textbox(interactive=False, label="Initial message G", container=True)
+             end_view = gr.Textbox(interactive=False, label="Edited message E", container=True)
+             link_view = gr.Markdown()
+
+             view = [diff_view, start_view, end_view, link_view]
+
+             return slider, view
+
+         with gr.Tab("Examples Exploration"):
+             with gr.Tab("Manual"):
+                 slider_manual, view_manual = dataset_view_tab(n_diffs_manual)
+
+                 slider_manual.change(update_dataset_view_manual, inputs=slider_manual, outputs=view_manual)
+
+             with gr.Tab("Synthetic Backward"):
+                 slider_synthetic_backward, view_synthetic_backward = dataset_view_tab(n_diffs_synthetic_backward)
+
+                 slider_synthetic_backward.change(
+                     update_dataset_view_synthetic_backward,
+                     inputs=slider_synthetic_backward,
+                     outputs=view_synthetic_backward,
+                 )
+
+             with gr.Tab("Synthetic Forward (from initial)"):
+                 slider_synthetic_forward, view_synthetic_forward = dataset_view_tab(n_diffs_synthetic_forward)
+
+                 slider_synthetic_forward.change(
+                     update_dataset_view_synthetic_forward,
+                     inputs=slider_synthetic_forward,
+                     outputs=view_synthetic_forward,
+                 )
+
+             with gr.Tab("Synthetic Forward (from backward)"):
+                 slider_synthetic_forward_backward, view_synthetic_forward_backward = dataset_view_tab(
+                     n_diffs_synthetic_forward_backward
+                 )
+
+                 slider_synthetic_forward_backward.change(
+                     update_dataset_view_synthetic_forward_backward,
+                     inputs=slider_synthetic_forward_backward,
+                     outputs=view_synthetic_forward_backward,
+                 )
+
+         with gr.Tab("Dataset Statistics"):
+             gr.Markdown("## Number of examples per split")
+
+             number_of_pairs_gr_plot = gr.Plot(number_of_pairs_plot, label=None)
+
+             gr.Markdown("## Edit Distance Distribution (w/o PyCharm Logs)")
+
+             edit_distance_gr_plot = gr.Plot(edit_distance_plot(), label=None)
+
+         with gr.Tab("Experimental Results"):
+             gr.Markdown(
+                 "Here, we provide the additional experimental results with different text similarity metrics used as the target online metric, "
+                 "in addition to edit distance between generated messages G and their edited counterparts E."
+             )
+
+             gr.Markdown(
+                 "Please select one of the available metrics **m*** below to see the correlations **Q(m, m\*)** of offline text similarity metrics with **m*** as an online metric."
+             )
+
+             for metric in METRICS:
+                 with gr.Tab(metric):
+                     gr.Markdown(
+                         f"The table below presents the correlation coefficients **Q(m, m\*)** where {metric} is used as an online metric **m***."
+                     )
+
+                     result_df = get_correlations_table(METRICS[metric])
+                     gr.DataFrame(result_df)
+
+         application.load(update_dataset_view_manual, inputs=slider_manual, outputs=view_manual)
+
+         application.load(
+             update_dataset_view_synthetic_backward, inputs=slider_synthetic_backward, outputs=view_synthetic_backward
+         )
+
+         application.load(
+             update_dataset_view_synthetic_forward, inputs=slider_synthetic_forward, outputs=view_synthetic_forward
+         )
+
+         application.load(
+             update_dataset_view_synthetic_forward_backward,
+             inputs=slider_synthetic_forward_backward,
+             outputs=view_synthetic_forward_backward,
+         )
+
+     application.launch()
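Locally, the app starts as a plain Gradio script with `python change_visualizer.py` (a sketch of the assumptions: the datasets above are reachable and the `generation_steps` module from this repo is importable). The correlation tables can also be computed headlessly; note that importing the module loads the annotated-diff dataset at import time:

```python
from change_visualizer import get_correlations_table

# Spearman correlations of offline metrics with BLEU used as the online metric m*
print(get_correlations_table("bleu"))
```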
chart.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
config.py ADDED
@@ -0,0 +1,39 @@
+ import os
+ from pathlib import Path
+
+ RANDOM_STATE = 42
+
+ GRAZIE_API_JWT_TOKEN = os.environ.get("GRAZIE_JWT_TOKEN")
+ GRAZIE_TIMEOUT_SEC = 1.0
+
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+
+ HF_RAW_DATASET_NAME = "JetBrains-Research/commit-msg-rewriting"
+ HF_RAW_DATASET_SPLIT = "train"
+
+ HF_FULL_COMMITS_DATASET_NAME = "JetBrains-Research/lca-commit-message-generation"
+ HF_FULL_COMMITS_DATASET_SUBNAME = "commitchronicle-py-long"
+ HF_FULL_COMMITS_DATASET_SPLIT = "test"
+
+ HF_PREDICTIONS_DATASET_NAME = "JetBrains-Research/lca-results"
+ HF_PREDICTIONS_MODEL = "gpt_4_0613"
+
+ HF_SYNTHETIC_DATASET_NAME = "JetBrains-Research/synthetic-commit-msg-rewriting"
+ HF_SYNTHETIC_DATASET_SPLIT = "train"
+
+ LLM_MODEL = "gpt-4-1106-preview"
+
+ CACHE_DIR = Path("cache")
+ CACHE_DIR.mkdir(exist_ok=True)
+
+ OUTPUT_DIR = Path("output")
+ OUTPUT_DIR.mkdir(exist_ok=True)
+
+ END_TO_START_ARTIFACT = OUTPUT_DIR / "end_to_start.csv"
+ START_TO_END_ARTIFACT = OUTPUT_DIR / "start_to_end.csv"
+ SYNTHETIC_DATASET_ARTIFACT = OUTPUT_DIR / "synthetic.csv"
+ METRICS_CORRELATIONS_ARTIFACT = OUTPUT_DIR / "metrics_correlations.csv"
+ DATA_FOR_LABELING_ARTIFACT = OUTPUT_DIR / "data_for_labeling.csv"
+
+ OUTPUT_CHARTS_DIR = OUTPUT_DIR / "charts"
+ OUTPUT_CHARTS_DIR.mkdir(exist_ok=True)
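Note that importing `config` has side effects: it reads the tokens from the environment and creates the `cache/` and `output/` directories. A minimal sketch of the expected environment (token values are placeholders):

```python
import os

# Placeholders -- substitute real tokens before importing config.
os.environ.setdefault("GRAZIE_JWT_TOKEN", "<your Grazie JWT token>")
os.environ.setdefault("HF_TOKEN", "<your Hugging Face token>")

import config

print(config.CACHE_DIR.resolve(), config.OUTPUT_DIR.resolve())
```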
data_stats.ipynb ADDED
@@ -0,0 +1,759 @@
# Data Stats

In [3]:
from datasets import load_dataset


df = load_dataset("JetBrains-Research/synthetic-commit-msg-edits", "all_pairs", split="train").to_pandas()
df.head()

Out[3]:
                                       hash              repo                                             G_text                                             E_text              G_type                           E_type  is_related
0  2febb99eee8ed71c9122db88ca58dd33be0b9550  mesonbuild/meson  Enhance OptionOverrideProxy and simplify optio...  Enhance OptionOverrideProxy for multiple optio...  synthetic_backward                   expert_labeled        True
1  2febb99eee8ed71c9122db88ca58dd33be0b9550  mesonbuild/meson  Enhance OptionOverrideProxy and simplify optio...  Refactor OptionOverrideProxy and Backend class...  synthetic_backward                synthetic_forward        True
2  2febb99eee8ed71c9122db88ca58dd33be0b9550  mesonbuild/meson  Enhance OptionOverrideProxy and simplify optio...  Refactor OptionOverrideProxy and backend optio...  synthetic_backward                synthetic_forward        True
3  2febb99eee8ed71c9122db88ca58dd33be0b9550  mesonbuild/meson  Enhance OptionOverrideProxy and simplify optio...  Refactor: Enhance OptionOverrideProxy for mult...  synthetic_backward                synthetic_forward        True
4  2febb99eee8ed71c9122db88ca58dd33be0b9550  mesonbuild/meson  Enhance OptionOverrideProxy and simplify optio...  Refactor OptionOverrideProxy and add target-sp...  synthetic_backward  synthetic_forward_from_backward       False

## Full

In [4]: len(df.loc[df.is_related])
Out[4]: 656

In [5]: df.loc[df.is_related].groupby(["hash", "repo"]).G_text.count().mean()
Out[5]: 43.733333333333334

In [6]: len(df.loc[~df.is_related])
Out[6]: 5140

In [7]: df.loc[~df.is_related].groupby(["hash", "repo"]).G_text.count().mean()
Out[7]: 342.6666666666667

## Expert-labeled

In [36]: _ = df.loc[(df.G_type == "initial") & (df.E_type == "expert_labeled")]

In [37]: len(_.loc[_.is_related])
Out[37]: 57

In [38]: _.loc[_.is_related].groupby(["hash", "repo"]).G_text.count().mean()
Out[38]: 3.8

In [39]: len(_.loc[~_.is_related])
Out[39]: 0

In [40]: _.loc[~_.is_related].groupby(["hash", "repo"]).G_text.count().mean()
Out[40]: nan

## Backward

In [20]: _ = df.loc[(df.G_type == "synthetic_backward") & (~df.E_type.isin(["synthetic_forward", "synthetic_forward_from_backward"]))]

In [21]: len(_.loc[_.is_related])
Out[21]: 104

In [22]: _.loc[_.is_related].groupby(["hash", "repo"]).G_text.count().mean()
Out[22]: 7.428571428571429

In [23]: len(_.loc[~_.is_related])
Out[23]: 1048

In [24]: _.loc[~_.is_related].groupby(["hash", "repo"]).G_text.count().mean()
Out[24]: 74.85714285714286

## Forward

### From human

In [41]: _ = df.loc[(df.G_type == "initial") & (df.E_type == "synthetic_forward")]

In [42]: len(_.loc[_.is_related])
Out[42]: 177

In [43]: _.loc[_.is_related].groupby(["hash", "repo"]).G_text.count().mean()
Out[43]: 11.8

In [44]: len(_.loc[~_.is_related])
Out[44]: 0

In [45]: _.loc[~_.is_related].groupby(["hash", "repo"]).G_text.count().mean()
Out[45]: nan

### From backward

In [53]: _ = df.loc[(df.G_type == "synthetic_backward") & (df.E_type.isin(["synthetic_forward", "synthetic_forward_from_backward"]))]

In [56]: len(_.loc[_.is_related])
Out[56]: 318

In [57]: _.loc[_.is_related].groupby(["hash", "repo"]).G_text.count().mean()
Out[57]: 22.714285714285715

In [58]: len(_.loc[~_.is_related])
Out[58]: 3753

In [59]: _.loc[~_.is_related].groupby(["hash", "repo"]).G_text.count().mean()
Out[59]: 268.07142857142856
dataset_statistics.py ADDED
@@ -0,0 +1,71 @@
+ import pickle
+
+ import Levenshtein
+ import numpy as np
+ import pandas as pd
+ import plotly.figure_factory as ff
+
+ import config
+
+
+ def get_statistics_for_sample(start_msg, end_msg, row=None):
+     edit_ops = Levenshtein.editops(start_msg, end_msg)
+     n_deletes = sum([1 if op == "delete" else 0 for op, _, _ in edit_ops])
+     n_inserts = sum([1 if op == "insert" else 0 for op, _, _ in edit_ops])
+     n_replaces = sum([1 if op == "replace" else 0 for op, _, _ in edit_ops])
+
+     # A replace counts as both a deletion and an insertion.
+     n_changes = n_deletes + n_inserts + n_replaces
+     n_deletes += n_replaces
+     n_inserts += n_replaces
+
+     return {
+         "deletions": n_deletes,
+         "insertions": n_inserts,
+         "changes": n_changes,
+         "deletions_norm": n_deletes / len(start_msg),
+         "insertions_norm": n_inserts / len(end_msg),
+         "changes_norm": n_changes / len(end_msg),
+         "lendiff": abs(len(start_msg) - len(end_msg)),
+         "editdist": row["editdist"] if row is not None else Levenshtein.distance(start_msg, end_msg),
+     }
+
+
+ def get_statistics_for_row(row):
+     if "commit_msg_start" in row:
+         start = row["commit_msg_start"]
+     else:
+         start = row["G_text"]
+     if "commit_msg_end" in row:
+         end = row["commit_msg_end"]
+     else:
+         end = row["E_text"]
+     return get_statistics_for_sample(start, end, row=row)
+
+
+ def get_statistics_for_df(df: pd.DataFrame):
+     stats = [get_statistics_for_row(row) for _, row in df.iterrows()]
+
+     assert len(stats) > 0
+
+     return {stat_name: np.asarray([e[stat_name] for e in stats]) for stat_name in stats[0]}
+
+
+ def build_plotly_chart(stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, stat_name):
+     hist_data = [
+         stat_golden,
+         stat_e2s,
+         stat_s2e,
+         stat_e2s_s2e,
+         np.concatenate((stat_e2s, stat_s2e, stat_e2s_s2e), axis=0),
+     ]
+
+     group_labels = ["Golden", "e2s", "s2e", "e2s+s2e", "Synthetic"]
+
+     fig = ff.create_distplot(hist_data, group_labels, bin_size=0.05, show_rug=False, show_hist=False)
+
+     fig.update_layout(title_text=stat_name)
+
+     with open(config.OUTPUT_CHARTS_DIR / f"{stat_name}_data.pkl", "wb") as f:
+         pickle.dump(hist_data, f)
+
+     return fig
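A worked example of the per-sample statistics (hypothetical messages; recall that replace operations are counted into both deletions and insertions):

```python
from dataset_statistics import get_statistics_for_sample

stats = get_statistics_for_sample("Fix bug", "Fix parser bug")
# 7 inserted characters ("parser "), nothing deleted or replaced, so roughly:
# {'deletions': 0, 'insertions': 7, 'changes': 7, 'deletions_norm': 0.0,
#  'insertions_norm': 0.5, 'changes_norm': 0.5, 'lendiff': 7, 'editdist': 7}
print(stats)
```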
generate_annotated_diffs.py ADDED
@@ -0,0 +1,38 @@
1
+ import diff_match_patch as dmp_module
2
+ from tqdm import tqdm
3
+
4
+ from api_wrappers import hf_data_loader
5
+
6
+
7
+ def get_annotated_diff(start_text, end_text):
8
+ dmp = dmp_module.diff_match_patch()
9
+ dmp_mapping = {-1: "-", 0: None, 1: "+"}
10
+
11
+ diff = dmp.diff_main(start_text, end_text)
12
+ dmp.diff_cleanupSemantic(diff)
13
+
14
+ result = [[w, dmp_mapping[t]] for t, w in diff]
15
+
16
+ return result
17
+
18
+
19
+ def annotated_diff_for_row(row):
20
+ if "commit_msg_start" in row:
21
+ start = row["commit_msg_start"]
22
+ else:
23
+ start = row["G_text"]
24
+ if "commit_msg_end" in row:
25
+ end = row["commit_msg_end"]
26
+ else:
27
+ end = row["E_text"]
28
+ return get_annotated_diff(start, end)
29
+
30
+
31
+ def data_with_annotated_diffs():
32
+ tqdm.pandas()
33
+
34
+ df = hf_data_loader.load_synthetic_as_pandas()
35
+ df = df.loc[df.is_related].copy()
36
+ annotated = df.progress_apply(annotated_diff_for_row, axis=1)
37
+ df["annotated_diff"] = annotated
38
+ return df
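A quick sketch of the `[text, tag]` pairs `get_annotated_diff` produces, using `diff-match-patch` directly on toy messages:

import diff_match_patch as dmp_module

dmp = dmp_module.diff_match_patch()
diff = dmp.diff_main("Fix typo in README", "Fix typos in README and docs")
dmp.diff_cleanupSemantic(diff)

# diff_main yields (op, text) tuples with op in {-1, 0, 1};
# get_annotated_diff remaps these tags to "-", None and "+".
print(diff)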
generated_message_length_comparison.ipynb ADDED
@@ -0,0 +1,314 @@
+ {
+  "cells": [
+   {
+    "metadata": {},
+    "cell_type": "markdown",
+    "source": [
+     "### How to run\n",
+     "\n",
+     "* Install libraries using the cell below (for grazie-api-gateway-client you will have to add a custom JB repository)\n",
+     "* Put the production prompt into the file `data/prod_prompt.txt`\n",
+     "* Environment variables:\n",
+     "  - `GRAZIE_API_JWT_TOKEN` -- JWT token for grazie (check `api_wrappers/grazie_wrapper.py` to adjust the client initialization if necessary)\n",
+     "  - `HF_TOKEN` -- should _not_ be required; however, if it is, set it to a valid Hugging Face token"
+    ],
+    "id": "77d51d55b41735cf"
+   },
+   {
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2024-06-20T16:09:07.968406Z",
+      "start_time": "2024-06-20T16:09:07.955405Z"
+     }
+    },
+    "cell_type": "code",
+    "source": [
+     "# !pip install grazie-api-gateway-client\n",
+     "# !pip install tqdm\n",
+     "# !pip install pandas\n",
+     "# !pip install datasets"
+    ],
+    "id": "91fa273e8987f6f6",
+    "outputs": [],
+    "execution_count": 1
+   },
+   {
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2024-06-20T16:09:10.353479Z",
+      "start_time": "2024-06-20T16:09:07.970405Z"
+     }
+    },
+    "cell_type": "code",
+    "source": [
+     "from api_wrappers.grazie_wrapper import generate_for_prompt\n",
+     "from api_wrappers.hf_data_loader import load_full_commit_with_predictions_as_pandas\n",
+     "from tqdm import tqdm\n",
+     "\n",
+     "tqdm.pandas()"
+    ],
+    "id": "ce11a4c781c152e",
+    "outputs": [],
+    "execution_count": 2
+   },
+   {
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2024-06-20T16:09:10.368996Z",
+      "start_time": "2024-06-20T16:09:10.354434Z"
+     }
+    },
+    "cell_type": "code",
+    "source": [
+     "with open(\"data/prod_prompt.txt\") as f:\n",
+     "\tPROD_PROMPT = f.read().strip()\n",
+     "\n",
+     "def prod_prompt(diff):\n",
+     "\treturn PROD_PROMPT.replace(\"$diff\", diff).replace(\"$text\", \"\")\n",
+     "\n",
+     "def generate_commit_message_prod(diff):\n",
+     "\treturn generate_for_prompt(prod_prompt(diff))"
+    ],
+    "id": "84a769c8765a7b64",
+    "outputs": [],
+    "execution_count": 3
+   },
+   {
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2024-06-20T16:09:10.384590Z",
+      "start_time": "2024-06-20T16:09:10.371410Z"
+     }
+    },
+    "cell_type": "code",
+    "source": "generate_commit_message_prod(\"TEST\")",
+    "id": "af2f20def94b0490",
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "\"Certainly! I'll need to see the specific code differences (diffs) you would like to have summarized into a commit message. Please provide the diffs so I can assist you properly.\""
+       ]
+      },
+      "execution_count": 4,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "execution_count": 4
+   },
+   {
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2024-06-20T16:09:22.224167Z",
+      "start_time": "2024-06-20T16:09:10.388409Z"
+     }
+    },
+    "cell_type": "code",
+    "source": [
+     "DATA = load_full_commit_with_predictions_as_pandas()[[\"mods\", \"prediction\"]].rename(columns={\"mods\": \"diff\", \"prediction\": \"prediction_current\"})\n",
+     "DATA.head()"
+    ],
+    "id": "a49cabf576c9d692",
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "Using the latest cached version of the dataset since JetBrains-Research/lca-commit-message-generation couldn't be found on the Hugging Face Hub\n",
+       "Found the latest cached dataset configuration 'commitchronicle-py-long' at cache\\JetBrains-Research___lca-commit-message-generation\\commitchronicle-py-long\\0.0.0\\58dcef83a63cccebacd3e786afd73181cc9175e5 (last modified on Sun Apr 7 11:16:22 2024).\n",
+       "Using the latest cached version of the dataset since JetBrains-Research/lca-results couldn't be found on the Hugging Face Hub\n",
+       "Found the latest cached dataset configuration 'cmg_gpt_4_0613' at cache\\JetBrains-Research___lca-results\\cmg_gpt_4_0613\\0.0.0\\4b56bbf7243da371b3e0a42a0c9db1f37af98c39 (last modified on Fri May 31 16:00:33 2024).\n"
+      ]
+     },
+     {
+      "data": {
+       "text/plain": [
+        " diff \\\n",
+        "0 [{'change_type': 'MODIFY', 'old_path': 'cupy/c... \n",
+        "1 [{'change_type': 'MODIFY', 'old_path': 'tests/... \n",
+        "2 [{'change_type': 'MODIFY', 'old_path': 'numpy/... \n",
+        "3 [{'change_type': 'MODIFY', 'old_path': 'numpy/... \n",
+        "4 [{'change_type': 'MODIFY', 'old_path': 'numpy/... \n",
+        "\n",
+        " prediction_current \n",
+        "0 Extend memory management to consider CUDA stre... \n",
+        "1 Implement utility methods for parameterized te... \n",
+        "2 Update numpy function imports to use numpy as ... \n",
+        "3 Switch to using internal implementation method... \n",
+        "4 Add type hints and refine array API wrappers\\n... "
+       ],
+       "text/html": [
+        "<div>\n",
+        "<style scoped>\n",
+        " .dataframe tbody tr th:only-of-type {\n",
+        " vertical-align: middle;\n",
+        " }\n",
+        "\n",
+        " .dataframe tbody tr th {\n",
+        " vertical-align: top;\n",
+        " }\n",
+        "\n",
+        " .dataframe thead th {\n",
+        " text-align: right;\n",
+        " }\n",
+        "</style>\n",
+        "<table border=\"1\" class=\"dataframe\">\n",
+        " <thead>\n",
+        " <tr style=\"text-align: right;\">\n",
+        " <th></th>\n",
+        " <th>diff</th>\n",
+        " <th>prediction_current</th>\n",
+        " </tr>\n",
+        " </thead>\n",
+        " <tbody>\n",
+        " <tr>\n",
+        " <th>0</th>\n",
+        " <td>[{'change_type': 'MODIFY', 'old_path': 'cupy/c...</td>\n",
+        " <td>Extend memory management to consider CUDA stre...</td>\n",
+        " </tr>\n",
+        " <tr>\n",
+        " <th>1</th>\n",
+        " <td>[{'change_type': 'MODIFY', 'old_path': 'tests/...</td>\n",
+        " <td>Implement utility methods for parameterized te...</td>\n",
+        " </tr>\n",
+        " <tr>\n",
+        " <th>2</th>\n",
+        " <td>[{'change_type': 'MODIFY', 'old_path': 'numpy/...</td>\n",
+        " <td>Update numpy function imports to use numpy as ...</td>\n",
+        " </tr>\n",
+        " <tr>\n",
+        " <th>3</th>\n",
+        " <td>[{'change_type': 'MODIFY', 'old_path': 'numpy/...</td>\n",
+        " <td>Switch to using internal implementation method...</td>\n",
+        " </tr>\n",
+        " <tr>\n",
+        " <th>4</th>\n",
+        " <td>[{'change_type': 'MODIFY', 'old_path': 'numpy/...</td>\n",
+        " <td>Add type hints and refine array API wrappers\\n...</td>\n",
+        " </tr>\n",
+        " </tbody>\n",
+        "</table>\n",
+        "</div>"
+       ]
+      },
+      "execution_count": 5,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "execution_count": 5
+   },
+   {
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2024-06-20T16:21:20.410778Z",
+      "start_time": "2024-06-20T16:09:22.227258Z"
+     }
+    },
+    "cell_type": "code",
+    "source": "DATA[\"prediction_prod\"] = DATA.progress_apply(lambda row: generate_commit_message_prod(str(row[\"diff\"])), axis=1)",
+    "id": "9ded493e087f991d",
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "100%|██████████| 163/163 [11:58<00:00, 4.41s/it]\n"
+      ]
+     }
+    ],
+    "execution_count": 6
+   },
+   {
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2024-06-20T16:21:20.426781Z",
+      "start_time": "2024-06-20T16:21:20.414781Z"
+     }
+    },
+    "cell_type": "code",
+    "source": [
+     "current_avg_length = DATA[\"prediction_current\"].str.len().mean()\n",
+     "print(f\"Current average length: {current_avg_length}\")"
+    ],
+    "id": "ad38c2dce387f26d",
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Current average length: 625.5644171779142\n"
+      ]
+     }
+    ],
+    "execution_count": 7
+   },
+   {
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2024-06-20T16:21:20.442017Z",
+      "start_time": "2024-06-20T16:21:20.429913Z"
+     }
+    },
+    "cell_type": "code",
+    "source": [
+     "prod_avg_length = DATA[\"prediction_prod\"].str.len().mean()\n",
+     "print(f\"Prod average length: {prod_avg_length}\")"
+    ],
+    "id": "ec8b4412410794a4",
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Prod average length: 352.88957055214723\n"
+      ]
+     }
+    ],
+    "execution_count": 8
+   },
+   {
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2024-06-20T16:21:20.457884Z",
+      "start_time": "2024-06-20T16:21:20.444852Z"
+     }
+    },
+    "cell_type": "code",
+    "source": "print(f\"Length ratio (current / prod): {current_avg_length / prod_avg_length}\")",
+    "id": "10f087784896eca3",
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Length ratio (current / prod): 1.772691712591923\n"
+      ]
+     }
+    ],
+    "execution_count": 9
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 2
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython2",
+    "version": "2.7.6"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
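The comparison in the last three cells reduces to mean character lengths; a self-contained toy equivalent, with made-up strings standing in for the model outputs:

import pandas as pd

toy = pd.DataFrame({
    "prediction_current": ["Add feature X with extensive tests", "Refactor module Y for clarity"],
    "prediction_prod": ["Add feature X", "Refactor Y"],
})

current_avg = toy["prediction_current"].str.len().mean()
prod_avg = toy["prediction_prod"].str.len().mean()
print(f"Length ratio (current / prod): {current_avg / prod_avg}")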
generation_steps/__init__.py ADDED
File without changes
generation_steps/examples.py ADDED
@@ -0,0 +1,51 @@
+ import config
+ from api_wrappers import hf_data_loader
+
+ N_EXAMPLES = 15
+
+
+ def get_example_prompt_end_to_start(start_msg, end_msg):
+     return f"""START OF THE EXAMPLE
+
+ For the following edited commit message:
+ START OF THE EDITED COMMIT MESSAGE
+ {end_msg}
+ END OF THE EDITED COMMIT MESSAGE
+
+ You would output the following initial commit message:
+ START OF THE INITIAL COMMIT MESSAGE
+ {start_msg}
+ END OF THE INITIAL COMMIT MESSAGE
+
+ END OF THE EXAMPLE"""
+
+
+ def get_example_prompt_start_to_end(start_msg, end_msg):
+     return f"""START OF THE EXAMPLE
+
+ For the following LLM-generated commit message:
+ START OF THE GENERATED COMMIT MESSAGE
+ {start_msg}
+ END OF THE GENERATED COMMIT MESSAGE
+
+ You would output the following improved commit message:
+ START OF THE IMPROVED COMMIT MESSAGE
+ {end_msg}
+ END OF THE IMPROVED COMMIT MESSAGE
+
+ END OF THE EXAMPLE"""
+
+
+ manual_df = hf_data_loader.load_raw_rewriting_as_pandas()[["commit_msg_start", "commit_msg_end"]]
+ manual_df = manual_df.sample(n=N_EXAMPLES, random_state=config.RANDOM_STATE)
+
+
+ def generate_examples(end_to_start):
+     prompt_fn = get_example_prompt_end_to_start if end_to_start else get_example_prompt_start_to_end
+     examples = [prompt_fn(row["commit_msg_start"], row["commit_msg_end"]) for _, row in manual_df.iterrows()]
+
+     return "\n".join(examples)
+
+
+ EXAMPLES_END_TO_START = generate_examples(end_to_start=True)
+ EXAMPLES_START_TO_END = generate_examples(end_to_start=False)
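Assuming the module is importable (note that it samples its few-shot pairs from the rewriting dataset at import time, so the data must be reachable), a single example block can be previewed with a hypothetical start/end pair:

from generation_steps import examples

# Hypothetical messages, for illustration only.
print(examples.get_example_prompt_start_to_end(
    "fix bug",
    "fix: handle empty diff in the commit parser",
))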
generation_steps/for_labeling.py ADDED
@@ -0,0 +1,58 @@
+ import json
+
+ from tqdm import tqdm
+
+ import config
+ from api_wrappers import hf_data_loader
+ from generation_steps import synthetic_forward
+
+
+ def transform(df):
+     print("Generating data for labeling:")
+     synthetic_forward.print_config()
+     tqdm.pandas()
+
+     manual_df = hf_data_loader.load_raw_rewriting_as_pandas()
+
+     manual_df = manual_df.sample(frac=1, random_state=config.RANDOM_STATE).set_index(["hash", "repo"])[
+         ["commit_msg_start", "commit_msg_end"]
+     ]
+
+     manual_df = manual_df[~manual_df.index.duplicated(keep="first")]
+
+     def get_is_manually_rewritten(row):
+         commit_id = (row["hash"], row["repo"])
+         return commit_id in manual_df.index
+
+     result = df
+     result["manual_sample"] = result.progress_apply(get_is_manually_rewritten, axis=1)
+
+     def get_prediction_message(row):
+         commit_id = (row["hash"], row["repo"])
+         if row["manual_sample"]:
+             return manual_df.loc[commit_id]["commit_msg_start"]
+         return row["prediction"]
+
+     def get_enhanced_message(row):
+         commit_id = (row["hash"], row["repo"])
+         if row["manual_sample"]:
+             return manual_df.loc[commit_id]["commit_msg_end"]
+         return synthetic_forward.generate_end_msg(start_msg=row["prediction"], diff=row["mods"])
+
+     result["enhanced"] = result.progress_apply(get_enhanced_message, axis=1)
+     result["prediction"] = result.progress_apply(get_prediction_message, axis=1)
+     result["mods"] = result["mods"].progress_apply(json.dumps)
+
+     result.to_csv(config.DATA_FOR_LABELING_ARTIFACT)
+     print("Done")
+     return result
+
+
+ def main():
+     synthetic_forward.GENERATION_ATTEMPTS = 3
+     df = hf_data_loader.load_full_commit_with_predictions_as_pandas()
+     transform(df)
+
+
+ if __name__ == "__main__":
+     main()
generation_steps/metrics_analysis.py ADDED
@@ -0,0 +1,94 @@
+ import evaluate
+ from rapidfuzz.distance.Levenshtein import distance, normalized_similarity
+
+ import config
+
+ BLEU = evaluate.load("saridormi/b_norm", cache_dir=config.CACHE_DIR)
+
+
+ def bleu_fn(pred, ref, **kwargs):
+     if "refs" in kwargs:
+         return BLEU.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["b_norm"]
+     return BLEU.compute(predictions=[pred], references=[ref])["b_norm"]
+
+
+ METEOR = evaluate.load("meteor", cache_dir=config.CACHE_DIR)
+
+
+ def meteor_fn(pred, ref, **kwargs):
+     if "refs" in kwargs:
+         return METEOR.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["meteor"]
+     return METEOR.compute(predictions=[pred], references=[ref])["meteor"]
+
+
+ ROUGE = evaluate.load("rouge", cache_dir=config.CACHE_DIR)
+
+
+ def rouge1_fn(pred, ref, **kwargs):
+     if "refs" in kwargs:
+         return ROUGE.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["rouge1"]
+     return ROUGE.compute(predictions=[pred], references=[ref])["rouge1"]
+
+
+ def rouge2_fn(pred, ref, **kwargs):
+     if "refs" in kwargs:
+         return ROUGE.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["rouge2"]
+     return ROUGE.compute(predictions=[pred], references=[ref])["rouge2"]
+
+
+ def rougeL_fn(pred, ref, **kwargs):
+     if "refs" in kwargs:
+         return ROUGE.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["rougeL"]
+     return ROUGE.compute(predictions=[pred], references=[ref])["rougeL"]
+
+
+ BERTSCORE = evaluate.load("bertscore", cache_dir=config.CACHE_DIR)
+
+
+ def bertscore_fn(pred, ref, **kwargs):
+     if "refs" in kwargs:
+         return BERTSCORE.compute(predictions=[pred], references=[kwargs["refs"]], model_type="distilbert-base-uncased")[
+             "f1"
+         ][0]
+     return BERTSCORE.compute(predictions=[pred], references=[ref], model_type="distilbert-base-uncased")["f1"][0]
+
+
+ CHRF = evaluate.load("chrf", cache_dir=config.CACHE_DIR)
+
+
+ def chrf_fn(pred, ref, **kwargs):
+     if "refs" in kwargs:
+         return CHRF.compute(predictions=[pred], references=[kwargs["refs"]])["score"]
+     return CHRF.compute(predictions=[pred], references=[[ref]])["score"]
+
+
+ def edit_distance_fn(pred, ref, **kwargs):
+     if "refs" in kwargs:
+         scores = [distance(pred, ref) for ref in kwargs["refs"]]
+         return sum(scores) / len(scores)
+     return distance(pred, ref)
+
+
+ def edit_distance_norm_fn(pred, ref, **kwargs):
+     if "refs" in kwargs:
+         scores = [normalized_similarity(pred, ref) for ref in kwargs["refs"]]
+         return sum(scores) / len(scores)
+     return normalized_similarity(pred, ref)
+
+
+ AGGR_METRICS = {
+     "editdist": edit_distance_fn,
+     "editsim": edit_distance_norm_fn,
+     "bleu": bleu_fn,
+     "meteor": meteor_fn,
+     "rouge1": rouge1_fn,
+     "rouge2": rouge2_fn,
+     "rougeL": rougeL_fn,
+     "bertscore": bertscore_fn,
+     "chrF": chrf_fn,
+ }
+
+
+ REL_METRICS = {
+     "editdist": edit_distance_fn,
+ }
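All of these metric functions share one calling convention: `fn(pred, ref)` for a single reference, or `fn(pred, ref, refs=[...])` to aggregate over several. A sketch using the rapidfuzz-based pair, which needs no model downloads:

from rapidfuzz.distance.Levenshtein import distance, normalized_similarity

pred = "fix: handle empty diff"
ref = "fix: handle empty diffs"

print(distance(pred, ref))               # raw edit distance
print(normalized_similarity(pred, ref))  # in [0, 1], higher is more similar

# Multi-reference behaviour mirrors edit_distance_fn: average over refs.
refs = ["fix: handle empty diff in parser", "fix empty diff handling"]
print(sum(distance(pred, r) for r in refs) / len(refs))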
generation_steps/synthetic_backward.py ADDED
@@ -0,0 +1,111 @@
+ from itertools import chain
+
+ import pandas as pd
+ from tqdm import tqdm
+
+ import config
+ import dataset_statistics
+ from api_wrappers import grazie_wrapper, hf_data_loader
+ from generation_steps import examples
+
+ GENERATION_MULTIPLIER = 3
+ REL_INSERTIONS_THRESHOLD = 0.5
+ GENERATION_ATTEMPTS = 3
+
+
+ def build_prompt(reference, diff):
+     return f"""A software developer uses an LLM to generate commit messages.
+
+ They generated a commit message for the following source code changes:
+ START OF THE SOURCE CODE CHANGES
+ {diff}
+ END OF THE SOURCE CODE CHANGES
+
+ After generating the commit message, the developer understands that it is not perfect. After making some changes,
+ they come up with an edited version of the message. Here is this edited message:
+ START OF THE COMMIT MESSAGE
+ {reference}
+ END OF THE COMMIT MESSAGE
+
+ Your task is to print the initial, LLM-generated commit message.
+ The message you print must share some fragments with the edited message.
+ Here are some examples of what you should output:
+ START OF THE EXAMPLES LIST
+ {examples.EXAMPLES_END_TO_START}
+ END OF THE EXAMPLES LIST
+
+
+ Print only the initial commit message's text after the
+ token "OUTPUT".
+
+ OUTPUT"""
+
+
+ def generate_start_msg(end_msg, diff):
+     prompt = build_prompt(reference=end_msg, diff=diff)
+     results = []
+
+     for i in range(GENERATION_ATTEMPTS):
+         start_msg_pred = grazie_wrapper.generate_for_prompt(prompt)
+
+         stats = dataset_statistics.get_statistics_for_sample(
+             start_msg=start_msg_pred,
+             end_msg=end_msg,
+         )
+
+         # REL_INSERTIONS_THRESHOLD is a fraction, so compare the normalized
+         # insertion rate rather than the raw insertion count.
+         if stats["insertions_norm"] < REL_INSERTIONS_THRESHOLD:
+             return start_msg_pred
+         else:
+             results.append((stats["insertions_norm"], start_msg_pred))
+
+     results.sort()
+     return results[0][1]
+
+
+ COLS_TO_KEEP = ["hash", "repo", "commit_msg_end", "mods", "session"]
+
+ COLS_TO_DEFAULT = {"edit_time": None}
+
+
+ def transform(df):
+     print("End -> start synthesis:")
+     print(f"NUMBER OF EXAMPLES PER PROMPT = {examples.N_EXAMPLES}")
+     print(f"GENERATION_MULTIPLIER = {GENERATION_MULTIPLIER}")
+     print(f"REL_INSERTIONS_THRESHOLD = {REL_INSERTIONS_THRESHOLD}")
+     print(f"GENERATION_ATTEMPTS = {GENERATION_ATTEMPTS}")
+
+     df["end_to_start"] = False
+
+     generated_data = {"commit_msg_start": []}
+
+     for col in chain(COLS_TO_KEEP, COLS_TO_DEFAULT):
+         generated_data[col] = []
+
+     for _, row in tqdm(df.iterrows(), total=len(df)):
+         for i in range(GENERATION_MULTIPLIER):
+             commit_msg_start_pred = generate_start_msg(end_msg=row["commit_msg_end"], diff=row["mods"])
+
+             generated_data["commit_msg_start"].append(commit_msg_start_pred)
+             for col in COLS_TO_KEEP:
+                 generated_data[col].append(row[col])
+
+             for col in COLS_TO_DEFAULT:
+                 generated_data[col].append(COLS_TO_DEFAULT[col])
+
+     generated_df = pd.DataFrame.from_dict(generated_data)
+     generated_df["end_to_start"] = True
+
+     result = pd.concat([df, generated_df], ignore_index=True)
+     result.to_csv(config.END_TO_START_ARTIFACT)
+
+     print("Done")
+     return result
+
+
+ def main():
+     df = hf_data_loader.load_processed_rewriting_as_pandas()
+     transform(df)
+
+
+ if __name__ == "__main__":
+     main()
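`generate_start_msg` above and `generate_end_msg` in synthetic_forward.py below share one accept-or-retry pattern; a distilled sketch with the Grazie call stubbed out (`generate` and `score` are hypothetical placeholders, not repository functions):

def pick_candidate(generate, score, threshold, attempts=3):
    scored = []
    for _ in range(attempts):
        candidate = generate()
        s = score(candidate)
        if s < threshold:
            return candidate  # accept the first candidate under the threshold
        scored.append((s, candidate))
    scored.sort()  # otherwise fall back to the least-bad attempt
    return scored[0][1]

Returning early keeps API usage at one call in the common case, while the final sort gives a deterministic fallback once all attempts are exhausted.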
generation_steps/synthetic_forward.py ADDED
@@ -0,0 +1,107 @@
+ import pandas as pd
+ from tqdm import tqdm
+
+ import config
+ import dataset_statistics
+ from api_wrappers import grazie_wrapper
+ from generation_steps import examples
+
+ GENERATION_MULTIPLIER = 3
+ REL_DELETIONS_THRESHOLD = 0.75
+ GENERATION_ATTEMPTS = 3
+
+
+ def build_prompt(prediction, diff):
+     return f"""An LLM generated a commit message for the following source code changes:
+ START OF THE SOURCE CODE CHANGES
+ {diff}
+ END OF THE SOURCE CODE CHANGES
+
+ Here is the message the LLM generated:
+ START OF THE COMMIT MESSAGE
+ {prediction}
+ END OF THE COMMIT MESSAGE
+
+ This generated message is not perfect. Your task is to rewrite and improve it.
+ You have to simulate a human software developer who manually rewrites the LLM-generated commit message,
+ so the message you print must share some fragments with the generated message.
+ Your message should be concise.
+ Follow the Conventional Commits guidelines.
+ Here are some examples of what you should output:
+ START OF THE EXAMPLES LIST
+ {examples.EXAMPLES_START_TO_END}
+ END OF THE EXAMPLES LIST
+
+
+ Print only the improved commit message's text after the
+ token "OUTPUT".
+
+ OUTPUT"""
+
+
+ def generate_end_msg(start_msg, diff):
+     prompt = build_prompt(prediction=start_msg, diff=diff)
+     results = []
+
+     for i in range(GENERATION_ATTEMPTS):
+         end_msg_pred = grazie_wrapper.generate_for_prompt(prompt)
+
+         stats = dataset_statistics.get_statistics_for_sample(
+             start_msg=start_msg,
+             end_msg=end_msg_pred,
+         )
+         # REL_DELETIONS_THRESHOLD is a fraction, so compare the normalized
+         # deletion rate rather than the raw deletion count.
+         if stats["deletions_norm"] < REL_DELETIONS_THRESHOLD:
+             return end_msg_pred
+         else:
+             results.append((stats["deletions_norm"], end_msg_pred))
+
+     results.sort()
+     return results[0][1]
+
+
+ COLS_TO_KEEP = ["hash", "repo", "commit_msg_start", "mods", "session", "end_to_start"]
+
+
+ def print_config():
+     print(f"NUMBER OF EXAMPLES PER PROMPT = {examples.N_EXAMPLES}")
+     print(f"GENERATION_MULTIPLIER = {GENERATION_MULTIPLIER}")
+     print(f"REL_DELETIONS_THRESHOLD = {REL_DELETIONS_THRESHOLD}")
+     print(f"GENERATION_ATTEMPTS = {GENERATION_ATTEMPTS}")
+
+
+ def transform(df):
+     print("Start -> end synthesis:")
+     print_config()
+
+     df["start_to_end"] = False
+
+     generated_data = {"commit_msg_end": []}
+
+     for col in COLS_TO_KEEP:
+         generated_data[col] = []
+
+     for _, row in tqdm(df.iterrows(), total=len(df)):
+         for i in range(GENERATION_MULTIPLIER):
+             commit_msg_end_pred = generate_end_msg(start_msg=row["commit_msg_start"], diff=row["mods"])
+
+             generated_data["commit_msg_end"].append(commit_msg_end_pred)
+             for col in COLS_TO_KEEP:
+                 generated_data[col].append(row[col])
+
+     generated_df = pd.DataFrame.from_dict(generated_data)
+     generated_df["start_to_end"] = True
+
+     result = pd.concat([df, generated_df], ignore_index=True)
+     result.to_csv(config.START_TO_END_ARTIFACT)
+
+     print("Done")
+     return result
+
+
+ def main():
+     df = pd.read_csv(config.END_TO_START_ARTIFACT, index_col=[0])
+     transform(df)
+
+
+ if __name__ == "__main__":
+     main()
metrics_analysis.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,199 @@
+ [tool.poetry]
+ name = "commit-message-editing-visualization"
+ version = "0.1.0"
+ description = "Utilities for synthetic data generation, metrics analysis and visualization space for CMG Evaluation."
+ authors = ["Your Name <[email protected]>"]
+ license = "MIT"
+
+ [tool.poetry.dependencies]
+ python = "^3.9"
+ absl-py = "2.1.0"
+ aiofiles = "23.2.1"
+ aiohttp = "3.9.3"
+ aiosignal = "1.3.1"
+ altair = "5.3.0"
+ annotated-types = "0.6.0"
+ anyio = "4.3.0"
+ argon2-cffi = "23.1.0"
+ argon2-cffi-bindings = "21.2.0"
+ arrow = "1.3.0"
+ asttokens = "2.4.1"
+ async-lru = "2.0.4"
+ async-timeout = "4.0.3"
+ attrs = "23.2.0"
+ Babel = "2.14.0"
+ beautifulsoup4 = "4.12.3"
+ bert-score = "0.3.13"
+ bleach = "6.1.0"
+ cbor2 = "5.6.2"
+ certifi = "2024.2.2"
+ cffi = "1.16.0"
+ charset-normalizer = "3.3.2"
+ click = "8.1.7"
+ colorama = "0.4.6"
+ comm = "0.2.2"
+ contourpy = "1.2.1"
+ cycler = "0.12.1"
+ datasets = "2.18.0"
+ debugpy = "1.8.1"
+ decorator = "5.1.1"
+ defusedxml = "0.7.1"
+ diff-match-patch = "20230430"
+ dill = "0.3.8"
+ evaluate = "0.4.1"
+ exceptiongroup = "1.2.0"
+ executing = "2.0.1"
+ fastapi = "0.110.1"
+ fastjsonschema = "2.19.1"
+ ffmpy = "0.3.2"
+ filelock = "3.13.3"
+ fonttools = "4.50.0"
+ fqdn = "1.5.1"
+ frozenlist = "1.4.1"
+ fsspec = "2024.2.0"
+ gradio = "4.25.0"
+ gradio_client = "0.15.0"
+ h11 = "0.14.0"
+ httpcore = "1.0.5"
+ httpx = "0.27.0"
+ huggingface-hub = "0.22.2"
+ idna = "3.6"
+ importlib_metadata = "7.1.0"
+ importlib_resources = "6.4.0"
+ ipykernel = "6.29.4"
+ ipython = "8.18.1"
+ ipywidgets = "8.1.2"
+ isoduration = "20.11.0"
+ jedi = "0.19.1"
+ Jinja2 = "3.1.3"
+ joblib = "1.4.0"
+ json5 = "0.9.25"
+ jsonpointer = "2.4"
+ jsonschema = "4.21.1"
+ jsonschema-specifications = "2023.12.1"
+ kiwisolver = "1.4.5"
+ lxml = "5.2.1"
+ markdown-it-py = "3.0.0"
+ MarkupSafe = "2.1.5"
+ matplotlib = "3.8.4"
+ matplotlib-inline = "0.1.7"
+ mdurl = "0.1.2"
+ mistune = "3.0.2"
+ mpmath = "1.3.0"
+ multidict = "6.0.5"
+ multiprocess = "0.70.16"
+ nbclient = "0.10.0"
+ nbconvert = "7.16.4"
+ nbformat = "5.10.4"
+ nest-asyncio = "1.6.0"
+ networkx = "3.2.1"
+ nltk = "3.8.1"
+ numpy = "1.26.4"
+ orjson = "3.10.0"
+ overrides = "7.7.0"
+ packaging = "24.0"
+ pandas = "2.2.1"
+ pandocfilters = "1.5.1"
+ parso = "0.8.4"
+ pillow = "10.3.0"
+ platformdirs = "4.2.1"
+ portalocker = "2.8.2"
+ prometheus_client = "0.20.0"
+ prompt-toolkit = "3.0.43"
+ psutil = "5.9.8"
+ pure-eval = "0.2.2"
+ pyarrow = "15.0.2"
+ pyarrow-hotfix = "0.6"
+ pycparser = "2.22"
+ pydantic = "2.6.4"
+ pydantic_core = "2.16.3"
+ pydub = "0.25.1"
+ Pygments = "2.17.2"
+ pyparsing = "3.1.2"
+ python-dateutil = "2.9.0.post0"
+ python-json-logger = "2.0.7"
+ python-multipart = "0.0.9"
+ pytz = "2024.1"
+ PyYAML = "6.0.1"
+ pyzmq = "26.0.2"
+ rapidfuzz = "3.8.1"
+ referencing = "0.34.0"
+ regex = "2023.12.25"
+ requests = "2.31.0"
+ responses = "0.18.0"
+ rfc3339-validator = "0.1.4"
+ rfc3986-validator = "0.1.1"
+ rich = "13.7.1"
+ rouge-score = "0.1.2"
+ rpds-py = "0.18.0"
+ ruff = "0.3.5"
+ sacrebleu = "2.4.2"
+ safetensors = "0.4.2"
+ scikit-learn = "1.4.2"
+ scipy = "1.13.0"
+ semantic-version = "2.10.0"
+ Send2Trash = "1.8.3"
+ shellingham = "1.5.4"
+ six = "1.16.0"
+ sniffio = "1.3.1"
+ soupsieve = "2.5"
+ stack-data = "0.6.3"
+ starlette = "0.37.2"
+ sympy = "1.12"
+ tabulate = "0.9.0"
+ terminado = "0.18.1"
+ threadpoolctl = "3.4.0"
+ tinycss2 = "1.3.0"
+ tokenizers = "0.15.2"
+ tomli = "2.0.1"
+ tomlkit = "0.12.0"
+ toolz = "0.12.1"
+ torch = "2.2.2"
+ tornado = "6.4"
+ tqdm = "4.66.2"
+ traitlets = "5.14.3"
+ transformers = "4.39.3"
+ typer = "0.12.1"
+ types-python-dateutil = "2.9.0.20240316"
+ typing_extensions = "4.10.0"
+ tzdata = "2024.1"
+ uri-template = "1.3.0"
+ urllib3 = "2.2.1"
+ uvicorn = "0.29.0"
+ wcwidth = "0.2.13"
+ webcolors = "1.13"
+ webencodings = "0.5.1"
+ websocket-client = "1.8.0"
+ websockets = "11.0.3"
+ widgetsnbextension = "4.0.10"
+ xxhash = "3.4.1"
+ yarl = "1.9.4"
+ zipp = "3.18.1"
+ plotly = "5.22.0"
+ tenacity = "8.2.3"
+ Levenshtein = "0.25.1"
+ kaleido = "0.2.1"
+ jupyter = "^1.0.0"
+ grazie-api-gateway-client = {version = "^0.1.3", source = "space-grazie-ml"}
+ seaborn = "^0.13.2"
+
+ [tool.ruff]
+ line-length = 120
+ target-version = "py310"
+
+ [tool.ruff.lint]
+ extend-select = ["I"]
+
+ [tool.isort]
+ profile = "black"
+ force_sort_within_sections = true
+ order_by_type = true
+
+ [[tool.poetry.source]]
+ name = "space-grazie-ml"
+ url = "https://packages.jetbrains.team/pypi/p/grazi/grazie-ml/simple"
+ priority = "supplemental"
+
+ [build-system]
+ requires = ["poetry-core"]
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,167 @@
+ absl-py==2.1.0
+ aiofiles==23.2.1
+ aiohttp==3.9.3
+ aiosignal==1.3.1
+ altair==5.3.0
+ annotated-types==0.6.0
+ anyio==4.3.0
+ argon2-cffi==23.1.0
+ argon2-cffi-bindings==21.2.0
+ arrow==1.3.0
+ asttokens==2.4.1
+ async-lru==2.0.4
+ async-timeout==4.0.3
+ attrs==23.2.0
+ Babel==2.14.0
+ beautifulsoup4==4.12.3
+ bert-score==0.3.13
+ bleach==6.1.0
+ cbor2==5.6.2
+ certifi==2024.2.2
+ cffi==1.16.0
+ charset-normalizer==3.3.2
+ click==8.1.7
+ colorama==0.4.6
+ comm==0.2.2
+ contourpy==1.2.1
+ cycler==0.12.1
+ datasets==2.18.0
+ debugpy==1.8.1
+ decorator==5.1.1
+ defusedxml==0.7.1
+ diff-match-patch==20230430
+ dill==0.3.8
+ evaluate==0.4.1
+ exceptiongroup==1.2.0
+ executing==2.0.1
+ fastapi==0.110.1
+ fastjsonschema==2.19.1
+ ffmpy==0.3.2
+ filelock==3.13.3
+ fonttools==4.50.0
+ fqdn==1.5.1
+ frozenlist==1.4.1
+ fsspec==2024.2.0
+ gradio==4.25.0
+ gradio_client==0.15.0
+ h11==0.14.0
+ httpcore==1.0.5
+ httpx==0.27.0
+ huggingface-hub==0.22.2
+ idna==3.6
+ importlib_metadata==7.1.0
+ importlib_resources==6.4.0
+ ipykernel==6.29.4
+ ipython==8.18.1
+ ipywidgets==8.1.2
+ isoduration==20.11.0
+ jedi==0.19.1
+ Jinja2==3.1.3
+ joblib==1.4.0
+ json5==0.9.25
+ jsonpointer==2.4
+ jsonschema==4.21.1
+ jsonschema-specifications==2023.12.1
+ kiwisolver==1.4.5
+ lxml==5.2.1
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.8.4
+ matplotlib-inline==0.1.7
+ mdurl==0.1.2
+ mistune==3.0.2
+ mpmath==1.3.0
+ multidict==6.0.5
+ multiprocess==0.70.16
+ nbclient==0.10.0
+ nbconvert==7.16.4
+ nbformat==5.10.4
+ nest-asyncio==1.6.0
+ networkx==3.2.1
+ nltk==3.8.1
+ numpy==1.26.4
+ orjson==3.10.0
+ overrides==7.7.0
+ packaging==24.0
+ pandas==2.2.1
+ pandocfilters==1.5.1
+ parso==0.8.4
+ pillow==10.3.0
+ platformdirs==4.2.1
+ portalocker==2.8.2
+ prometheus_client==0.20.0
+ prompt-toolkit==3.0.43
+ psutil==5.9.8
+ pure-eval==0.2.2
+ pyarrow==15.0.2
+ pyarrow-hotfix==0.6
+ pycparser==2.22
+ pydantic==2.6.4
+ pydantic_core==2.16.3
+ pydub==0.25.1
+ Pygments==2.17.2
+ pyparsing==3.1.2
+ python-dateutil==2.9.0.post0
+ python-json-logger==2.0.7
+ python-multipart==0.0.9
+ pytz==2024.1
+ PyYAML==6.0.1
+ pyzmq==26.0.2
+ rapidfuzz==3.8.1
+ referencing==0.34.0
+ regex==2023.12.25
+ requests==2.31.0
+ responses==0.18.0
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ rich==13.7.1
+ rouge-score==0.1.2
+ rpds-py==0.18.0
+ ruff==0.3.5
+ sacrebleu==2.4.2
+ safetensors==0.4.2
+ scikit-learn==1.4.2
+ scipy==1.13.0
+ semantic-version==2.10.0
+ Send2Trash==1.8.3
+ shellingham==1.5.4
+ six==1.16.0
+ sniffio==1.3.1
+ soupsieve==2.5
+ stack-data==0.6.3
+ starlette==0.37.2
+ sympy==1.12
+ tabulate==0.9.0
+ terminado==0.18.1
+ threadpoolctl==3.4.0
+ tinycss2==1.3.0
+ tokenizers==0.15.2
+ tomli==2.0.1
+ tomlkit==0.12.0
+ toolz==0.12.1
+ torch==2.2.2
+ tornado==6.4
+ tqdm==4.66.2
+ traitlets==5.14.3
+ transformers==4.39.3
+ typer==0.12.1
+ types-python-dateutil==2.9.0.20240316
+ typing_extensions==4.10.0
+ tzdata==2024.1
+ uri-template==1.3.0
+ urllib3==2.2.1
+ uvicorn==0.29.0
+ wcwidth==0.2.13
+ webcolors==1.13
+ webencodings==0.5.1
+ websocket-client==1.8.0
+ websockets==11.0.3
+ widgetsnbextension==4.0.10
+ xxhash==3.4.1
+ yarl==1.9.4
+ zipp==3.18.1
+
+ plotly==5.22.0
+ tenacity==8.2.3
+ Levenshtein==0.25.1
+ kaleido==0.2.1
run_pipeline.py ADDED
@@ -0,0 +1,17 @@
+ import config
+ from api_wrappers import hf_data_loader
+ from generation_steps import metrics_analysis, synthetic_backward, synthetic_forward
+
+
+ def run():
+     df = hf_data_loader.load_processed_rewriting_as_pandas()
+
+     df = synthetic_backward.transform(df)  # end -> start synthesis
+     df = synthetic_forward.transform(df)  # start -> end synthesis
+     df = metrics_analysis.transform(df)
+
+     df.to_csv(config.SYNTHETIC_DATASET_ARTIFACT)
+
+
+ if __name__ == "__main__":
+     run()
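Since each step persists an intermediate CSV artifact, a run can in principle be resumed partway instead of recomputing the earlier steps; a sketch mirroring synthetic_forward.main(), assuming the end -> start artifact already exists on disk:

import pandas as pd

import config
from generation_steps import synthetic_forward

# Resume from the end -> start artifact and redo only the forward step.
df = pd.read_csv(config.END_TO_START_ARTIFACT, index_col=[0])
df = synthetic_forward.transform(df)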