Petr Tsvetkov committed
Commit • 9513395
Parent(s): release
- .gitattributes +35 -0
- .gitignore +282 -0
- README.md +26 -0
- api_wrappers/__init__.py +0 -0
- api_wrappers/grazie_wrapper.py +64 -0
- api_wrappers/hf_data_loader.py +120 -0
- change_visualizer.py +353 -0
- chart.ipynb +0 -0
- config.py +39 -0
- data_stats.ipynb +759 -0
- dataset_statistics.py +71 -0
- generate_annotated_diffs.py +38 -0
- generated_message_length_comparison.ipynb +314 -0
- generation_steps/__init__.py +0 -0
- generation_steps/examples.py +51 -0
- generation_steps/for_labeling.py +58 -0
- generation_steps/metrics_analysis.py +94 -0
- generation_steps/synthetic_backward.py +111 -0
- generation_steps/synthetic_forward.py +107 -0
- metrics_analysis.ipynb +0 -0
- poetry.lock +0 -0
- pyproject.toml +199 -0
- requirements.txt +167 -0
- run_pipeline.py +17 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,282 @@
+# Created by https://www.toptal.com/developers/gitignore/api/pycharm+all,venv,python
+# Edit at https://www.toptal.com/developers/gitignore?templates=pycharm+all,venv,python
+
+### PyCharm+all ###
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# AWS User-specific
+.idea/**/aws.xml
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn. Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# SonarLint plugin
+.idea/sonarlint/
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+### PyCharm+all Patch ###
+# Ignore everything but code style settings and run configurations
+# that are supposed to be shared within teams.
+
+.idea/*
+
+!.idea/codeStyles
+!.idea/runConfigurations
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+### Python Patch ###
+# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+poetry.toml
+
+# ruff
+.ruff_cache/
+
+# LSP config files
+pyrightconfig.json
+
+### venv ###
+# Virtualenv
+# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
+[Bb]in
+[Ii]nclude
+[Ll]ib
+[Ll]ib64
+[Ll]ocal
+[Ss]cripts
+pyvenv.cfg
+pip-selfcheck.json
+
+# End of https://www.toptal.com/developers/gitignore/api/pycharm+all,venv,python
+
+.idea
+
+cache
+output
+data
README.md
ADDED
@@ -0,0 +1,26 @@
+---
+title: Commit Message Editing Visualization
+emoji: π
+sdk: gradio
+sdk_version: 4.37.2
+app_file: change_visualizer.py
+---
+
+# Commit Message Editing Visualization
+
+This space provides a visualization app for exploring the commit message edits datasets (🤗 [expert-labeled](https://huggingface.co/datasets/JetBrains-Research/commit-msg-edits) and 🤗 [synthetic](https://huggingface.co/datasets/JetBrains-Research/synthetic-commit-msg-edits))
+from the [Towards Realistic Evaluation of Commit Message Generation by Matching Online and Offline Settings](https://arxiv.org/abs/2410.12046) paper and also hosts important artifacts from our work.
+
+## Artifacts
+
+* [`metrics_analysis.ipynb`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/metrics_analysis.ipynb) contains the code for metrics calculation and analysis;
+* [`chart.ipynb`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/chart.ipynb) contains the code for Figure 4 with the edit distance distribution;
+* [`data_stats.ipynb`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/data_stats.ipynb) contains the code for obtaining the dataset statistics from Table 1;
+* ⬅️ [`generation_steps/synthetic_backward.py`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/generation_steps/synthetic_backward.py) contains the code for the *Synthetic Backward* generation proposed in our paper;
+* ➡️ [`generation_steps/synthetic_forward.py`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/generation_steps/synthetic_forward.py) contains the code for the *Synthetic Forward* generation proposed in our paper.
+
+## Visualization
+
+* Click on the `Examples Exploration` tab to browse through nicely formatted examples from our dataset.
+* Click on the `Dataset Statistics` tab to see the major statistics for our dataset.
+* Click on the `Experimental Results` tab to see additional metrics tested as target online metrics alongside our main edit distance results.
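For readers who want to run the app outside the hosted Space, a minimal local-run sketch follows. It only restates what the metadata above already says (a Gradio app with entry point `change_visualizer.py`); the token values are placeholders, and `HF_TOKEN`/`GRAZIE_JWT_TOKEN` are the environment variable names read by `config.py` below. Nothing in this sketch is part of the commit itself.

```python
# Minimal local-run sketch (assumes `pip install -r requirements.txt` was run).
# Token values are placeholders; only HF_TOKEN is needed for the viewer itself,
# GRAZIE_JWT_TOKEN is used by the generation steps.
import os
import subprocess

os.environ.setdefault("HF_TOKEN", "<your-hf-token>")
os.environ.setdefault("GRAZIE_JWT_TOKEN", "<your-grazie-token>")

# change_visualizer.py builds the Gradio Blocks app and calls .launch() itself.
subprocess.run(["python", "change_visualizer.py"], check=True)
```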
api_wrappers/__init__.py
ADDED
File without changes
api_wrappers/grazie_wrapper.py
ADDED
@@ -0,0 +1,64 @@
+import pickle
+import time
+
+from grazie.api.client.chat.prompt import ChatPrompt
+from grazie.api.client.endpoints import GrazieApiGatewayUrls
+from grazie.api.client.gateway import AuthType, GrazieAgent, GrazieApiGatewayClient
+from grazie.api.client.profiles import LLMProfile
+
+import config
+
+client = GrazieApiGatewayClient(
+    grazie_agent=GrazieAgent("grazie-toolformers", "v1.0"),
+    url=GrazieApiGatewayUrls.STAGING,
+    auth_type=AuthType.APPLICATION,
+    grazie_jwt_token=config.GRAZIE_API_JWT_TOKEN,
+)
+
+LLM_CACHE_FILE = config.CACHE_DIR / f"{config.LLM_MODEL}.cache.pkl"
+LLM_CACHE = {}
+LLM_CACHE_USED = {}
+
+if not LLM_CACHE_FILE.exists():
+    with open(LLM_CACHE_FILE, "wb") as file:
+        pickle.dump(obj=LLM_CACHE, file=file)
+
+with open(LLM_CACHE_FILE, "rb") as file:
+    LLM_CACHE = pickle.load(file=file)
+
+
+def llm_request(prompt):
+    output = None
+
+    while output is None:
+        try:
+            output = client.chat(
+                chat=ChatPrompt().add_system("You are a helpful assistant.").add_user(prompt),
+                profile=LLMProfile(config.LLM_MODEL),
+            ).content
+        except Exception:
+            time.sleep(config.GRAZIE_TIMEOUT_SEC)
+
+    assert output is not None
+
+    return output
+
+
+def generate_for_prompt(prompt):
+    if prompt not in LLM_CACHE:
+        LLM_CACHE[prompt] = []
+
+    if prompt not in LLM_CACHE_USED:
+        LLM_CACHE_USED[prompt] = 0
+
+    while LLM_CACHE_USED[prompt] >= len(LLM_CACHE[prompt]):
+        new_response = llm_request(prompt)
+        LLM_CACHE[prompt].append(new_response)
+
+        with open(LLM_CACHE_FILE, "wb") as file:
+            pickle.dump(obj=LLM_CACHE, file=file)
+
+    result = LLM_CACHE[prompt][LLM_CACHE_USED[prompt]]
+    LLM_CACHE_USED[prompt] += 1
+
+    return result
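A short usage sketch for the cache above (the prompt string is illustrative): `generate_for_prompt` hands out cached completions in order and only calls the API once the cache for that prompt is exhausted, persisting every new response to the pickle file, so reruns replay the exact same outputs.

```python
from api_wrappers import grazie_wrapper

prompt = "Rewrite this commit message: ..."  # illustrative

# First call: cache miss, so llm_request() hits the API and the
# response is appended to LLM_CACHE and pickled to disk.
first = grazie_wrapper.generate_for_prompt(prompt)

# Same prompt again: the one cached completion is already "used",
# so a second, distinct completion is requested and cached too.
second = grazie_wrapper.generate_for_prompt(prompt)

# In a fresh process, the same two calls return `first` and `second`
# straight from the cache, without touching the API.
```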
api_wrappers/hf_data_loader.py
ADDED
@@ -0,0 +1,120 @@
+import json
+import os
+from datetime import datetime, timedelta
+
+import pandas as pd
+from datasets import load_dataset
+from huggingface_hub import hf_hub_download, list_repo_tree
+
+import config
+
+
+def load_raw_rewriting_as_pandas():
+    return load_dataset(
+        config.HF_RAW_DATASET_NAME, split=config.HF_RAW_DATASET_SPLIT, token=config.HF_TOKEN, cache_dir=config.CACHE_DIR
+    ).to_pandas()
+
+
+def load_full_commit_as_pandas():
+    return (
+        load_dataset(
+            path=config.HF_FULL_COMMITS_DATASET_NAME,
+            name=config.HF_FULL_COMMITS_DATASET_SUBNAME,
+            split=config.HF_FULL_COMMITS_DATASET_SPLIT,
+            cache_dir=config.CACHE_DIR,
+        )
+        .to_pandas()
+        .rename(columns={"message": "reference"})
+    )
+
+
+def edit_time_from_history(history_str):
+    history = json.loads(history_str)
+
+    if len(history) == 0:
+        return 0
+
+    timestamps = list(map(lambda e: datetime.fromisoformat(e["ts"]), history))
+    delta = max(timestamps) - min(timestamps)
+
+    return delta // timedelta(milliseconds=1)
+
+
+def edit_time_from_timestamps(row):
+    loaded_ts = datetime.fromisoformat(row["loaded_ts"])
+    submitted_ts = datetime.fromisoformat(row["submitted_ts"])
+
+    delta = submitted_ts - loaded_ts
+
+    result = delta // timedelta(milliseconds=1)
+
+    return result if result >= 0 else None
+
+
+def load_processed_rewriting_as_pandas():
+    manual_rewriting = load_raw_rewriting_as_pandas()[
+        [
+            "hash",
+            "repo",
+            "commit_msg_start",
+            "commit_msg_end",
+            "session",
+            "commit_msg_history",
+            "loaded_ts",
+            "submitted_ts",
+        ]
+    ]
+
+    manual_rewriting["edit_time_hist"] = manual_rewriting["commit_msg_history"].apply(edit_time_from_history)
+    manual_rewriting["edit_time"] = manual_rewriting.apply(edit_time_from_timestamps, axis=1)
+
+    manual_rewriting.drop(columns=["commit_msg_history", "loaded_ts", "submitted_ts"])
+
+    manual_rewriting.set_index(["hash", "repo"], inplace=True)
+
+    mods_dataset = load_full_commit_as_pandas()[["hash", "repo", "mods"]]
+    mods_dataset.set_index(["hash", "repo"], inplace=True)
+
+    return manual_rewriting.join(other=mods_dataset, how="left").reset_index()
+
+
+def load_synthetic_as_pandas():
+    return load_dataset(
+        config.HF_SYNTHETIC_DATASET_NAME,
+        "all_pairs_with_metrics",
+        split=config.HF_SYNTHETIC_DATASET_SPLIT,
+        token=config.HF_TOKEN,
+        cache_dir=config.CACHE_DIR,
+    ).to_pandas()
+
+
+def load_full_commit_with_predictions_as_pandas():
+    full_dataset = load_full_commit_as_pandas()
+
+    predictions_paths = []
+    for prediction_file in list_repo_tree(
+        repo_id=config.HF_PREDICTIONS_DATASET_NAME,
+        path=os.path.join("commit_message_generation/predictions", config.HF_PREDICTIONS_MODEL),
+        repo_type="dataset",
+    ):
+        predictions_paths.append(
+            hf_hub_download(
+                prediction_file.path,
+                repo_id=config.HF_PREDICTIONS_DATASET_NAME,
+                repo_type="dataset",
+                cache_dir=config.CACHE_DIR,
+            )
+        )
+
+    dfs = []
+    for path in predictions_paths:
+        dfs.append(pd.read_json(path, orient="records", lines=True))
+    predictions_dataset = pd.concat(dfs, axis=0, ignore_index=True)
+    predictions_dataset = predictions_dataset.sample(frac=1, random_state=config.RANDOM_STATE).set_index(
+        ["hash", "repo"]
+    )[["prediction"]]
+    predictions_dataset = predictions_dataset[~predictions_dataset.index.duplicated(keep="first")]
+
+    dataset = full_dataset.join(other=predictions_dataset, on=("hash", "repo"))
+
+    return dataset.reset_index()
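As a small illustration of how these loaders compose (column names are taken from the functions above; the snippet itself is not part of the commit):

```python
from api_wrappers import hf_data_loader

# Expert-labeled rewriting sessions joined with the commit diffs ("mods").
df = hf_data_loader.load_processed_rewriting_as_pandas()
print(df[["hash", "repo", "commit_msg_start", "commit_msg_end", "edit_time"]].head())

# Synthetic pairs with precomputed metrics; `is_related` separates related
# pairs from conditionally independent ones.
synthetic = hf_data_loader.load_synthetic_as_pandas()
print(synthetic["is_related"].value_counts())
```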
change_visualizer.py
ADDED
@@ -0,0 +1,353 @@
+import gradio as gr
+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+from datasets import load_dataset
+from evaluate.utils import parse_readme
+from scipy.stats import gaussian_kde, spearmanr
+
+import generate_annotated_diffs
+from api_wrappers import hf_data_loader
+from generation_steps.metrics_analysis import AGGR_METRICS, edit_distance_fn
+
+colors = {
+    "Expert-labeled": "#C19C0B",
+    "Synthetic Backward": "#913632",
+    "Synthetic Forward": "#58136a",
+    "Full": "#000000",
+}
+
+METRICS = {
+    "Edit Distance": "editdist",
+    "Edit Similarity": "editsim",
+    "BLEU": "bleu",
+    "METEOR": "meteor",
+    "ROUGE-1": "rouge1",
+    "ROUGE-2": "rouge2",
+    "ROUGE-L": "rougeL",
+    "BERTScore": "bertscore",
+    "ChrF": "chrF",
+}
+
+
+df_related = generate_annotated_diffs.data_with_annotated_diffs()
+
+
+def golden():
+    return df_related.loc[(df_related["G_type"] == "initial") & (df_related["E_type"] == "expert_labeled")].reset_index(
+        drop=True
+    )
+
+
+def backward():
+    return df_related.loc[
+        (df_related["G_type"] == "synthetic_backward") & (df_related["E_type"] == "expert_labeled")
+    ].reset_index(drop=True)
+
+
+def forward():
+    return df_related.loc[
+        (df_related["G_type"] == "initial") & (df_related["E_type"] == "synthetic_forward")
+    ].reset_index(drop=True)
+
+
+def forward_from_backward():
+    return df_related.loc[
+        (df_related.G_type == "synthetic_backward")
+        & (df_related.E_type.isin(["synthetic_forward", "synthetic_forward_from_backward"]))
+    ].reset_index(drop=True)
+
+
+n_diffs_manual = len(golden())
+n_diffs_synthetic_backward = len(backward())
+n_diffs_synthetic_forward = len(forward())
+n_diffs_synthetic_forward_backward = len(forward_from_backward())
+
+
+def update_dataset_view(diff_idx, df):
+    diff_idx -= 1
+    return (
+        df.iloc[diff_idx]["annotated_diff"],
+        df.iloc[diff_idx]["commit_msg_start"] if "commit_msg_start" in df.columns else df.iloc[diff_idx]["G_text"],
+        df.iloc[diff_idx]["commit_msg_end"] if "commit_msg_end" in df.columns else df.iloc[diff_idx]["E_text"],
+        f"https://github.com/{df.iloc[diff_idx]['repo']}/commit/{df.iloc[diff_idx]['hash']}",
+    )
+
+
+def update_dataset_view_manual(diff_idx):
+    return update_dataset_view(diff_idx, golden())
+
+
+def update_dataset_view_synthetic_backward(diff_idx):
+    return update_dataset_view(diff_idx, backward())
+
+
+def update_dataset_view_synthetic_forward(diff_idx):
+    return update_dataset_view(diff_idx, forward())
+
+
+def update_dataset_view_synthetic_forward_backward(diff_idx):
+    return update_dataset_view(diff_idx, forward_from_backward())
+
+
+def number_of_pairs_plot():
+    related_plot_dict = {
+        "Full": df_related,
+        "Synthetic Backward": backward(),
+        "Synthetic Forward": pd.concat([forward(), forward_from_backward()], axis=0, ignore_index=True),
+        "Expert-labeled": golden(),
+    }
+
+    df_unrelated = hf_data_loader.load_synthetic_as_pandas()
+    df_unrelated = df_unrelated.loc[~df_unrelated.is_related].copy()
+    unrelated_plot_dict = {
+        "Full": df_unrelated,
+        "Synthetic Backward": df_unrelated.loc[
+            (df_unrelated["G_type"] == "synthetic_backward")
+            & (~df_unrelated.E_type.isin(["synthetic_forward", "synthetic_forward_from_backward"]))
+        ],
+        "Synthetic Forward": df_unrelated.loc[
+            ((df_unrelated["G_type"] == "initial") & (df_unrelated["E_type"] == "synthetic_forward"))
+            | (
+                (df_unrelated["G_type"] == "synthetic_backward")
+                & (df_unrelated["E_type"].isin(["synthetic_forward", "synthetic_forward_from_backward"]))
+            )
+        ],
+        "Expert-labeled": df_unrelated.loc[
+            (df_unrelated.G_type == "initial") & (df_unrelated.E_type == "expert_labeled")
+        ],
+    }
+
+    traces = []
+
+    for split in related_plot_dict.keys():
+        related_count = len(related_plot_dict[split])
+        unrelated_count = len(unrelated_plot_dict[split])
+
+        traces.append(
+            go.Bar(
+                name=f"{split} - Related pairs",
+                x=[split],
+                y=[related_count],
+                marker=dict(
+                    color=colors[split],
+                ),
+            )
+        )
+
+        traces.append(
+            go.Bar(
+                name=f"{split} - Conditionally independent pairs",
+                x=[split],
+                y=[unrelated_count],
+                marker=dict(
+                    color=colors[split],
+                    pattern=dict(
+                        shape="/",  # Crosses
+                        fillmode="overlay",
+                        solidity=0.5,
+                    ),
+                ),
+            )
+        )
+
+    fig = go.Figure(data=traces)
+
+    fig.update_layout(
+        barmode="stack",
+        bargap=0.2,
+        xaxis=dict(title="Split", showgrid=True, gridcolor="lightgrey"),
+        yaxis=dict(title="Number of Examples", showgrid=True, gridcolor="lightgrey"),
+        legend=dict(title="Pair Type", orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
+        plot_bgcolor="rgba(0,0,0,0)",
+        paper_bgcolor="rgba(0,0,0,0)",
+        width=1100,
+    )
+    return fig
+
+
+def edit_distance_plot():
+    df_edit_distance = {
+        "Full": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for _, row in df_related.iterrows()],
+        "Synthetic Backward": [
+            edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for _, row in backward().iterrows()
+        ],
+        "Synthetic Forward": [
+            edit_distance_fn(pred=row["G_text"], ref=row["E_text"])
+            for _, row in pd.concat([forward(), forward_from_backward()], axis=0, ignore_index=True).iterrows()
+        ],
+        "Expert-labeled": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for _, row in golden().iterrows()],
+    }
+    traces = []
+
+    for key in df_edit_distance:
+        kde_x = np.linspace(0, 1200, 1000)
+        kde = gaussian_kde(df_edit_distance[key])
+        kde_line = go.Scatter(x=kde_x, y=kde(kde_x), mode="lines", name=key, line=dict(color=colors[key], width=5))
+        traces.append(kde_line)
+
+    fig = go.Figure(data=traces)
+
+    fig.update_layout(
+        bargap=0.1,
+        xaxis=dict(title=dict(text="Edit Distance"), range=[0, 1200], showgrid=True, gridcolor="lightgrey"),
+        yaxis=dict(
+            title=dict(text="Probability Density"),
+            range=[0, 0.004],
+            showgrid=True,
+            gridcolor="lightgrey",
+            tickvals=[0.0005, 0.001, 0.0015, 0.002, 0.0025, 0.003, 0.0035, 0.004],
+            tickformat=".4f",
+        ),
+        plot_bgcolor="rgba(0,0,0,0)",
+        paper_bgcolor="rgba(0,0,0,0)",
+        width=1100,
+    )
+    return fig
+
+
+def get_correlations_table(online_metric_name: str) -> pd.DataFrame:
+    df = load_dataset(
+        "JetBrains-Research/synthetic-commit-msg-edits", "all_pairs_with_metrics_other_online_metrics", split="train"
+    ).to_pandas()
+    corr_df = (
+        df.loc[~df.is_related]
+        .groupby(["G_text", "G_type", "hash", "repo"] + [f"online_{online_metric_name}"])
+        .apply(lambda g: g.to_dict(orient="records"), include_groups=False)
+        .reset_index(name="unrelated_pairs")
+        .copy()
+    )
+    _ = corr_df.copy()
+    for metric in AGGR_METRICS:
+        if metric in ["editdist"]:
+            _[metric] = _.unrelated_pairs.apply(lambda pairs: min(pair[metric] for pair in pairs))
+        else:
+            _[metric] = _.unrelated_pairs.apply(lambda pairs: max(pair[metric] for pair in pairs))
+
+    results = []
+
+    for metric in AGGR_METRICS:
+        x = _[metric].to_numpy()
+        y = _[f"online_{online_metric_name}"].to_numpy()
+        corr, p_value = spearmanr(x, y)
+        results.append({"metric": metric, "corr": corr, "p_value": p_value})
+
+    __ = pd.DataFrame(results)
+    __["p_value"] = ["< 0.05" if p < 0.05 else p for p in __.p_value]
+    __["corr_abs"] = abs(__["corr"])
+    __["corr"] = __["corr"].round(2)
+    __["metric"] = __["metric"].map({v: k for k, v in METRICS.items()})
+    return (
+        __.sort_values(by=["corr_abs"], ascending=False)
+        .drop(columns=["corr_abs"])
+        .rename(columns={"metric": "Metric m", "corr": "Correlation Q(m, m*)", "p_value": "p-value"})
+    )
+
+
+force_light_theme_js_func = """
+function refresh() {
+    const url = new URL(window.location);
+
+    if (url.searchParams.get('__theme') !== 'light') {
+        url.searchParams.set('__theme', 'light');
+        window.location.href = url.href;
+    }
+}
+"""
+
+if __name__ == "__main__":
+    with gr.Blocks(theme=gr.themes.Soft(), js=force_light_theme_js_func) as application:
+        gr.Markdown(parse_readme("README.md"))
+
+        def dataset_view_tab(n_items):
+            slider = gr.Slider(minimum=1, maximum=n_items, step=1, value=1, label=f"Sample number (total: {n_items})")
+
+            diff_view = gr.Highlightedtext(combine_adjacent=True, color_map={"+": "green", "-": "red"})
+            start_view = gr.Textbox(interactive=False, label="Initial message G", container=True)
+            end_view = gr.Textbox(interactive=False, label="Edited message E", container=True)
+            link_view = gr.Markdown()
+
+            view = [diff_view, start_view, end_view, link_view]
+
+            return slider, view
+
+        with gr.Tab("Examples Exploration"):
+            with gr.Tab("Manual"):
+                slider_manual, view_manual = dataset_view_tab(n_diffs_manual)
+
+                slider_manual.change(update_dataset_view_manual, inputs=slider_manual, outputs=view_manual)
+
+            with gr.Tab("Synthetic Backward"):
+                slider_synthetic_backward, view_synthetic_backward = dataset_view_tab(n_diffs_synthetic_backward)
+
+                slider_synthetic_backward.change(
+                    update_dataset_view_synthetic_backward,
+                    inputs=slider_synthetic_backward,
+                    outputs=view_synthetic_backward,
+                )
+
+            with gr.Tab("Synthetic Forward (from initial)"):
+                slider_synthetic_forward, view_synthetic_forward = dataset_view_tab(n_diffs_synthetic_forward)
+
+                slider_synthetic_forward.change(
+                    update_dataset_view_synthetic_forward,
+                    inputs=slider_synthetic_forward,
+                    outputs=view_synthetic_forward,
+                )
+
+            with gr.Tab("Synthetic Forward (from backward)"):
+                slider_synthetic_forward_backward, view_synthetic_forward_backward = dataset_view_tab(
+                    n_diffs_synthetic_forward_backward
+                )
+
+                slider_synthetic_forward_backward.change(
+                    update_dataset_view_synthetic_forward_backward,
+                    inputs=slider_synthetic_forward_backward,
+                    outputs=view_synthetic_forward_backward,
+                )
+
+        with gr.Tab("Dataset Statistics"):
+            gr.Markdown("## Number of examples per split")
+
+            number_of_pairs_gr_plot = gr.Plot(number_of_pairs_plot, label=None)
+
+            gr.Markdown("## Edit Distance Distribution (w/o PyCharm Logs)")
+
+            edit_distance_gr_plot = gr.Plot(edit_distance_plot(), label=None)
+
+        with gr.Tab("Experimental Results"):
+            gr.Markdown(
+                "Here, we provide the additional experimental results with different text similarity metrics used as the target online metric, "
+                "in addition to edit distance between generated messages G and their edited counterparts E."
+            )
+
+            gr.Markdown(
+                "Please, select one of the available metrics **m*** below to see the correlations **Q(m, m\*)** of offline text similarity metrics with **m*** as an online metric."
+            )
+
+            for metric in METRICS:
+                with gr.Tab(metric):
+                    gr.Markdown(
+                        f"The table below presents the correlation coefficients **Q(m, m\*)** where {metric} is used as an online metric **m***."
+                    )
+
+                    result_df = get_correlations_table(METRICS[metric])
+                    gr.DataFrame(result_df)
+
+        application.load(update_dataset_view_manual, inputs=slider_manual, outputs=view_manual)
+
+        application.load(
+            update_dataset_view_synthetic_backward, inputs=slider_synthetic_backward, outputs=view_synthetic_backward
+        )
+
+        application.load(
+            update_dataset_view_synthetic_forward, inputs=slider_synthetic_forward, outputs=view_synthetic_forward
+        )
+
+        application.load(
+            update_dataset_view_synthetic_forward_backward,
+            inputs=slider_synthetic_forward_backward,
+            outputs=view_synthetic_forward_backward,
+        )
+
+    application.launch()
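The correlation table can also be built outside the Gradio UI; a minimal sketch under the same code (the key `"editdist"` comes from the `METRICS` mapping above; note that importing the module triggers its top-level dataset loading):

```python
# Spearman correlations Q(m, m*) with edit distance as the online metric m*.
import change_visualizer

table = change_visualizer.get_correlations_table("editdist")
print(table.to_string(index=False))
```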
chart.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
config.py
ADDED
@@ -0,0 +1,39 @@
+import os
+from pathlib import Path
+
+RANDOM_STATE = 42
+
+GRAZIE_API_JWT_TOKEN = os.environ.get("GRAZIE_JWT_TOKEN")
+GRAZIE_TIMEOUT_SEC = 1.0
+
+HF_TOKEN = os.environ.get("HF_TOKEN")
+
+HF_RAW_DATASET_NAME = "JetBrains-Research/commit-msg-rewriting"
+HF_RAW_DATASET_SPLIT = "train"
+
+HF_FULL_COMMITS_DATASET_NAME = "JetBrains-Research/lca-commit-message-generation"
+HF_FULL_COMMITS_DATASET_SUBNAME = "commitchronicle-py-long"
+HF_FULL_COMMITS_DATASET_SPLIT = "test"
+
+HF_PREDICTIONS_DATASET_NAME = "JetBrains-Research/lca-results"
+HF_PREDICTIONS_MODEL = "gpt_4_0613"
+
+HF_SYNTHETIC_DATASET_NAME = "JetBrains-Research/synthetic-commit-msg-rewriting"
+HF_SYNTHETIC_DATASET_SPLIT = "train"
+
+LLM_MODEL = "gpt-4-1106-preview"
+
+CACHE_DIR = Path("cache")
+CACHE_DIR.mkdir(exist_ok=True)
+
+OUTPUT_DIR = Path("output")
+OUTPUT_DIR.mkdir(exist_ok=True)
+
+END_TO_START_ARTIFACT = OUTPUT_DIR / "end_to_start.csv"
+START_TO_END_ARTIFACT = OUTPUT_DIR / "start_to_end.csv"
+SYNTHETIC_DATASET_ARTIFACT = OUTPUT_DIR / "synthetic.csv"
+METRICS_CORRELATIONS_ARTIFACT = OUTPUT_DIR / "metrics_correlations.csv"
+DATA_FOR_LABELING_ARTIFACT = OUTPUT_DIR / "data_for_labeling.csv"
+
+OUTPUT_CHARTS_DIR = OUTPUT_DIR / "charts"
+OUTPUT_CHARTS_DIR.mkdir(exist_ok=True)
data_stats.ipynb
ADDED
@@ -0,0 +1,759 @@
+{
+ "cells": [
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "# Data Stats",
+   "id": "694a6cc631d4ab93"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-15T18:43:07.644299Z",
+     "start_time": "2024-10-15T18:43:02.316453Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "from datasets import load_dataset\n",
+    "\n",
+    "\n",
+    "df = load_dataset(\"JetBrains-Research/synthetic-commit-msg-edits\", \"all_pairs\", split=\"train\").to_pandas()\n",
+    "df.head()"
+   ],
+   "id": "ed42f4f83199feb2",
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Downloading data: 100%|██████████| 6.35M/6.35M [00:00<00:00, 9.95MB/s]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ],
+      "application/vnd.jupyter.widget-view+json": {
+       "version_major": 2,
+       "version_minor": 0,
+       "model_id": "1a0523289d424b29974b60d017643280"
+      }
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       " hash repo \\\n",
+       "0 2febb99eee8ed71c9122db88ca58dd33be0b9550 mesonbuild/meson \n",
+       "1 2febb99eee8ed71c9122db88ca58dd33be0b9550 mesonbuild/meson \n",
+       "2 2febb99eee8ed71c9122db88ca58dd33be0b9550 mesonbuild/meson \n",
+       "3 2febb99eee8ed71c9122db88ca58dd33be0b9550 mesonbuild/meson \n",
+       "4 2febb99eee8ed71c9122db88ca58dd33be0b9550 mesonbuild/meson \n",
+       "\n",
+       " G_text \\\n",
+       "0 Enhance OptionOverrideProxy and simplify optio... \n",
+       "1 Enhance OptionOverrideProxy and simplify optio... \n",
+       "2 Enhance OptionOverrideProxy and simplify optio... \n",
+       "3 Enhance OptionOverrideProxy and simplify optio... \n",
+       "4 Enhance OptionOverrideProxy and simplify optio... \n",
+       "\n",
+       " E_text G_type \\\n",
+       "0 Enhance OptionOverrideProxy for multiple optio... synthetic_backward \n",
+       "1 Refactor OptionOverrideProxy and Backend class... synthetic_backward \n",
+       "2 Refactor OptionOverrideProxy and backend optio... synthetic_backward \n",
+       "3 Refactor: Enhance OptionOverrideProxy for mult... synthetic_backward \n",
+       "4 Refactor OptionOverrideProxy and add target-sp... synthetic_backward \n",
+       "\n",
+       " E_type is_related \n",
+       "0 expert_labeled True \n",
+       "1 synthetic_forward True \n",
+       "2 synthetic_forward True \n",
+       "3 synthetic_forward True \n",
+       "4 synthetic_forward_from_backward False "
+      ],
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       " .dataframe tbody tr th:only-of-type {\n",
+       " vertical-align: middle;\n",
+       " }\n",
+       "\n",
+       " .dataframe tbody tr th {\n",
+       " vertical-align: top;\n",
+       " }\n",
+       "\n",
+       " .dataframe thead th {\n",
+       " text-align: right;\n",
+       " }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       " <thead>\n",
+       " <tr style=\"text-align: right;\">\n",
+       " <th></th>\n",
+       " <th>hash</th>\n",
+       " <th>repo</th>\n",
+       " <th>G_text</th>\n",
+       " <th>E_text</th>\n",
+       " <th>G_type</th>\n",
+       " <th>E_type</th>\n",
+       " <th>is_related</th>\n",
+       " </tr>\n",
+       " </thead>\n",
+       " <tbody>\n",
+       " <tr>\n",
+       " <th>0</th>\n",
+       " <td>2febb99eee8ed71c9122db88ca58dd33be0b9550</td>\n",
+       " <td>mesonbuild/meson</td>\n",
+       " <td>Enhance OptionOverrideProxy and simplify optio...</td>\n",
+       " <td>Enhance OptionOverrideProxy for multiple optio...</td>\n",
+       " <td>synthetic_backward</td>\n",
+       " <td>expert_labeled</td>\n",
+       " <td>True</td>\n",
+       " </tr>\n",
+       " <tr>\n",
+       " <th>1</th>\n",
+       " <td>2febb99eee8ed71c9122db88ca58dd33be0b9550</td>\n",
+       " <td>mesonbuild/meson</td>\n",
+       " <td>Enhance OptionOverrideProxy and simplify optio...</td>\n",
+       " <td>Refactor OptionOverrideProxy and Backend class...</td>\n",
+       " <td>synthetic_backward</td>\n",
+       " <td>synthetic_forward</td>\n",
+       " <td>True</td>\n",
+       " </tr>\n",
+       " <tr>\n",
+       " <th>2</th>\n",
+       " <td>2febb99eee8ed71c9122db88ca58dd33be0b9550</td>\n",
+       " <td>mesonbuild/meson</td>\n",
+       " <td>Enhance OptionOverrideProxy and simplify optio...</td>\n",
+       " <td>Refactor OptionOverrideProxy and backend optio...</td>\n",
+       " <td>synthetic_backward</td>\n",
+       " <td>synthetic_forward</td>\n",
+       " <td>True</td>\n",
+       " </tr>\n",
+       " <tr>\n",
+       " <th>3</th>\n",
+       " <td>2febb99eee8ed71c9122db88ca58dd33be0b9550</td>\n",
+       " <td>mesonbuild/meson</td>\n",
+       " <td>Enhance OptionOverrideProxy and simplify optio...</td>\n",
+       " <td>Refactor: Enhance OptionOverrideProxy for mult...</td>\n",
+       " <td>synthetic_backward</td>\n",
+       " <td>synthetic_forward</td>\n",
+       " <td>True</td>\n",
+       " </tr>\n",
+       " <tr>\n",
+       " <th>4</th>\n",
+       " <td>2febb99eee8ed71c9122db88ca58dd33be0b9550</td>\n",
+       " <td>mesonbuild/meson</td>\n",
+       " <td>Enhance OptionOverrideProxy and simplify optio...</td>\n",
+       " <td>Refactor OptionOverrideProxy and add target-sp...</td>\n",
+       " <td>synthetic_backward</td>\n",
+       " <td>synthetic_forward_from_backward</td>\n",
+       " <td>False</td>\n",
+       " </tr>\n",
+       " </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 3
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "## Full",
+   "id": "922e7a73f11a4aec"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-15T18:43:14.266540Z",
+     "start_time": "2024-10-15T18:43:14.262103Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "len(df.loc[df.is_related])",
+   "id": "562d9c53da109d1a",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "656"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 4
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-15T18:43:18.073966Z",
+     "start_time": "2024-10-15T18:43:18.069219Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "df.loc[df.is_related].groupby([\"hash\", \"repo\"]).G_text.count().mean()",
+   "id": "b4f3c96a4b676a0d",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "43.733333333333334"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 5
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-15T18:43:19.026689Z",
+     "start_time": "2024-10-15T18:43:19.021680Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "len(df.loc[~df.is_related])",
+   "id": "54d9f32f1d18844f",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "5140"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 6
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-15T18:43:19.484304Z",
+     "start_time": "2024-10-15T18:43:19.480012Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "df.loc[~df.is_related].groupby([\"hash\", \"repo\"]).G_text.count().mean()",
+   "id": "679761631517b9e4",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "342.6666666666667"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 7
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "## Expert-labeled",
+   "id": "84561ea89717d61a"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-15T18:45:52.905631Z",
+     "start_time": "2024-10-15T18:45:52.901913Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "_ = df.loc[(df.G_type == \"initial\") & (df.E_type == \"expert_labeled\")]",
+   "id": "be1c800f45cef26e",
+   "outputs": [],
+   "execution_count": 36
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-15T18:45:53.234109Z",
+     "start_time": "2024-10-15T18:45:53.230986Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "len(_.loc[_.is_related])",
+   "id": "1d092dff4d39bcd1",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "57"
+      ]
+     },
+     "execution_count": 37,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 37
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-15T18:45:53.629311Z",
+     "start_time": "2024-10-15T18:45:53.625620Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "_.loc[_.is_related].groupby([\"hash\", \"repo\"]).G_text.count().mean()",
+   "id": "a06a532cd5a29725",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "3.8"
+      ]
+     },
+     "execution_count": 38,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 38
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-15T18:45:53.956790Z",
+     "start_time": "2024-10-15T18:45:53.953842Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "len(_.loc[~_.is_related])",
+   "id": "5e19c8a6309b62aa",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 39
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-15T18:46:02.554527Z",
+     "start_time": "2024-10-15T18:46:02.551084Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "_.loc[~_.is_related].groupby([\"hash\", \"repo\"]).G_text.count().mean()",
+   "id": "e43179c5dcab5eb2",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "nan"
+      ]
+     },
+     "execution_count": 40,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 40
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "## Backward",
+   "id": "70ee052fae2f88e3"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-15T18:44:33.559606Z",
+     "start_time": "2024-10-15T18:44:33.556802Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "_ = df.loc[(df.G_type == \"synthetic_backward\") & (~df.E_type.isin([\"synthetic_forward\", \"synthetic_forward_from_backward\"]))]",
+   "id": "99f51ecc55c4db35",
+   "outputs": [],
+   "execution_count": 20
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-15T18:44:33.958325Z",
+     "start_time": "2024-10-15T18:44:33.955847Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "len(_.loc[_.is_related])",
+   "id": "6ff29390c8e127c2",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "104"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 21
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-15T18:44:34.455560Z",
+     "start_time": "2024-10-15T18:44:34.452303Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "_.loc[_.is_related].groupby([\"hash\", \"repo\"]).G_text.count().mean()",
+   "id": "e1ae04e1ecfb2040",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "7.428571428571429"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 22
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-15T18:44:34.903849Z",
+     "start_time": "2024-10-15T18:44:34.901226Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "len(_.loc[~_.is_related])",
+   "id": "125c4c335e7761da",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1048"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 23
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-15T18:44:35.783538Z",
+     "start_time": "2024-10-15T18:44:35.778676Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "_.loc[~_.is_related].groupby([\"hash\", \"repo\"]).G_text.count().mean()",
+   "id": "4782f1d6e6863f89",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "74.85714285714286"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 24
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "## Forward",
+   "id": "bf61a4b422f779fa"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "### From human",
+   "id": "1429f9f99acf75d"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-15T18:46:21.359807Z",
+     "start_time": "2024-10-15T18:46:21.356451Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "_ = df.loc[(df.G_type == \"initial\") & (df.E_type == \"synthetic_forward\")]",
+   "id": "e13d55b0124f04b3",
+   "outputs": [],
+   "execution_count": 41
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-15T18:46:21.798508Z",
+     "start_time": "2024-10-15T18:46:21.795885Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "len(_.loc[_.is_related])",
+   "id": "b8353390df7da427",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "177"
+      ]
+     },
+     "execution_count": 42,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 42
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-15T18:46:22.163595Z",
+     "start_time": "2024-10-15T18:46:22.160176Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "_.loc[_.is_related].groupby([\"hash\", \"repo\"]).G_text.count().mean()",
+   "id": "ac89afde65efd73d",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "11.8"
+      ]
+     },
+     "execution_count": 43,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 43
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-15T18:46:22.552314Z",
+     "start_time": "2024-10-15T18:46:22.549570Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "len(_.loc[~_.is_related])",
+   "id": "9b6cb335e3bbb7ff",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 44,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 44
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-15T18:46:23.237736Z",
+     "start_time": "2024-10-15T18:46:23.234085Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "__.loc[~__.is_related].groupby([\"hash\", \"repo\"]).G_text.count().mean()",
+   "id": "fe22189a70fc4149",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "nan"
+      ]
+     },
+     "execution_count": 45,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
620 |
+
"execution_count": 45
|
621 |
+
},
|
622 |
+
{
|
623 |
+
"metadata": {},
|
624 |
+
"cell_type": "markdown",
|
625 |
+
"source": "### From backward",
|
626 |
+
"id": "ace7afb876fb88a0"
|
627 |
+
},
|
628 |
+
{
|
629 |
+
"metadata": {
|
630 |
+
"ExecuteTime": {
|
631 |
+
"end_time": "2024-10-15T18:47:06.641374Z",
|
632 |
+
"start_time": "2024-10-15T18:47:06.637018Z"
|
633 |
+
}
|
634 |
+
},
|
635 |
+
"cell_type": "code",
|
636 |
+
"source": "_ = df.loc[(df.G_type == \"synthetic_backward\") & (df.E_type.isin([\"synthetic_forward\", \"synthetic_forward_from_backward\"]))]",
|
637 |
+
"id": "88800960dbff619a",
|
638 |
+
"outputs": [],
|
639 |
+
"execution_count": 53
|
640 |
+
},
|
641 |
+
{
|
642 |
+
"metadata": {
|
643 |
+
"ExecuteTime": {
|
644 |
+
"end_time": "2024-10-15T18:47:15.358650Z",
|
645 |
+
"start_time": "2024-10-15T18:47:15.355108Z"
|
646 |
+
}
|
647 |
+
},
|
648 |
+
"cell_type": "code",
|
649 |
+
"source": "len(_.loc[_.is_related])",
|
650 |
+
"id": "890613156e005c83",
|
651 |
+
"outputs": [
|
652 |
+
{
|
653 |
+
"data": {
|
654 |
+
"text/plain": [
|
655 |
+
"318"
|
656 |
+
]
|
657 |
+
},
|
658 |
+
"execution_count": 56,
|
659 |
+
"metadata": {},
|
660 |
+
"output_type": "execute_result"
|
661 |
+
}
|
662 |
+
],
|
663 |
+
"execution_count": 56
|
664 |
+
},
|
665 |
+
{
|
666 |
+
"metadata": {
|
667 |
+
"ExecuteTime": {
|
668 |
+
"end_time": "2024-10-15T18:47:15.579415Z",
|
669 |
+
"start_time": "2024-10-15T18:47:15.576016Z"
|
670 |
+
}
|
671 |
+
},
|
672 |
+
"cell_type": "code",
|
673 |
+
"source": "_.loc[_.is_related].groupby([\"hash\", \"repo\"]).G_text.count().mean()",
|
674 |
+
"id": "999f91382a2c8ff6",
|
675 |
+
"outputs": [
|
676 |
+
{
|
677 |
+
"data": {
|
678 |
+
"text/plain": [
|
679 |
+
"22.714285714285715"
|
680 |
+
]
|
681 |
+
},
|
682 |
+
"execution_count": 57,
|
683 |
+
"metadata": {},
|
684 |
+
"output_type": "execute_result"
|
685 |
+
}
|
686 |
+
],
|
687 |
+
"execution_count": 57
|
688 |
+
},
|
689 |
+
{
|
690 |
+
"metadata": {
|
691 |
+
"ExecuteTime": {
|
692 |
+
"end_time": "2024-10-15T18:47:15.834218Z",
|
693 |
+
"start_time": "2024-10-15T18:47:15.831258Z"
|
694 |
+
}
|
695 |
+
},
|
696 |
+
"cell_type": "code",
|
697 |
+
"source": "len(_.loc[~_.is_related])",
|
698 |
+
"id": "d347941cbb4b2db1",
|
699 |
+
"outputs": [
|
700 |
+
{
|
701 |
+
"data": {
|
702 |
+
"text/plain": [
|
703 |
+
"3753"
|
704 |
+
]
|
705 |
+
},
|
706 |
+
"execution_count": 58,
|
707 |
+
"metadata": {},
|
708 |
+
"output_type": "execute_result"
|
709 |
+
}
|
710 |
+
],
|
711 |
+
"execution_count": 58
|
712 |
+
},
|
713 |
+
{
|
714 |
+
"metadata": {
|
715 |
+
"ExecuteTime": {
|
716 |
+
"end_time": "2024-10-15T18:47:16.138798Z",
|
717 |
+
"start_time": "2024-10-15T18:47:16.133397Z"
|
718 |
+
}
|
719 |
+
},
|
720 |
+
"cell_type": "code",
|
721 |
+
"source": "_.loc[~_.is_related].groupby([\"hash\", \"repo\"]).G_text.count().mean()",
|
722 |
+
"id": "2db4d96713a8634d",
|
723 |
+
"outputs": [
|
724 |
+
{
|
725 |
+
"data": {
|
726 |
+
"text/plain": [
|
727 |
+
"268.07142857142856"
|
728 |
+
]
|
729 |
+
},
|
730 |
+
"execution_count": 59,
|
731 |
+
"metadata": {},
|
732 |
+
"output_type": "execute_result"
|
733 |
+
}
|
734 |
+
],
|
735 |
+
"execution_count": 59
|
736 |
+
}
|
737 |
+
],
|
738 |
+
"metadata": {
|
739 |
+
"kernelspec": {
|
740 |
+
"display_name": "Python 3",
|
741 |
+
"language": "python",
|
742 |
+
"name": "python3"
|
743 |
+
},
|
744 |
+
"language_info": {
|
745 |
+
"codemirror_mode": {
|
746 |
+
"name": "ipython",
|
747 |
+
"version": 2
|
748 |
+
},
|
749 |
+
"file_extension": ".py",
|
750 |
+
"mimetype": "text/x-python",
|
751 |
+
"name": "python",
|
752 |
+
"nbconvert_exporter": "python",
|
753 |
+
"pygments_lexer": "ipython2",
|
754 |
+
"version": "2.7.6"
|
755 |
+
}
|
756 |
+
},
|
757 |
+
"nbformat": 4,
|
758 |
+
"nbformat_minor": 5
|
759 |
+
}
|
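A minimal sketch, on made-up toy data, of the per-commit aggregation the cells above repeat for each slice: filter by is_related, count the rows, then average the number of G_text entries per (hash, repo) group. The column names come from the notebook; the values here are invented.

import pandas as pd

# Toy frame with the columns the notebook cells rely on (values made up).
df = pd.DataFrame({
    "hash": ["a1", "a1", "b2", "b2", "b2"],
    "repo": ["r"] * 5,
    "G_text": ["msg"] * 5,
    "is_related": [True, True, True, False, False],
})

related = df.loc[df.is_related]
print(len(related))  # 3 rows flagged as related
# Mean number of related messages per commit: (2 + 1) / 2 = 1.5
print(related.groupby(["hash", "repo"]).G_text.count().mean())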
dataset_statistics.py
ADDED
@@ -0,0 +1,71 @@
1 |
+
import pickle
|
2 |
+
|
3 |
+
import Levenshtein
|
4 |
+
import numpy as np
|
5 |
+
import pandas as pd
|
6 |
+
import plotly.figure_factory as ff
|
7 |
+
|
8 |
+
import config
|
9 |
+
|
10 |
+
|
11 |
+
def get_statistics_for_sample(start_msg, end_msg, row=None):
|
12 |
+
edit_ops = Levenshtein.editops(start_msg, end_msg)
|
13 |
+
n_deletes = sum([1 if op == "delete" else 0 for op, _, _ in edit_ops])
|
14 |
+
n_inserts = sum([1 if op == "insert" else 0 for op, _, _ in edit_ops])
|
15 |
+
n_replaces = sum([1 if op == "replace" else 0 for op, _, _ in edit_ops])
|
16 |
+
|
17 |
+
n_changes = n_deletes + n_inserts + n_replaces
|
18 |
+
n_deletes += n_replaces
|
19 |
+
n_inserts += n_replaces
|
20 |
+
|
21 |
+
return {
|
22 |
+
"deletions": n_deletes,
|
23 |
+
"insertions": n_inserts,
|
24 |
+
"changes": n_changes,
|
25 |
+
"deletions_norm": n_deletes / len(start_msg),
|
26 |
+
"insertions_norm": n_inserts / len(end_msg),
|
27 |
+
"changes_norm": n_changes / len(end_msg),
|
28 |
+
"lendiff": abs(len(start_msg) - len(end_msg)),
|
29 |
+
"editdist": row["editdist"] if row is not None else Levenshtein.distance(start_msg, end_msg),
|
30 |
+
}
|
31 |
+
|
32 |
+
|
33 |
+
def get_statistics_for_row(row):
|
34 |
+
if "commit_msg_start" in row:
|
35 |
+
start = row["commit_msg_start"]
|
36 |
+
else:
|
37 |
+
start = row["G_text"]
|
38 |
+
if "commit_msg_end" in row:
|
39 |
+
end = row["commit_msg_end"]
|
40 |
+
else:
|
41 |
+
end = row["E_text"]
|
42 |
+
return get_statistics_for_sample(start, end, row=row)
|
43 |
+
|
44 |
+
|
45 |
+
def get_statistics_for_df(df: pd.DataFrame):
|
46 |
+
stats = [get_statistics_for_row(row) for _, row in df.iterrows()]
|
47 |
+
|
48 |
+
assert len(stats) > 0
|
49 |
+
|
50 |
+
return {stat_name: np.asarray([e[stat_name] for e in stats]) for stat_name in stats[0]}
|
51 |
+
|
52 |
+
|
53 |
+
def build_plotly_chart(stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, stat_name):
|
54 |
+
hist_data = [
|
55 |
+
stat_golden,
|
56 |
+
stat_e2s,
|
57 |
+
stat_s2e,
|
58 |
+
stat_e2s_s2e,
|
59 |
+
np.concatenate((stat_e2s, stat_s2e, stat_e2s_s2e), axis=0),
|
60 |
+
]
|
61 |
+
|
62 |
+
group_labels = ["Golden", "e2s", "s2e", "e2s+s2e", "Synthetic"]
|
63 |
+
|
64 |
+
fig = ff.create_distplot(hist_data, group_labels, bin_size=0.05, show_rug=False, show_hist=False)
|
65 |
+
|
66 |
+
fig.update_layout(title_text=stat_name)
|
67 |
+
|
68 |
+
with open(config.OUTPUT_CHARTS_DIR / f"{stat_name}_data.pkl", "wb") as f:
|
69 |
+
pickle.dump(hist_data, f)
|
70 |
+
|
71 |
+
return fig
|
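A minimal usage sketch for get_statistics_for_sample above; the two messages are made up, and running it only needs the Levenshtein package.

from dataset_statistics import get_statistics_for_sample

stats = get_statistics_for_sample("Fix bug", "Fix parser bug")
# The dict holds absolute edit-operation counts, normalized variants, and the
# edit distance: here 7 insertions ("parser "), edit distance 7,
# insertions_norm 7 / len("Fix parser bug") = 0.5.
for name in ("insertions", "deletions", "changes", "insertions_norm", "editdist"):
    print(name, stats[name])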
generate_annotated_diffs.py
ADDED
@@ -0,0 +1,38 @@
1 |
+
import diff_match_patch as dmp_module
|
2 |
+
from tqdm import tqdm
|
3 |
+
|
4 |
+
from api_wrappers import hf_data_loader
|
5 |
+
|
6 |
+
|
7 |
+
def get_annotated_diff(start_text, end_text):
|
8 |
+
dmp = dmp_module.diff_match_patch()
|
9 |
+
dmp_mapping = {-1: "-", 0: None, 1: "+"}
|
10 |
+
|
11 |
+
diff = dmp.diff_main(start_text, end_text)
|
12 |
+
dmp.diff_cleanupSemantic(diff)
|
13 |
+
|
14 |
+
result = [[w, dmp_mapping[t]] for t, w in diff]
|
15 |
+
|
16 |
+
return result
|
17 |
+
|
18 |
+
|
19 |
+
def annotated_diff_for_row(row):
|
20 |
+
if "commit_msg_start" in row:
|
21 |
+
start = row["commit_msg_start"]
|
22 |
+
else:
|
23 |
+
start = row["G_text"]
|
24 |
+
if "commit_msg_end" in row:
|
25 |
+
end = row["commit_msg_end"]
|
26 |
+
else:
|
27 |
+
end = row["E_text"]
|
28 |
+
return get_annotated_diff(start, end)
|
29 |
+
|
30 |
+
|
31 |
+
def data_with_annotated_diffs():
|
32 |
+
tqdm.pandas()
|
33 |
+
|
34 |
+
df = hf_data_loader.load_synthetic_as_pandas()
|
35 |
+
df = df.loc[df.is_related].copy()
|
36 |
+
annotated = df.progress_apply(annotated_diff_for_row, axis=1)
|
37 |
+
df["annotated_diff"] = annotated
|
38 |
+
return df
|
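A short usage sketch for get_annotated_diff above (it needs the diff-match-patch package); the two messages are made up.

from generate_annotated_diffs import get_annotated_diff

result = get_annotated_diff("Add logging", "Add structured logging")
# Each element is [fragment, tag]: tag is None for unchanged text,
# "+" for inserted text, "-" for deleted text, after semantic cleanup.
print(result)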
generated_message_length_comparison.ipynb
ADDED
@@ -0,0 +1,314 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"metadata": {},
|
5 |
+
"cell_type": "markdown",
|
6 |
+
"source": [
|
7 |
+
"### How to run\n",
|
8 |
+
"\n",
|
9 |
+
"* Install libraries using the cell below (for grazie-api-gateway-client you will have to add a custom JB repository)\n",
|
10 |
+
"* Put the production prompt to file `data/prod_prompt.txt`\n",
|
11 |
+
"* Environment variables:\n",
|
12 |
+
" - `GRAZIE_API_JWT_TOKEN` -- JWT token for grazie (check `api_wrappers/grazie_wrapper.py` to adjust the client initialization if necessary)\n",
|
13 |
+
" - `HF_TOKEN` -- should _not_ be required; however, if it is, set it to a valid Hugging Face token"
|
14 |
+
],
|
15 |
+
"id": "77d51d55b41735cf"
|
16 |
+
},
|
17 |
+
{
|
18 |
+
"metadata": {
|
19 |
+
"ExecuteTime": {
|
20 |
+
"end_time": "2024-06-20T16:09:07.968406Z",
|
21 |
+
"start_time": "2024-06-20T16:09:07.955405Z"
|
22 |
+
}
|
23 |
+
},
|
24 |
+
"cell_type": "code",
|
25 |
+
"source": [
|
26 |
+
"# !pip install grazie-api-gateway-client\n",
|
27 |
+
"# !pip install tqdm\n",
|
28 |
+
"# !pip install pandas\n",
|
29 |
+
"# !pip install datasets"
|
30 |
+
],
|
31 |
+
"id": "91fa273e8987f6f6",
|
32 |
+
"outputs": [],
|
33 |
+
"execution_count": 1
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"metadata": {
|
37 |
+
"ExecuteTime": {
|
38 |
+
"end_time": "2024-06-20T16:09:10.353479Z",
|
39 |
+
"start_time": "2024-06-20T16:09:07.970405Z"
|
40 |
+
}
|
41 |
+
},
|
42 |
+
"cell_type": "code",
|
43 |
+
"source": [
|
44 |
+
"from api_wrappers.grazie_wrapper import generate_for_prompt\n",
|
45 |
+
"from api_wrappers.hf_data_loader import load_full_commit_with_predictions_as_pandas\n",
|
46 |
+
"from tqdm import tqdm\n",
|
47 |
+
"\n",
|
48 |
+
"tqdm.pandas()"
|
49 |
+
],
|
50 |
+
"id": "ce11a4c781c152e",
|
51 |
+
"outputs": [],
|
52 |
+
"execution_count": 2
|
53 |
+
},
|
54 |
+
{
|
55 |
+
"metadata": {
|
56 |
+
"ExecuteTime": {
|
57 |
+
"end_time": "2024-06-20T16:09:10.368996Z",
|
58 |
+
"start_time": "2024-06-20T16:09:10.354434Z"
|
59 |
+
}
|
60 |
+
},
|
61 |
+
"cell_type": "code",
|
62 |
+
"source": [
|
63 |
+
"with open(\"data/prod_prompt.txt\") as f:\n",
|
64 |
+
"\tPROD_PROMPT = f.read().strip()\n",
|
65 |
+
"\n",
|
66 |
+
"def prod_prompt(diff):\n",
|
67 |
+
"\treturn PROD_PROMPT.replace(\"$diff\", diff).replace(\"$text\", \"\")\n",
|
68 |
+
"\n",
|
69 |
+
"def generate_commit_message_prod(diff):\n",
|
70 |
+
"\treturn generate_for_prompt(prod_prompt(diff))"
|
71 |
+
],
|
72 |
+
"id": "84a769c8765a7b64",
|
73 |
+
"outputs": [],
|
74 |
+
"execution_count": 3
|
75 |
+
},
|
76 |
+
{
|
77 |
+
"metadata": {
|
78 |
+
"ExecuteTime": {
|
79 |
+
"end_time": "2024-06-20T16:09:10.384590Z",
|
80 |
+
"start_time": "2024-06-20T16:09:10.371410Z"
|
81 |
+
}
|
82 |
+
},
|
83 |
+
"cell_type": "code",
|
84 |
+
"source": "generate_commit_message_prod(\"TEST\")",
|
85 |
+
"id": "af2f20def94b0490",
|
86 |
+
"outputs": [
|
87 |
+
{
|
88 |
+
"data": {
|
89 |
+
"text/plain": [
|
90 |
+
"\"Certainly! I'll need to see the specific code differences (diffs) you would like to have summarized into a commit message. Please provide the diffs so I can assist you properly.\""
|
91 |
+
]
|
92 |
+
},
|
93 |
+
"execution_count": 4,
|
94 |
+
"metadata": {},
|
95 |
+
"output_type": "execute_result"
|
96 |
+
}
|
97 |
+
],
|
98 |
+
"execution_count": 4
|
99 |
+
},
|
100 |
+
{
|
101 |
+
"metadata": {
|
102 |
+
"ExecuteTime": {
|
103 |
+
"end_time": "2024-06-20T16:09:22.224167Z",
|
104 |
+
"start_time": "2024-06-20T16:09:10.388409Z"
|
105 |
+
}
|
106 |
+
},
|
107 |
+
"cell_type": "code",
|
108 |
+
"source": [
|
109 |
+
"DATA = load_full_commit_with_predictions_as_pandas()[[\"mods\", \"prediction\"]].rename(columns={\"mods\": \"diff\", \"prediction\": \"prediction_current\"})\n",
|
110 |
+
"DATA.head()"
|
111 |
+
],
|
112 |
+
"id": "a49cabf576c9d692",
|
113 |
+
"outputs": [
|
114 |
+
{
|
115 |
+
"name": "stderr",
|
116 |
+
"output_type": "stream",
|
117 |
+
"text": [
|
118 |
+
"Using the latest cached version of the dataset since JetBrains-Research/lca-commit-message-generation couldn't be found on the Hugging Face Hub\n",
|
119 |
+
"Found the latest cached dataset configuration 'commitchronicle-py-long' at cache\\JetBrains-Research___lca-commit-message-generation\\commitchronicle-py-long\\0.0.0\\58dcef83a63cccebacd3e786afd73181cc9175e5 (last modified on Sun Apr 7 11:16:22 2024).\n",
|
120 |
+
"Using the latest cached version of the dataset since JetBrains-Research/lca-results couldn't be found on the Hugging Face Hub\n",
|
121 |
+
"Found the latest cached dataset configuration 'cmg_gpt_4_0613' at cache\\JetBrains-Research___lca-results\\cmg_gpt_4_0613\\0.0.0\\4b56bbf7243da371b3e0a42a0c9db1f37af98c39 (last modified on Fri May 31 16:00:33 2024).\n"
|
122 |
+
]
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"data": {
|
126 |
+
"text/plain": [
|
127 |
+
" diff \\\n",
|
128 |
+
"0 [{'change_type': 'MODIFY', 'old_path': 'cupy/c... \n",
|
129 |
+
"1 [{'change_type': 'MODIFY', 'old_path': 'tests/... \n",
|
130 |
+
"2 [{'change_type': 'MODIFY', 'old_path': 'numpy/... \n",
|
131 |
+
"3 [{'change_type': 'MODIFY', 'old_path': 'numpy/... \n",
|
132 |
+
"4 [{'change_type': 'MODIFY', 'old_path': 'numpy/... \n",
|
133 |
+
"\n",
|
134 |
+
" prediction_current \n",
|
135 |
+
"0 Extend memory management to consider CUDA stre... \n",
|
136 |
+
"1 Implement utility methods for parameterized te... \n",
|
137 |
+
"2 Update numpy function imports to use numpy as ... \n",
|
138 |
+
"3 Switch to using internal implementation method... \n",
|
139 |
+
"4 Add type hints and refine array API wrappers\\n... "
|
140 |
+
],
|
141 |
+
"text/html": [
|
142 |
+
"<div>\n",
|
143 |
+
"<style scoped>\n",
|
144 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
145 |
+
" vertical-align: middle;\n",
|
146 |
+
" }\n",
|
147 |
+
"\n",
|
148 |
+
" .dataframe tbody tr th {\n",
|
149 |
+
" vertical-align: top;\n",
|
150 |
+
" }\n",
|
151 |
+
"\n",
|
152 |
+
" .dataframe thead th {\n",
|
153 |
+
" text-align: right;\n",
|
154 |
+
" }\n",
|
155 |
+
"</style>\n",
|
156 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
157 |
+
" <thead>\n",
|
158 |
+
" <tr style=\"text-align: right;\">\n",
|
159 |
+
" <th></th>\n",
|
160 |
+
" <th>diff</th>\n",
|
161 |
+
" <th>prediction_current</th>\n",
|
162 |
+
" </tr>\n",
|
163 |
+
" </thead>\n",
|
164 |
+
" <tbody>\n",
|
165 |
+
" <tr>\n",
|
166 |
+
" <th>0</th>\n",
|
167 |
+
" <td>[{'change_type': 'MODIFY', 'old_path': 'cupy/c...</td>\n",
|
168 |
+
" <td>Extend memory management to consider CUDA stre...</td>\n",
|
169 |
+
" </tr>\n",
|
170 |
+
" <tr>\n",
|
171 |
+
" <th>1</th>\n",
|
172 |
+
" <td>[{'change_type': 'MODIFY', 'old_path': 'tests/...</td>\n",
|
173 |
+
" <td>Implement utility methods for parameterized te...</td>\n",
|
174 |
+
" </tr>\n",
|
175 |
+
" <tr>\n",
|
176 |
+
" <th>2</th>\n",
|
177 |
+
" <td>[{'change_type': 'MODIFY', 'old_path': 'numpy/...</td>\n",
|
178 |
+
" <td>Update numpy function imports to use numpy as ...</td>\n",
|
179 |
+
" </tr>\n",
|
180 |
+
" <tr>\n",
|
181 |
+
" <th>3</th>\n",
|
182 |
+
" <td>[{'change_type': 'MODIFY', 'old_path': 'numpy/...</td>\n",
|
183 |
+
" <td>Switch to using internal implementation method...</td>\n",
|
184 |
+
" </tr>\n",
|
185 |
+
" <tr>\n",
|
186 |
+
" <th>4</th>\n",
|
187 |
+
" <td>[{'change_type': 'MODIFY', 'old_path': 'numpy/...</td>\n",
|
188 |
+
" <td>Add type hints and refine array API wrappers\\n...</td>\n",
|
189 |
+
" </tr>\n",
|
190 |
+
" </tbody>\n",
|
191 |
+
"</table>\n",
|
192 |
+
"</div>"
|
193 |
+
]
|
194 |
+
},
|
195 |
+
"execution_count": 5,
|
196 |
+
"metadata": {},
|
197 |
+
"output_type": "execute_result"
|
198 |
+
}
|
199 |
+
],
|
200 |
+
"execution_count": 5
|
201 |
+
},
|
202 |
+
{
|
203 |
+
"metadata": {
|
204 |
+
"ExecuteTime": {
|
205 |
+
"end_time": "2024-06-20T16:21:20.410778Z",
|
206 |
+
"start_time": "2024-06-20T16:09:22.227258Z"
|
207 |
+
}
|
208 |
+
},
|
209 |
+
"cell_type": "code",
|
210 |
+
"source": "DATA[\"prediction_prod\"] = DATA.progress_apply(lambda row: generate_commit_message_prod(str(row[\"diff\"])), axis=1)",
|
211 |
+
"id": "9ded493e087f991d",
|
212 |
+
"outputs": [
|
213 |
+
{
|
214 |
+
"name": "stderr",
|
215 |
+
"output_type": "stream",
|
216 |
+
"text": [
|
217 |
+
"100%|ββββββββββ| 163/163 [11:58<00:00, 4.41s/it]\n"
|
218 |
+
]
|
219 |
+
}
|
220 |
+
],
|
221 |
+
"execution_count": 6
|
222 |
+
},
|
223 |
+
{
|
224 |
+
"metadata": {
|
225 |
+
"ExecuteTime": {
|
226 |
+
"end_time": "2024-06-20T16:21:20.426781Z",
|
227 |
+
"start_time": "2024-06-20T16:21:20.414781Z"
|
228 |
+
}
|
229 |
+
},
|
230 |
+
"cell_type": "code",
|
231 |
+
"source": [
|
232 |
+
"current_avg_length = DATA[\"prediction_current\"].str.len().mean()\n",
|
233 |
+
"print(f\"Current average length: {current_avg_length}\")"
|
234 |
+
],
|
235 |
+
"id": "ad38c2dce387f26d",
|
236 |
+
"outputs": [
|
237 |
+
{
|
238 |
+
"name": "stdout",
|
239 |
+
"output_type": "stream",
|
240 |
+
"text": [
|
241 |
+
"Current average length: 625.5644171779142\n"
|
242 |
+
]
|
243 |
+
}
|
244 |
+
],
|
245 |
+
"execution_count": 7
|
246 |
+
},
|
247 |
+
{
|
248 |
+
"metadata": {
|
249 |
+
"ExecuteTime": {
|
250 |
+
"end_time": "2024-06-20T16:21:20.442017Z",
|
251 |
+
"start_time": "2024-06-20T16:21:20.429913Z"
|
252 |
+
}
|
253 |
+
},
|
254 |
+
"cell_type": "code",
|
255 |
+
"source": [
|
256 |
+
"prod_avg_length = DATA[\"prediction_prod\"].str.len().mean()\n",
|
257 |
+
"print(f\"Prod average length: {prod_avg_length}\")"
|
258 |
+
],
|
259 |
+
"id": "ec8b4412410794a4",
|
260 |
+
"outputs": [
|
261 |
+
{
|
262 |
+
"name": "stdout",
|
263 |
+
"output_type": "stream",
|
264 |
+
"text": [
|
265 |
+
"Prod average length: 352.88957055214723\n"
|
266 |
+
]
|
267 |
+
}
|
268 |
+
],
|
269 |
+
"execution_count": 8
|
270 |
+
},
|
271 |
+
{
|
272 |
+
"metadata": {
|
273 |
+
"ExecuteTime": {
|
274 |
+
"end_time": "2024-06-20T16:21:20.457884Z",
|
275 |
+
"start_time": "2024-06-20T16:21:20.444852Z"
|
276 |
+
}
|
277 |
+
},
|
278 |
+
"cell_type": "code",
|
279 |
+
"source": "print(f\"Length ratio (current / prod): {current_avg_length / prod_avg_length})\")",
|
280 |
+
"id": "10f087784896eca3",
|
281 |
+
"outputs": [
|
282 |
+
{
|
283 |
+
"name": "stdout",
|
284 |
+
"output_type": "stream",
|
285 |
+
"text": [
|
286 |
+
"Length ratio (current / prod): 1.772691712591923)\n"
|
287 |
+
]
|
288 |
+
}
|
289 |
+
],
|
290 |
+
"execution_count": 9
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"metadata": {
|
294 |
+
"kernelspec": {
|
295 |
+
"display_name": "Python 3",
|
296 |
+
"language": "python",
|
297 |
+
"name": "python3"
|
298 |
+
},
|
299 |
+
"language_info": {
|
300 |
+
"codemirror_mode": {
|
301 |
+
"name": "ipython",
|
302 |
+
"version": 2
|
303 |
+
},
|
304 |
+
"file_extension": ".py",
|
305 |
+
"mimetype": "text/x-python",
|
306 |
+
"name": "python",
|
307 |
+
"nbconvert_exporter": "python",
|
308 |
+
"pygments_lexer": "ipython2",
|
309 |
+
"version": "2.7.6"
|
310 |
+
}
|
311 |
+
},
|
312 |
+
"nbformat": 4,
|
313 |
+
"nbformat_minor": 5
|
314 |
+
}
|
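A toy recreation of what the notebook above measures, with a made-up template and messages: substitute the diff into the production prompt via the same $diff/$text placeholders, then compare average prediction lengths.

import pandas as pd

# Made-up stand-in for the contents of data/prod_prompt.txt.
PROD_PROMPT = "Summarize the following diff as a commit message:\n$diff\n$text"

def prod_prompt(diff: str) -> str:
    return PROD_PROMPT.replace("$diff", diff).replace("$text", "")

data = pd.DataFrame({
    "prediction_current": ["A long, detailed multi-line commit message body."],
    "prediction_prod": ["Short message."],
})
ratio = data["prediction_current"].str.len().mean() / data["prediction_prod"].str.len().mean()
print(f"Length ratio (current / prod): {ratio}")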
generation_steps/__init__.py
ADDED
File without changes
|
generation_steps/examples.py
ADDED
@@ -0,0 +1,51 @@
1 |
+
import config
|
2 |
+
from api_wrappers import hf_data_loader
|
3 |
+
|
4 |
+
N_EXAMPLES = 15
|
5 |
+
|
6 |
+
|
7 |
+
def get_example_prompt_end_to_start(start_msg, end_msg):
|
8 |
+
return f"""START OF THE EXAMPLE
|
9 |
+
|
10 |
+
For the following edited commit message:
|
11 |
+
START OF THE EDITED COMMIT MESSAGE
|
12 |
+
{end_msg}
|
13 |
+
END OF THE EDITED COMMIT MESSAGE
|
14 |
+
|
15 |
+
You would output the following initial commit message:
|
16 |
+
START OF THE INITIAL COMMIT MESSAGE
|
17 |
+
{start_msg}
|
18 |
+
END OF THE INITIAL COMMIT MESSAGE
|
19 |
+
|
20 |
+
END OF THE EXAMPLE"""
|
21 |
+
|
22 |
+
|
23 |
+
def get_example_prompt_start_to_end(start_msg, end_msg):
|
24 |
+
return f"""START OF THE EXAMPLE
|
25 |
+
|
26 |
+
For the following LLM-generated commit message:
|
27 |
+
START OF THE GENERATED COMMIT MESSAGE
|
28 |
+
{start_msg}
|
29 |
+
END OF THE GENERATED COMMIT MESSAGE
|
30 |
+
|
31 |
+
You would output the following improved commit message:
|
32 |
+
START OF THE IMPROVED COMMIT MESSAGE
|
33 |
+
{end_msg}
|
34 |
+
END OF THE IMPROVED COMMIT MESSAGE
|
35 |
+
|
36 |
+
END OF THE EXAMPLE"""
|
37 |
+
|
38 |
+
|
39 |
+
manual_df = hf_data_loader.load_raw_rewriting_as_pandas()[["commit_msg_start", "commit_msg_end"]]
|
40 |
+
manual_df = manual_df.sample(n=N_EXAMPLES, random_state=config.RANDOM_STATE)
|
41 |
+
|
42 |
+
|
43 |
+
def generate_examples(end_to_start):
|
44 |
+
prompt_fn = get_example_prompt_end_to_start if end_to_start else get_example_prompt_start_to_end
|
45 |
+
examples = [prompt_fn(row["commit_msg_start"], row["commit_msg_end"]) for _, row in manual_df.iterrows()]
|
46 |
+
|
47 |
+
return "\n".join(examples)
|
48 |
+
|
49 |
+
|
50 |
+
EXAMPLES_END_TO_START = generate_examples(end_to_start=True)
|
51 |
+
EXAMPLES_START_TO_END = generate_examples(end_to_start=False)
|
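A sketch of how the few-shot block above is assembled, with the Hugging Face dataset replaced by made-up rows (importing examples.py directly would trigger the dataset download at import time) and a trimmed prompt template.

import pandas as pd

def example_prompt(start_msg: str, end_msg: str) -> str:
    # Trimmed stand-in for get_example_prompt_start_to_end above.
    return f"START OF THE EXAMPLE\nGenerated: {start_msg}\nImproved: {end_msg}\nEND OF THE EXAMPLE"

toy_df = pd.DataFrame({
    "commit_msg_start": ["add stuff"],
    "commit_msg_end": ["feat: add config loader"],
})
print("\n".join(example_prompt(r.commit_msg_start, r.commit_msg_end) for r in toy_df.itertuples()))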
generation_steps/for_labeling.py
ADDED
@@ -0,0 +1,58 @@
1 |
+
import json
|
2 |
+
|
3 |
+
from tqdm import tqdm
|
4 |
+
|
5 |
+
import config
|
6 |
+
from api_wrappers import hf_data_loader
|
7 |
+
from generation_steps import synthetic_forward
|
8 |
+
|
9 |
+
|
10 |
+
def transform(df):
|
11 |
+
print("Generating data for labeling:")
|
12 |
+
synthetic_forward.print_config()
|
13 |
+
tqdm.pandas()
|
14 |
+
|
15 |
+
manual_df = hf_data_loader.load_raw_rewriting_as_pandas()
|
16 |
+
|
17 |
+
manual_df = manual_df.sample(frac=1, random_state=config.RANDOM_STATE).set_index(["hash", "repo"])[
|
18 |
+
["commit_msg_start", "commit_msg_end"]
|
19 |
+
]
|
20 |
+
|
21 |
+
manual_df = manual_df[~manual_df.index.duplicated(keep="first")]
|
22 |
+
|
23 |
+
def get_is_manually_rewritten(row):
|
24 |
+
commit_id = (row["hash"], row["repo"])
|
25 |
+
return commit_id in manual_df.index
|
26 |
+
|
27 |
+
result = df
|
28 |
+
result["manual_sample"] = result.progress_apply(get_is_manually_rewritten, axis=1)
|
29 |
+
|
30 |
+
def get_prediction_message(row):
|
31 |
+
commit_id = (row["hash"], row["repo"])
|
32 |
+
if row["manual_sample"]:
|
33 |
+
return manual_df.loc[commit_id]["commit_msg_start"]
|
34 |
+
return row["prediction"]
|
35 |
+
|
36 |
+
def get_enhanced_message(row):
|
37 |
+
commit_id = (row["hash"], row["repo"])
|
38 |
+
if row["manual_sample"]:
|
39 |
+
return manual_df.loc[commit_id]["commit_msg_end"]
|
40 |
+
return synthetic_forward.generate_end_msg(start_msg=row["prediction"], diff=row["mods"])
|
41 |
+
|
42 |
+
result["enhanced"] = result.progress_apply(get_enhanced_message, axis=1)
|
43 |
+
result["prediction"] = result.progress_apply(get_prediction_message, axis=1)
|
44 |
+
result["mods"] = result["mods"].progress_apply(json.dumps)
|
45 |
+
|
46 |
+
result.to_csv(config.DATA_FOR_LABELING_ARTIFACT)
|
47 |
+
print("Done")
|
48 |
+
return result
|
49 |
+
|
50 |
+
|
51 |
+
def main():
|
52 |
+
synthetic_forward.GENERATION_ATTEMPTS = 3
|
53 |
+
df = hf_data_loader.load_full_commit_with_predictions_as_pandas()
|
54 |
+
transform(df)
|
55 |
+
|
56 |
+
|
57 |
+
if __name__ == "__main__":
|
58 |
+
main()
|
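A small sketch, on made-up rows, of the MultiIndex tricks transform() above relies on: drop duplicate (hash, repo) commits keeping the first row, then test membership with a tuple key.

import pandas as pd

df = pd.DataFrame({
    "hash": ["a1", "a1", "b2"],
    "repo": ["r", "r", "r"],
    "commit_msg_start": ["x", "y", "z"],
}).set_index(["hash", "repo"])

deduped = df[~df.index.duplicated(keep="first")]
print(len(deduped))                   # 2: the second ("a1", "r") row is dropped
print(("a1", "r") in deduped.index)   # True -- the membership test used above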
generation_steps/metrics_analysis.py
ADDED
@@ -0,0 +1,94 @@
1 |
+
import evaluate
|
2 |
+
from rapidfuzz.distance.Levenshtein import distance, normalized_similarity
|
3 |
+
|
4 |
+
import config
|
5 |
+
|
6 |
+
BLEU = evaluate.load("saridormi/b_norm", cache_dir=config.CACHE_DIR)
|
7 |
+
|
8 |
+
|
9 |
+
def bleu_fn(pred, ref, **kwargs):
|
10 |
+
if "refs" in kwargs:
|
11 |
+
return BLEU.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["b_norm"]
|
12 |
+
return BLEU.compute(predictions=[pred], references=[ref])["b_norm"]
|
13 |
+
|
14 |
+
|
15 |
+
METEOR = evaluate.load("meteor", cache_dir=config.CACHE_DIR)
|
16 |
+
|
17 |
+
|
18 |
+
def meteor_fn(pred, ref, **kwargs):
|
19 |
+
if "refs" in kwargs:
|
20 |
+
return METEOR.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["meteor"]
|
21 |
+
return METEOR.compute(predictions=[pred], references=[ref])["meteor"]
|
22 |
+
|
23 |
+
|
24 |
+
ROUGE = evaluate.load("rouge", cache_dir=config.CACHE_DIR)
|
25 |
+
|
26 |
+
|
27 |
+
def rouge1_fn(pred, ref, **kwargs):
|
28 |
+
if "refs" in kwargs:
|
29 |
+
return ROUGE.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["rouge1"]
|
30 |
+
return ROUGE.compute(predictions=[pred], references=[ref])["rouge1"]
|
31 |
+
|
32 |
+
|
33 |
+
def rouge2_fn(pred, ref, **kwargs):
|
34 |
+
if "refs" in kwargs:
|
35 |
+
return ROUGE.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["rouge2"]
|
36 |
+
return ROUGE.compute(predictions=[pred], references=[ref])["rouge2"]
|
37 |
+
|
38 |
+
|
39 |
+
def rougeL_fn(pred, ref, **kwargs):
|
40 |
+
if "refs" in kwargs:
|
41 |
+
return ROUGE.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["rougeL"]
|
42 |
+
return ROUGE.compute(predictions=[pred], references=[ref])["rougeL"]
|
43 |
+
|
44 |
+
|
45 |
+
BERTSCORE = evaluate.load("bertscore", cache_dir=config.CACHE_DIR)
|
46 |
+
|
47 |
+
|
48 |
+
def bertscore_fn(pred, ref, **kwargs):
|
49 |
+
if "refs" in kwargs:
|
50 |
+
return BERTSCORE.compute(predictions=[pred], references=[kwargs["refs"]], model_type="distilbert-base-uncased")[
|
51 |
+
"f1"
|
52 |
+
][0]
|
53 |
+
return BERTSCORE.compute(predictions=[pred], references=[ref], model_type="distilbert-base-uncased")["f1"][0]
|
54 |
+
|
55 |
+
|
56 |
+
CHRF = evaluate.load("chrf")
|
57 |
+
|
58 |
+
|
59 |
+
def chrf_fn(pred, ref, **kwargs):
|
60 |
+
if "refs" in kwargs:
|
61 |
+
return CHRF.compute(predictions=[pred], references=[kwargs["refs"]])["score"]
|
62 |
+
return CHRF.compute(predictions=[pred], references=[[ref]])["score"]
|
63 |
+
|
64 |
+
|
65 |
+
def edit_distance_fn(pred, ref, **kwargs):
|
66 |
+
if "refs" in kwargs:
|
67 |
+
scores = [distance(pred, ref) for ref in kwargs["refs"]]
|
68 |
+
return sum(scores) / len(scores)
|
69 |
+
return distance(pred, ref)
|
70 |
+
|
71 |
+
|
72 |
+
def edit_distance_norm_fn(pred, ref, **kwargs):
|
73 |
+
if "refs" in kwargs:
|
74 |
+
scores = [normalized_similarity(pred, ref) for ref in kwargs["refs"]]
|
75 |
+
return sum(scores) / len(scores)
|
76 |
+
return normalized_similarity(pred, ref)
|
77 |
+
|
78 |
+
|
79 |
+
AGGR_METRICS = {
|
80 |
+
"editdist": edit_distance_fn,
|
81 |
+
"editsim": edit_distance_norm_fn,
|
82 |
+
"bleu": bleu_fn,
|
83 |
+
"meteor": meteor_fn,
|
84 |
+
"rouge1": rouge1_fn,
|
85 |
+
"rouge2": rouge2_fn,
|
86 |
+
"rougeL": rougeL_fn,
|
87 |
+
"bertscore": bertscore_fn,
|
88 |
+
"chrF": chrf_fn,
|
89 |
+
}
|
90 |
+
|
91 |
+
|
92 |
+
REL_METRICS = {
|
93 |
+
"editdist": edit_distance_fn,
|
94 |
+
}
|
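A minimal sketch of the two rapidfuzz-based metrics above, which need no model downloads (unlike the evaluate-based ones); the prediction/reference pair is made up.

from rapidfuzz.distance.Levenshtein import distance, normalized_similarity

pred = "fix: handle empty diff"
ref = "fix: handle empty diffs"
print(distance(pred, ref))               # absolute edit distance: 1 here
print(normalized_similarity(pred, ref))  # in [0, 1]; close to 1 for near-identical strings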
generation_steps/synthetic_backward.py
ADDED
@@ -0,0 +1,111 @@
1 |
+
from itertools import chain
|
2 |
+
|
3 |
+
import pandas as pd
|
4 |
+
from tqdm import tqdm
|
5 |
+
|
6 |
+
import config
|
7 |
+
import dataset_statistics
|
8 |
+
from api_wrappers import grazie_wrapper, hf_data_loader
|
9 |
+
from generation_steps import examples
|
10 |
+
|
11 |
+
GENERATION_MULTIPLIER = 3
|
12 |
+
REL_INSERTIONS_THRESHOLD = 0.5
|
13 |
+
GENERATION_ATTEMPTS = 3
|
14 |
+
|
15 |
+
|
16 |
+
def build_prompt(reference, diff):
|
17 |
+
return f"""A software developer uses a LLM to generate commit messages.
|
18 |
+
|
19 |
+
They generated a commit message for the following source code changes:
|
20 |
+
START OF THE SOURCE CODE CHANGES
|
21 |
+
{diff}
|
22 |
+
END OF THE SOURCE CODE CHANGES
|
23 |
+
|
24 |
+
After generating the commit message, the developer understands that it is not perfect. After making some changes,
|
25 |
+
they come up with an edited version of the message. Here is this edited message:
|
26 |
+
START OF THE COMMIT MESSAGE
|
27 |
+
{reference}
|
28 |
+
END OF THE COMMIT MESSAGE
|
29 |
+
|
30 |
+
Your task is to print the initial, LLM-generated commit message.
|
31 |
+
The message you print must share some fragments with the edited message.
|
32 |
+
Here are some examples of what you should output:
|
33 |
+
START OF THE EXAMPLES LIST
|
34 |
+
{examples.EXAMPLES_END_TO_START}
|
35 |
+
END OF THE EXAMPLES LIST
|
36 |
+
|
37 |
+
|
38 |
+
Print only the initial commit message's text after the
|
39 |
+
token "OUTPUT".
|
40 |
+
|
41 |
+
OUTPUT"""
|
42 |
+
|
43 |
+
|
44 |
+
def generate_start_msg(end_msg, diff):
|
45 |
+
prompt = build_prompt(reference=end_msg, diff=diff)
|
46 |
+
results = []
|
47 |
+
|
48 |
+
for i in range(GENERATION_ATTEMPTS):
|
49 |
+
start_msg_pred = grazie_wrapper.generate_for_prompt(prompt)
|
50 |
+
|
51 |
+
stats = dataset_statistics.get_statistics_for_sample(
|
52 |
+
start_msg=start_msg_pred,
|
53 |
+
end_msg=end_msg,
|
54 |
+
)
|
55 |
+
|
56 |
+
if stats["insertions"] < REL_INSERTIONS_THRESHOLD:
|
57 |
+
return start_msg_pred
|
58 |
+
else:
|
59 |
+
results.append((stats["insertions"], start_msg_pred))
|
60 |
+
|
61 |
+
results.sort()
|
62 |
+
return results[0][1]
|
63 |
+
|
64 |
+
|
65 |
+
COLS_TO_KEEP = ["hash", "repo", "commit_msg_end", "mods", "session"]
|
66 |
+
|
67 |
+
COLS_TO_DEFAULT = {"edit_time": None}
|
68 |
+
|
69 |
+
|
70 |
+
def transform(df):
|
71 |
+
print("End -> start synthesis:")
|
72 |
+
print(f"NUMBER OF EXAMPLES PER PROMPT = {examples.N_EXAMPLES}")
|
73 |
+
print(f"GENERATION_MULTIPLIER = {GENERATION_MULTIPLIER}")
|
74 |
+
print(f"REL_INSERTIONS_THRESHOLD = {REL_INSERTIONS_THRESHOLD}")
|
75 |
+
print(f"GENERATION_ATTEMPTS = {GENERATION_ATTEMPTS}")
|
76 |
+
|
77 |
+
df["end_to_start"] = False
|
78 |
+
|
79 |
+
generated_data = {"commit_msg_start": []}
|
80 |
+
|
81 |
+
for col in chain(COLS_TO_KEEP, COLS_TO_DEFAULT):
|
82 |
+
generated_data[col] = []
|
83 |
+
|
84 |
+
for _, row in tqdm(df.iterrows(), total=len(df)):
|
85 |
+
for i in range(GENERATION_MULTIPLIER):
|
86 |
+
commit_msg_start_pred = generate_start_msg(end_msg=row["commit_msg_end"], diff=row["mods"])
|
87 |
+
|
88 |
+
generated_data["commit_msg_start"].append(commit_msg_start_pred)
|
89 |
+
for col in COLS_TO_KEEP:
|
90 |
+
generated_data[col].append(row[col])
|
91 |
+
|
92 |
+
for col in COLS_TO_DEFAULT:
|
93 |
+
generated_data[col].append(COLS_TO_DEFAULT[col])
|
94 |
+
|
95 |
+
generated_df = pd.DataFrame.from_dict(generated_data)
|
96 |
+
generated_df["end_to_start"] = True
|
97 |
+
|
98 |
+
result = pd.concat([df, generated_df], ignore_index=True)
|
99 |
+
result.to_csv(config.END_TO_START_ARTIFACT)
|
100 |
+
|
101 |
+
print("Done")
|
102 |
+
return result
|
103 |
+
|
104 |
+
|
105 |
+
def main():
|
106 |
+
df = hf_data_loader.load_processed_rewriting_as_pandas()
|
107 |
+
transform(df)
|
108 |
+
|
109 |
+
|
110 |
+
if __name__ == "__main__":
|
111 |
+
main()
|
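The core of generate_start_msg above is a retry-with-fallback loop: accept the first candidate whose score is under the threshold, otherwise return the best-scoring attempt. A self-contained sketch with stubbed generation and scoring (stand-ins for grazie_wrapper.generate_for_prompt and the dataset statistics):

def pick_candidate(generate, score, attempts=3, threshold=0.5):
    results = []
    for _ in range(attempts):
        candidate = generate()
        s = score(candidate)
        if s < threshold:
            return candidate        # early accept, as in generate_start_msg
        results.append((s, candidate))
    results.sort()
    return results[0][1]            # fallback: lowest-scoring attempt

# Stubs for illustration only.
candidates = iter(["a rather long candidate", "short one", "ok"])
print(pick_candidate(lambda: next(candidates), lambda c: len(c) / 20))  # "short one"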
generation_steps/synthetic_forward.py
ADDED
@@ -0,0 +1,107 @@
1 |
+
import pandas as pd
|
2 |
+
from tqdm import tqdm
|
3 |
+
|
4 |
+
import config
|
5 |
+
import dataset_statistics
|
6 |
+
from api_wrappers import grazie_wrapper
|
7 |
+
from generation_steps import examples
|
8 |
+
|
9 |
+
GENERATION_MULTIPLIER = 3
|
10 |
+
REL_DELETIONS_THRESHOLD = 0.75
|
11 |
+
GENERATION_ATTEMPTS = 3
|
12 |
+
|
13 |
+
|
14 |
+
def build_prompt(prediction, diff):
|
15 |
+
return f"""A LLM generated a commit message for the following source code changes:
|
16 |
+
START OF THE SOURCE CODE CHANGES
|
17 |
+
{diff}
|
18 |
+
END OF THE SOURCE CODE CHANGES
|
19 |
+
|
20 |
+
Here is the message the LLM generated:
|
21 |
+
START OF THE COMMIT MESSAGE
|
22 |
+
{prediction}
|
23 |
+
END OF THE COMMIT MESSAGE
|
24 |
+
|
25 |
+
This generated message is not perfect. Your task is to rewrite and improve it.
|
26 |
+
You have to simulate a human software developer who manually rewrites the LLM-generated commit message,
|
27 |
+
so the message you print must share some fragments with the generated message.
|
28 |
+
Your message should be concise.
|
29 |
+
Follow the Conventional Commits guidelines.
|
30 |
+
Here are some examples of what you should output:
|
31 |
+
START OF THE EXAMPLES LIST
|
32 |
+
{examples.EXAMPLES_START_TO_END}
|
33 |
+
END OF THE EXAMPLES LIST
|
34 |
+
|
35 |
+
|
36 |
+
Print only the improved commit message's text after the
|
37 |
+
token "OUTPUT".
|
38 |
+
|
39 |
+
OUTPUT"""
|
40 |
+
|
41 |
+
|
42 |
+
def generate_end_msg(start_msg, diff):
|
43 |
+
prompt = build_prompt(prediction=start_msg, diff=diff)
|
44 |
+
results = []
|
45 |
+
|
46 |
+
for i in range(GENERATION_ATTEMPTS):
|
47 |
+
end_msg_pred = grazie_wrapper.generate_for_prompt(prompt)
|
48 |
+
|
49 |
+
stats = dataset_statistics.get_statistics_for_sample(
|
50 |
+
start_msg=start_msg,
|
51 |
+
end_msg=end_msg_pred,
|
52 |
+
)
|
53 |
+
if stats["deletions"] < REL_DELETIONS_THRESHOLD:
|
54 |
+
return end_msg_pred
|
55 |
+
else:
|
56 |
+
results.append((stats["deletions"], end_msg_pred))
|
57 |
+
|
58 |
+
results.sort()
|
59 |
+
return results[0][1]
|
60 |
+
|
61 |
+
|
62 |
+
COLS_TO_KEEP = ["hash", "repo", "commit_msg_start", "mods", "session", "end_to_start"]
|
63 |
+
|
64 |
+
|
65 |
+
def print_config():
|
66 |
+
print(f"NUMBER OF EXAMPLES PER PROMPT = {examples.N_EXAMPLES}")
|
67 |
+
print(f"GENERATION_MULTIPLIER = {GENERATION_MULTIPLIER}")
|
68 |
+
print(f"REL_DELETIONS_THRESHOLD = {REL_DELETIONS_THRESHOLD}")
|
69 |
+
print(f"GENERATION_ATTEMPTS = {GENERATION_ATTEMPTS}")
|
70 |
+
|
71 |
+
|
72 |
+
def transform(df):
|
73 |
+
print("Start -> send synthesis:")
|
74 |
+
print_config()
|
75 |
+
|
76 |
+
df["start_to_end"] = False
|
77 |
+
|
78 |
+
generated_data = {"commit_msg_end": []}
|
79 |
+
|
80 |
+
for col in COLS_TO_KEEP:
|
81 |
+
generated_data[col] = []
|
82 |
+
|
83 |
+
for _, row in tqdm(df.iterrows(), total=len(df)):
|
84 |
+
for i in range(GENERATION_MULTIPLIER):
|
85 |
+
commit_msg_end_pred = generate_end_msg(start_msg=row["commit_msg_start"], diff=row["mods"])
|
86 |
+
|
87 |
+
generated_data["commit_msg_end"].append(commit_msg_end_pred)
|
88 |
+
for col in COLS_TO_KEEP:
|
89 |
+
generated_data[col].append(row[col])
|
90 |
+
|
91 |
+
generated_df = pd.DataFrame.from_dict(generated_data)
|
92 |
+
generated_df["start_to_end"] = True
|
93 |
+
|
94 |
+
result = pd.concat([df, generated_df], ignore_index=True)
|
95 |
+
result.to_csv(config.START_TO_END_ARTIFACT)
|
96 |
+
|
97 |
+
print("Done")
|
98 |
+
return result
|
99 |
+
|
100 |
+
|
101 |
+
def main():
|
102 |
+
df = pd.read_csv(config.END_TO_START_ARTIFACT, index_col=[0])
|
103 |
+
transform(df)
|
104 |
+
|
105 |
+
|
106 |
+
if __name__ == "__main__":
|
107 |
+
main()
|
metrics_analysis.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
poetry.lock
ADDED
The diff for this file is too large to render.
See raw diff
|
|
pyproject.toml
ADDED
@@ -0,0 +1,199 @@
1 |
+
[tool.poetry]
|
2 |
+
name = "commit-message-editing-visualization"
|
3 |
+
version = "0.1.0"
|
4 |
+
description = "Utilities for synthetic data generation, metrics analysis and visualization space for CMG Evaluaton."
|
5 |
+
authors = ["Your Name <[email protected]>"]
|
6 |
+
license = "MIT"
|
7 |
+
|
8 |
+
[tool.poetry.dependencies]
|
9 |
+
python = "^3.9"
|
10 |
+
absl-py = "2.1.0"
|
11 |
+
aiofiles = "23.2.1"
|
12 |
+
aiohttp = "3.9.3"
|
13 |
+
aiosignal = "1.3.1"
|
14 |
+
altair = "5.3.0"
|
15 |
+
annotated-types = "0.6.0"
|
16 |
+
anyio = "4.3.0"
|
17 |
+
argon2-cffi = "23.1.0"
|
18 |
+
argon2-cffi-bindings = "21.2.0"
|
19 |
+
arrow = "1.3.0"
|
20 |
+
asttokens = "2.4.1"
|
21 |
+
async-lru = "2.0.4"
|
22 |
+
async-timeout = "4.0.3"
|
23 |
+
attrs = "23.2.0"
|
24 |
+
Babel = "2.14.0"
|
25 |
+
beautifulsoup4 = "4.12.3"
|
26 |
+
bert-score = "0.3.13"
|
27 |
+
bleach = "6.1.0"
|
28 |
+
cbor2 = "5.6.2"
|
29 |
+
certifi = "2024.2.2"
|
30 |
+
cffi = "1.16.0"
|
31 |
+
charset-normalizer = "3.3.2"
|
32 |
+
click = "8.1.7"
|
33 |
+
colorama = "0.4.6"
|
34 |
+
comm = "0.2.2"
|
35 |
+
contourpy = "1.2.1"
|
36 |
+
cycler = "0.12.1"
|
37 |
+
datasets = "2.18.0"
|
38 |
+
debugpy = "1.8.1"
|
39 |
+
decorator = "5.1.1"
|
40 |
+
defusedxml = "0.7.1"
|
41 |
+
diff-match-patch = "20230430"
|
42 |
+
dill = "0.3.8"
|
43 |
+
evaluate = "0.4.1"
|
44 |
+
exceptiongroup = "1.2.0"
|
45 |
+
executing = "2.0.1"
|
46 |
+
fastapi = "0.110.1"
|
47 |
+
fastjsonschema = "2.19.1"
|
48 |
+
ffmpy = "0.3.2"
|
49 |
+
filelock = "3.13.3"
|
50 |
+
fonttools = "4.50.0"
|
51 |
+
fqdn = "1.5.1"
|
52 |
+
frozenlist = "1.4.1"
|
53 |
+
fsspec = "2024.2.0"
|
54 |
+
gradio = "4.25.0"
|
55 |
+
gradio_client = "0.15.0"
|
56 |
+
h11 = "0.14.0"
|
57 |
+
httpcore = "1.0.5"
|
58 |
+
httpx = "0.27.0"
|
59 |
+
huggingface-hub = "0.22.2"
|
60 |
+
idna = "3.6"
|
61 |
+
importlib_metadata = "7.1.0"
|
62 |
+
importlib_resources = "6.4.0"
|
63 |
+
ipykernel = "6.29.4"
|
64 |
+
ipython = "8.18.1"
|
65 |
+
ipywidgets = "8.1.2"
|
66 |
+
isoduration = "20.11.0"
|
67 |
+
jedi = "0.19.1"
|
68 |
+
Jinja2 = "3.1.3"
|
69 |
+
joblib = "1.4.0"
|
70 |
+
json5 = "0.9.25"
|
71 |
+
jsonpointer = "2.4"
|
72 |
+
jsonschema = "4.21.1"
|
73 |
+
jsonschema-specifications = "2023.12.1"
|
74 |
+
kiwisolver = "1.4.5"
|
75 |
+
lxml = "5.2.1"
|
76 |
+
markdown-it-py = "3.0.0"
|
77 |
+
MarkupSafe = "2.1.5"
|
78 |
+
matplotlib = "3.8.4"
|
79 |
+
matplotlib-inline = "0.1.7"
|
80 |
+
mdurl = "0.1.2"
|
81 |
+
mistune = "3.0.2"
|
82 |
+
mpmath = "1.3.0"
|
83 |
+
multidict = "6.0.5"
|
84 |
+
multiprocess = "0.70.16"
|
85 |
+
nbclient = "0.10.0"
|
86 |
+
nbconvert = "7.16.4"
|
87 |
+
nbformat = "5.10.4"
|
88 |
+
nest-asyncio = "1.6.0"
|
89 |
+
networkx = "3.2.1"
|
90 |
+
nltk = "3.8.1"
|
91 |
+
numpy = "1.26.4"
|
92 |
+
orjson = "3.10.0"
|
93 |
+
overrides = "7.7.0"
|
94 |
+
packaging = "24.0"
|
95 |
+
pandas = "2.2.1"
|
96 |
+
pandocfilters = "1.5.1"
|
97 |
+
parso = "0.8.4"
|
98 |
+
pillow = "10.3.0"
|
99 |
+
platformdirs = "4.2.1"
|
100 |
+
portalocker = "2.8.2"
|
101 |
+
prometheus_client = "0.20.0"
|
102 |
+
prompt-toolkit = "3.0.43"
|
103 |
+
psutil = "5.9.8"
|
104 |
+
pure-eval = "0.2.2"
|
105 |
+
pyarrow = "15.0.2"
|
106 |
+
pyarrow-hotfix = "0.6"
|
107 |
+
pycparser = "2.22"
|
108 |
+
pydantic = "2.6.4"
|
109 |
+
pydantic_core = "2.16.3"
|
110 |
+
pydub = "0.25.1"
|
111 |
+
Pygments = "2.17.2"
|
112 |
+
pyparsing = "3.1.2"
|
113 |
+
python-dateutil = "2.9.0.post0"
|
114 |
+
python-json-logger = "2.0.7"
|
115 |
+
python-multipart = "0.0.9"
|
116 |
+
pytz = "2024.1"
|
117 |
+
PyYAML = "6.0.1"
|
118 |
+
pyzmq = "26.0.2"
|
119 |
+
rapidfuzz = "3.8.1"
|
120 |
+
referencing = "0.34.0"
|
121 |
+
regex = "2023.12.25"
|
122 |
+
requests = "2.31.0"
|
123 |
+
responses = "0.18.0"
|
124 |
+
rfc3339-validator = "0.1.4"
|
125 |
+
rfc3986-validator = "0.1.1"
|
126 |
+
rich = "13.7.1"
|
127 |
+
rouge-score = "0.1.2"
|
128 |
+
rpds-py = "0.18.0"
|
129 |
+
ruff = "0.3.5"
|
130 |
+
sacrebleu = "2.4.2"
|
131 |
+
safetensors = "0.4.2"
|
132 |
+
scikit-learn = "1.4.2"
|
133 |
+
scipy = "1.13.0"
|
134 |
+
semantic-version = "2.10.0"
|
135 |
+
Send2Trash = "1.8.3"
|
136 |
+
shellingham = "1.5.4"
|
137 |
+
six = "1.16.0"
|
138 |
+
sniffio = "1.3.1"
|
139 |
+
soupsieve = "2.5"
|
140 |
+
stack-data = "0.6.3"
|
141 |
+
starlette = "0.37.2"
|
142 |
+
sympy = "1.12"
|
143 |
+
tabulate = "0.9.0"
|
144 |
+
terminado = "0.18.1"
|
145 |
+
threadpoolctl = "3.4.0"
|
146 |
+
tinycss2 = "1.3.0"
|
147 |
+
tokenizers = "0.15.2"
|
148 |
+
tomli = "2.0.1"
|
149 |
+
tomlkit = "0.12.0"
|
150 |
+
toolz = "0.12.1"
|
151 |
+
torch = "2.2.2"
|
152 |
+
tornado = "6.4"
|
153 |
+
tqdm = "4.66.2"
|
154 |
+
traitlets = "5.14.3"
|
155 |
+
transformers = "4.39.3"
|
156 |
+
typer = "0.12.1"
|
157 |
+
types-python-dateutil = "2.9.0.20240316"
|
158 |
+
typing_extensions = "4.10.0"
|
159 |
+
tzdata = "2024.1"
|
160 |
+
uri-template = "1.3.0"
|
161 |
+
urllib3 = "2.2.1"
|
162 |
+
uvicorn = "0.29.0"
|
163 |
+
wcwidth = "0.2.13"
|
164 |
+
webcolors = "1.13"
|
165 |
+
webencodings = "0.5.1"
|
166 |
+
websocket-client = "1.8.0"
|
167 |
+
websockets = "11.0.3"
|
168 |
+
widgetsnbextension = "4.0.10"
|
169 |
+
xxhash = "3.4.1"
|
170 |
+
yarl = "1.9.4"
|
171 |
+
zipp = "3.18.1"
|
172 |
+
plotly = "5.22.0"
|
173 |
+
tenacity = "8.2.3"
|
174 |
+
Levenshtein = "0.25.1"
|
175 |
+
kaleido = "0.2.1"
|
176 |
+
jupyter = "^1.0.0"
|
177 |
+
grazie-api-gateway-client = {version = "^0.1.3", source = "space-grazie-ml"}
|
178 |
+
seaborn = "^0.13.2"
|
179 |
+
|
180 |
+
[tool.ruff]
|
181 |
+
line-length = 120
|
182 |
+
target-version = "py310"
|
183 |
+
|
184 |
+
[tool.ruff.lint]
|
185 |
+
extend-select = ["I"]
|
186 |
+
|
187 |
+
[tool.isort]
|
188 |
+
profile = "black"
|
189 |
+
force_sort_within_sections = true
|
190 |
+
order_by_type = true
|
191 |
+
|
192 |
+
[[tool.poetry.source]]
|
193 |
+
name = "space-grazie-ml"
|
194 |
+
url = "https://packages.jetbrains.team/pypi/p/grazi/grazie-ml/simple"
|
195 |
+
priority="supplemental"
|
196 |
+
|
197 |
+
[build-system]
|
198 |
+
requires = ["poetry-core"]
|
199 |
+
build-backend = "poetry.core.masonry.api"
|
requirements.txt
ADDED
@@ -0,0 +1,167 @@
1 |
+
absl-py==2.1.0
|
2 |
+
aiofiles==23.2.1
|
3 |
+
aiohttp==3.9.3
|
4 |
+
aiosignal==1.3.1
|
5 |
+
altair==5.3.0
|
6 |
+
annotated-types==0.6.0
|
7 |
+
anyio==4.3.0
|
8 |
+
argon2-cffi==23.1.0
|
9 |
+
argon2-cffi-bindings==21.2.0
|
10 |
+
arrow==1.3.0
|
11 |
+
asttokens==2.4.1
|
12 |
+
async-lru==2.0.4
|
13 |
+
async-timeout==4.0.3
|
14 |
+
attrs==23.2.0
|
15 |
+
Babel==2.14.0
|
16 |
+
beautifulsoup4==4.12.3
|
17 |
+
bert-score==0.3.13
|
18 |
+
bleach==6.1.0
|
19 |
+
cbor2==5.6.2
|
20 |
+
certifi==2024.2.2
|
21 |
+
cffi==1.16.0
|
22 |
+
charset-normalizer==3.3.2
|
23 |
+
click==8.1.7
|
24 |
+
colorama==0.4.6
|
25 |
+
comm==0.2.2
|
26 |
+
contourpy==1.2.1
|
27 |
+
cycler==0.12.1
|
28 |
+
datasets==2.18.0
|
29 |
+
debugpy==1.8.1
|
30 |
+
decorator==5.1.1
|
31 |
+
defusedxml==0.7.1
|
32 |
+
diff-match-patch==20230430
|
33 |
+
dill==0.3.8
|
34 |
+
evaluate==0.4.1
|
35 |
+
exceptiongroup==1.2.0
|
36 |
+
executing==2.0.1
|
37 |
+
fastapi==0.110.1
|
38 |
+
fastjsonschema==2.19.1
|
39 |
+
ffmpy==0.3.2
|
40 |
+
filelock==3.13.3
|
41 |
+
fonttools==4.50.0
|
42 |
+
fqdn==1.5.1
|
43 |
+
frozenlist==1.4.1
|
44 |
+
fsspec==2024.2.0
|
45 |
+
gradio==4.25.0
|
46 |
+
gradio_client==0.15.0
|
47 |
+
h11==0.14.0
|
48 |
+
httpcore==1.0.5
|
49 |
+
httpx==0.27.0
|
50 |
+
huggingface-hub==0.22.2
|
51 |
+
idna==3.6
|
52 |
+
importlib_metadata==7.1.0
|
53 |
+
importlib_resources==6.4.0
|
54 |
+
ipykernel==6.29.4
|
55 |
+
ipython==8.18.1
|
56 |
+
ipywidgets==8.1.2
|
57 |
+
isoduration==20.11.0
|
58 |
+
jedi==0.19.1
|
59 |
+
Jinja2==3.1.3
|
60 |
+
joblib==1.4.0
|
61 |
+
json5==0.9.25
|
62 |
+
jsonpointer==2.4
|
63 |
+
jsonschema==4.21.1
|
64 |
+
jsonschema-specifications==2023.12.1
|
65 |
+
kiwisolver==1.4.5
|
66 |
+
lxml==5.2.1
|
67 |
+
markdown-it-py==3.0.0
|
68 |
+
MarkupSafe==2.1.5
|
69 |
+
matplotlib==3.8.4
|
70 |
+
matplotlib-inline==0.1.7
|
71 |
+
mdurl==0.1.2
|
72 |
+
mistune==3.0.2
|
73 |
+
mpmath==1.3.0
|
74 |
+
multidict==6.0.5
|
75 |
+
multiprocess==0.70.16
|
76 |
+
nbclient==0.10.0
|
77 |
+
nbconvert==7.16.4
|
78 |
+
nbformat==5.10.4
|
79 |
+
nest-asyncio==1.6.0
|
80 |
+
networkx==3.2.1
|
81 |
+
nltk==3.8.1
|
82 |
+
numpy==1.26.4
|
83 |
+
orjson==3.10.0
|
84 |
+
overrides==7.7.0
|
85 |
+
packaging==24.0
|
86 |
+
pandas==2.2.1
|
87 |
+
pandocfilters==1.5.1
|
88 |
+
parso==0.8.4
|
89 |
+
pillow==10.3.0
|
90 |
+
platformdirs==4.2.1
|
91 |
+
portalocker==2.8.2
|
92 |
+
prometheus_client==0.20.0
|
93 |
+
prompt-toolkit==3.0.43
|
94 |
+
psutil==5.9.8
|
95 |
+
pure-eval==0.2.2
|
96 |
+
pyarrow==15.0.2
|
97 |
+
pyarrow-hotfix==0.6
|
98 |
+
pycparser==2.22
|
99 |
+
pydantic==2.6.4
|
100 |
+
pydantic_core==2.16.3
|
101 |
+
pydub==0.25.1
|
102 |
+
Pygments==2.17.2
|
103 |
+
pyparsing==3.1.2
|
104 |
+
python-dateutil==2.9.0.post0
|
105 |
+
python-json-logger==2.0.7
|
106 |
+
python-multipart==0.0.9
|
107 |
+
pytz==2024.1
|
108 |
+
PyYAML==6.0.1
|
109 |
+
pyzmq==26.0.2
|
110 |
+
rapidfuzz==3.8.1
|
111 |
+
referencing==0.34.0
|
112 |
+
regex==2023.12.25
|
113 |
+
requests==2.31.0
|
114 |
+
responses==0.18.0
|
115 |
+
rfc3339-validator==0.1.4
|
116 |
+
rfc3986-validator==0.1.1
|
117 |
+
rich==13.7.1
|
118 |
+
rouge-score==0.1.2
|
119 |
+
rpds-py==0.18.0
|
120 |
+
ruff==0.3.5
|
121 |
+
sacrebleu==2.4.2
|
122 |
+
safetensors==0.4.2
|
123 |
+
scikit-learn==1.4.2
|
124 |
+
scipy==1.13.0
|
125 |
+
semantic-version==2.10.0
|
126 |
+
Send2Trash==1.8.3
|
127 |
+
shellingham==1.5.4
|
128 |
+
six==1.16.0
|
129 |
+
sniffio==1.3.1
|
130 |
+
soupsieve==2.5
|
131 |
+
stack-data==0.6.3
|
132 |
+
starlette==0.37.2
|
133 |
+
sympy==1.12
|
134 |
+
tabulate==0.9.0
|
135 |
+
terminado==0.18.1
|
136 |
+
threadpoolctl==3.4.0
|
137 |
+
tinycss2==1.3.0
|
138 |
+
tokenizers==0.15.2
|
139 |
+
tomli==2.0.1
|
140 |
+
tomlkit==0.12.0
|
141 |
+
toolz==0.12.1
|
142 |
+
torch==2.2.2
|
143 |
+
tornado==6.4
|
144 |
+
tqdm==4.66.2
|
145 |
+
traitlets==5.14.3
|
146 |
+
transformers==4.39.3
|
147 |
+
typer==0.12.1
|
148 |
+
types-python-dateutil==2.9.0.20240316
|
149 |
+
typing_extensions==4.10.0
|
150 |
+
tzdata==2024.1
|
151 |
+
uri-template==1.3.0
|
152 |
+
urllib3==2.2.1
|
153 |
+
uvicorn==0.29.0
|
154 |
+
wcwidth==0.2.13
|
155 |
+
webcolors==1.13
|
156 |
+
webencodings==0.5.1
|
157 |
+
websocket-client==1.8.0
|
158 |
+
websockets==11.0.3
|
159 |
+
widgetsnbextension==4.0.10
|
160 |
+
xxhash==3.4.1
|
161 |
+
yarl==1.9.4
|
162 |
+
zipp==3.18.1
|
163 |
+
|
164 |
+
plotly==5.22.0
|
165 |
+
tenacity==8.2.3
|
166 |
+
Levenshtein==0.25.1
|
167 |
+
kaleido==0.2.1
|
run_pipeline.py
ADDED
@@ -0,0 +1,17 @@
1 |
+
import config
|
2 |
+
from api_wrappers import hf_data_loader
|
3 |
+
from generation_steps import metrics_analysis, synthetic_backward, synthetic_forward
|
4 |
+
|
5 |
+
|
6 |
+
def run():
|
7 |
+
df = hf_data_loader.load_processed_rewriting_as_pandas()
|
8 |
+
|
9 |
+
df = synthetic_backward.transform(df)
|
10 |
+
df = synthetic_forward.transform(df)
|
11 |
+
df = metrics_analysis.transform(df)
|
12 |
+
|
13 |
+
df.to_csv(config.SYNTHETIC_DATASET_ARTIFACT)
|
14 |
+
|
15 |
+
|
16 |
+
if __name__ == "__main__":
|
17 |
+
run()
|