---
datasets:
- nuprl/EditPackFT-Multi
tags:
- code
---
# What is this
This is a DeepSeek Coder 7B model trained to predict the commit message for a given diff.
# Languages trained on:
```py
LANGS = [
    "Python",
    "Rust",
    "JavaScript",
    "Java",
    "Go",
    "C++",
    "C#",
    "Ruby",
    "PHP",
    "TypeScript",
    "C",
    "Scala",
    "Swift",
    "Kotlin",
    "Objective-C",
    "Perl",
    "Haskell",
    "Bash",
    "Sh",
    "Lua",
    "R",
    "Julia",
]
```
# How to prompt:
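The `gen` helper below assumes that `model` and `tokenizer` are already in scope. A minimal loading sketch, assuming the standard `transformers` causal-LM API (the repository id below is a placeholder, not this model's actual Hub path):
```py
# Hypothetical loading snippet: substitute the placeholder id with this model's actual Hub repository.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "your-org/deepseek-coder-7b-commit-messages"  # placeholder, not a real repo id
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
```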
```python
import difflib

class NDiff:
    def __init__(self, s1, s2):
        self.s1 = s1
        self.s2 = s2
        # materialize the diff so it can be iterated more than once
        self.diff = list(difflib.ndiff(s1.split("\n"), s2.split("\n")))

    def __str__(self):
        return "\n".join([l for l in self.diff if l[0] != "?"])

    def str_colored(self):
        import colored
        buf = ""
        for l in self.diff:
            if l[0] == "?":
                continue
            if l[0] == "-":
                buf += colored.stylize(l, colored.fg("red"))
            elif l[0] == "+":
                buf += colored.stylize(l, colored.fg("green"))
            else:
                buf += l
            buf += "\n"
        return buf

    def num_removed(self):
        return len([l for l in self.diff if l[0] == "-"])

    def num_added(self):
        return len([l for l in self.diff if l[0] == "+"])

    def __repr__(self):
        return self.__str__()

def format_prompt(old, new):
    # the model expects the line diff followed by a "<commit_message>" header
    diff_header = "<diff>"
    instr_header = "<commit_message>"
    diff = str(NDiff(old, new))
    return f"{diff_header}\n{diff}\n{instr_header}\n"

def gen(old, new, max_new_tokens=200, temperature=0.45, top_p=0.90):
    # `model` and `tokenizer` must already be loaded (see the loading sketch above)
    prompt = format_prompt(old, new)
    toks = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    outs = model.generate(toks, max_new_tokens=max_new_tokens, do_sample=True, temperature=temperature, top_p=top_p)
    return [tokenizer.decode(out[len(toks[0]):], skip_special_tokens=True) for out in outs]
```
use the "gen" function with the old and new code
# Example:
```py
- import datasets
- from pathlib import Path
from code_editing.models import CodeLlamaEditModel, LlamaChatModel, EditModel, EditCommand, ChatAdaptorEditModel, OctoCoderChatModel, codellama_edit_prompt_diff, apply_rel_diff_trim, OpenAIChatModel, StarCoderCommitEditModel
from code_editing.humanevalpack import batch_prompts_from_example
from code_editing.utils import gunzip_json_write
from typing import List, Callable
from tqdm import tqdm

# NOTE: this is the factory for each model type. to add a new model type, add a new case here
# and implement it in models.py. Also, add a new case in the argument parser below.
- def model_factory(model_type: str, quantize=False, num_gpus=1) -> Callable[[str], EditModel]:
+ def model_factory(
+     model_type: str,
+     quantize=False,
+     num_gpus=1,
+     system_supported=True,
+ ) -> Callable[[str], EditModel]:
    if model_type == "codellama" or model_type == "deepseek":
        return CodeLlamaEditModel
    elif model_type == "starcoder":
        return StarCoderCommitEditModel
    elif model_type == "codellama-diff":
        return (lambda path: CodeLlamaEditModel(path, prompt_format=codellama_edit_prompt_diff, post_process=apply_rel_diff_trim))
    elif model_type == "openai":
        return (lambda path: ChatAdaptorEditModel(OpenAIChatModel(path)))
    elif model_type == "codellama-chat":
-         return (lambda path: ChatAdaptorEditModel(LlamaChatModel(path, quantization=quantize, num_gpus=num_gpus)))
+         return (lambda path: ChatAdaptorEditModel(LlamaChatModel(path, quantization=quantize, num_gpus=num_gpus, system_supported=system_supported)))
    elif model_type == "octocoder":
        return (lambda path: ChatAdaptorEditModel(OctoCoderChatModel(path, quantization=quantize, num_gpus=num_gpus)))
    else:
        raise ValueError(f"Unknown model type: {model_type}")

def complete_problem(example: EditCommand, model: EditModel, batch_size: int, completion_limit: int, **kwargs) -> List[str]:
    batches = batch_prompts_from_example(example, batch_size, completion_limit)
    completions = []
    for batch in batches:
        resps = model.generate(batch, **kwargs)
        for resp in resps:
            completions.append(resp["content"])
    return completions
```
Produced:
```
Add system_supported argument to model_factory
```