Lazyhope committed on
Commit
e922192
·
1 Parent(s): 85e656b

Upload pipeline "repo-embedding"

Browse files
config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Lazyhope/unixcoder-nine-advtest",
3
+ "architectures": [
4
+ "RobertaModel"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "custom_pipelines": {
10
+ "repo-embedding": {
11
+ "default": {
12
+ "model": {
13
+ "pt": [
14
+ "Lazyhope/unixcoder-nine-advtest",
15
+ "main"
16
+ ]
17
+ }
18
+ },
19
+ "impl": "pipeline.RepoEmbeddingPipeline",
20
+ "pt": [
21
+ "AutoModel"
22
+ ],
23
+ "tf": [],
24
+ "type": "text"
25
+ }
26
+ },
27
+ "eos_token_id": 2,
28
+ "gradient_checkpointing": false,
29
+ "hidden_act": "gelu",
30
+ "hidden_dropout_prob": 0.1,
31
+ "hidden_size": 768,
32
+ "initializer_range": 0.02,
33
+ "intermediate_size": 3072,
34
+ "layer_norm_eps": 1e-05,
35
+ "max_position_embeddings": 1026,
36
+ "model_type": "roberta",
37
+ "num_attention_heads": 12,
38
+ "num_hidden_layers": 12,
39
+ "output_past": true,
40
+ "pad_token_id": 1,
41
+ "position_embedding_type": "absolute",
42
+ "torch_dtype": "float32",
43
+ "transformers_version": "4.24.0",
44
+ "type_vocab_size": 10,
45
+ "use_cache": true,
46
+ "vocab_size": 51416
47
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pipeline.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import os
3
+ import tarfile
4
+ from ast import AsyncFunctionDef, ClassDef, FunctionDef, Module
5
+ from io import BytesIO
6
+
7
+ import numpy as np
8
+ import requests
9
+ import torch
10
+ from transformers import Pipeline
11
+
12
# Default headers sent with every GitHub REST API request made by this module.
API_HEADERS = {"Accept": "application/vnd.github+json"}

# Authenticate only when a non-empty token is available: an empty GITHUB_TOKEN
# would otherwise produce a malformed "Bearer " header, so it is treated as unset.
_github_token = os.environ.get("GITHUB_TOKEN")
if _github_token:
    API_HEADERS["Authorization"] = f"Bearer {_github_token}"
    print("[+] Using GITHUB_TOKEN for authentication")
else:
    print(
        "[!] Consider setting GITHUB_TOKEN environment variable to avoid hitting rate limits\n"
        "For more info, see: "
        "https://docs.github.com/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token"
    )
22
+
23
+
24
def extract_code_and_docs(text: str):
    """Extract function source code and docstrings from a Python file.

    Docstrings are collected from the module, classes and (async) functions.
    Code strings are produced only for (async) functions, with every docstring
    removed, including those of nested definitions.

    Args:
        text (str): Source code of a Python file

    Returns:
        tuple: A tuple of two sets, the first is the code set, and the second is the docs set,
            each set contains unique code string or docstring, respectively.

    Raises:
        SyntaxError: If ``text`` cannot be parsed as Python source.
    """
    root = ast.parse(text)
    def_nodes = [
        node
        for node in ast.walk(root)
        if isinstance(node, (AsyncFunctionDef, FunctionDef, ClassDef, Module))
    ]

    # First pass: collect and strip every docstring before any code is unparsed,
    # so a function's code never contains the docstrings of definitions nested in it.
    docs_set = set()
    for node in def_nodes:
        docs = ast.get_docstring(node)
        if docs is None:
            continue
        docs_set.add(docs)
        # get_docstring() only returns a value when body[0] is the docstring
        # expression. A body that consisted solely of the docstring is replaced
        # by `pass` so the unparsed definition remains valid Python.
        node.body = node.body[1:] or [ast.Pass()]

    # Second pass: render the (now docstring-free) function definitions.
    code_set = {
        ast.unparse(node)
        for node in def_nodes
        if isinstance(node, (AsyncFunctionDef, FunctionDef))
    }

    return code_set, docs_set
54
+
55
+
56
def get_topics(repo_name, timeout=30):
    """Fetch the topics of a GitHub repository.

    Args:
        repo_name (str): Repository identifier interpolated into
            ``https://api.github.com/repos/{repo_name}`` (expected to be the
            ``owner/name`` path used by the GitHub API).
        timeout (float): Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block indefinitely.

    Returns:
        list: The ``topics`` field of the repository metadata, or an empty list
            when the request fails, the response is not valid JSON, or the
            field is absent.
    """
    api_url = f"https://api.github.com/repos/{repo_name}"
    print(f"[+] Getting topics for {repo_name}")
    try:
        response = requests.get(api_url, headers=API_HEADERS, timeout=timeout)
        response.raise_for_status()
        metadata = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # RequestException covers HTTP errors as well as connection failures and
        # timeouts; ValueError covers a body that cannot be decoded as JSON.
        print(f"[-] Failed to get topics for {repo_name}: {e}")
        return []

    topics = metadata.get("topics", [])
    if topics:
        print(f"[+] Topics found for {repo_name}: {topics}")

    return topics
72
+
73
+
74
def download_and_extract(repos):
    """Download repositories from GitHub and extract their functions and docstrings.

    For each repository the topics are fetched with ``get_topics``, the tarball
    is downloaded from the GitHub API, and every ``.py`` file in it is passed to
    ``extract_code_and_docs``. Failures are reported with ``print`` and skipped:
    a repository that cannot be downloaded or read keeps empty sets, and a file
    that cannot be decoded or parsed is ignored.

    Args:
        repos (Iterable[str]): Repository identifiers interpolated into
            ``https://api.github.com/repos/{repo_name}/tarball``.

    Returns:
        dict: Maps each repository name to a dict with the keys ``"funcs"``
            (set of function source strings), ``"docs"`` (set of docstrings)
            and ``"topics"`` (result of ``get_topics``).
    """
    extracted_info = {}
    for repo_name in repos:
        repo_info = {
            "funcs": set(),
            "docs": set(),
            "topics": get_topics(repo_name),
        }
        extracted_info[repo_name] = repo_info

        download_url = f"https://api.github.com/repos/{repo_name}/tarball"
        print(f"[+] Extracting functions and docstrings from {repo_name}")
        try:
            response = requests.get(download_url, headers=API_HEADERS, timeout=60)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Also covers connection errors and timeouts, not only HTTP status errors.
            print(f"[-] Failed to download {repo_name}: {e}")
            continue

        print(f"[+] Extracting {repo_name} info")
        try:
            # response.content has any transfer/content encoding already undone,
            # unlike the raw socket stream; tarfile detects the archive compression.
            with tarfile.open(fileobj=BytesIO(response.content)) as tar:
                for member in tar.getmembers():
                    if not (member.isfile() and member.name.endswith(".py")):
                        continue
                    try:
                        file_content = tar.extractfile(member).read().decode("utf-8")
                        code_set, docs_set = extract_code_and_docs(file_content)
                    except SyntaxError as e:
                        print(f"[-] SyntaxError in {member.name}: {e}, skipping")
                        continue
                    except (ValueError, RecursionError) as e:
                        # ValueError includes UnicodeDecodeError for files that are
                        # not UTF-8 (and, on some Python versions, sources with
                        # null bytes); RecursionError guards against pathologically
                        # nested sources. One bad file must not abort the run.
                        print(f"[-] Could not decode or parse {member.name}: {e}, skipping")
                        continue
                    repo_info["funcs"].update(code_set)
                    repo_info["docs"].update(docs_set)
        except tarfile.TarError as e:
            print(f"[-] Failed to read archive of {repo_name}: {e}")

    return extracted_info
107
+
108
+
109
class RepoEmbeddingPipeline(Pipeline):
    """Pipeline that embeds the functions and docstrings of GitHub repositories.

    ``preprocess`` downloads the repositories and extracts their functions and
    docstrings, ``_forward`` encodes each of them with the pipeline's model and
    tokenizer, and ``postprocess`` returns the result unchanged.
    """

    def _sanitize_parameters(self, **kwargs):
        """Split call-time kwargs into (preprocess, forward, postprocess) kwargs.

        Only ``max_length`` is recognised; it is forwarded to ``_forward``.
        """
        _forward_kwargs = {}
        if "max_length" in kwargs:
            _forward_kwargs["max_length"] = kwargs["max_length"]

        return {}, _forward_kwargs, {}

    def preprocess(self, inputs):
        """Download the given repositories and extract their code and docstrings.

        Parameters:

        * `inputs`- A single repository name or an iterable of repository names.

        Returns the dict produced by `download_and_extract`.
        """
        if isinstance(inputs, str):
            inputs = (inputs,)

        return download_and_extract(inputs)

    def encode(self, text, max_length):
        """
        Generates an embedding for an input string.

        Parameters:

        * `text`- The input string to be embedded.
        * `max_length`- The maximum total source sequence length after tokenization.
          Must be smaller than 1024.

        Returns a tensor with one row: the mean over the sequence dimension of
        the model's first output.

        Raises `ValueError` if `max_length` is 1024 or larger.
        """
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if max_length >= 1024:
            raise ValueError(f"max_length must be smaller than 1024, got {max_length}")

        tokenizer = self.tokenizer

        # Four positions are reserved for the special tokens added below. The
        # budget is clamped at zero so a tiny max_length cannot become a
        # negative slice bound.
        text_budget = max(max_length - 4, 0)
        tokens = (
            [tokenizer.cls_token, "<encoder-only>", tokenizer.sep_token]
            + tokenizer.tokenize(text)[:text_budget]
            + [tokenizer.sep_token]
        )
        tokens_id = tokenizer.convert_tokens_to_ids(tokens)
        # Build the input on the pipeline's device so encoding also works when
        # the model is not on the CPU (assumes the PyTorch framework, where
        # `self.device` is a torch device - confirm for other frameworks).
        source_ids = torch.tensor([tokens_id], device=self.device)

        # First model output; presumably the per-token hidden states - verify
        # against the model's documentation.
        token_embeddings = self.model(source_ids)[0]
        sentence_embeddings = token_embeddings.mean(dim=1)

        return sentence_embeddings

    def _embed_texts(self, texts, max_length):
        """Encode each text; return ([text, embedding] pairs, mean embedding or None)."""
        embeddings = [
            [text, self.encode(text, max_length).squeeze(0).tolist()] for text in texts
        ]
        mean_embedding = (
            np.mean([vector for _, vector in embeddings], axis=0).tolist()
            if embeddings
            else None
        )
        return embeddings, mean_embedding

    def _forward(self, extracted_infos, max_length=512):
        """Generate embeddings for every extracted function and docstring.

        Parameters:

        * `extracted_infos`- Dict as returned by `preprocess`: repository name
          mapped to a dict with the keys `funcs`, `docs` and `topics`.
        * `max_length`- Passed to `encode` for every text.

        Returns a dict mapping each repository name to a dict with the keys
        `topics`, `code_embeddings`, `mean_code_embeddings`, `doc_embeddings`
        and `mean_doc_embeddings`. The mean entries are `None` when there is
        nothing to embed.
        """
        repo_dataset = {}
        for repo_name, repo_info in extracted_infos.items():
            print(f"[+] Generating embeddings for {repo_name}")
            code_embeddings, mean_code_embeddings = self._embed_texts(
                repo_info["funcs"], max_length
            )
            doc_embeddings, mean_doc_embeddings = self._embed_texts(
                repo_info["docs"], max_length
            )
            repo_dataset[repo_name] = {
                "topics": repo_info.get("topics"),
                "code_embeddings": code_embeddings,
                "mean_code_embeddings": mean_code_embeddings,
                "doc_embeddings": doc_embeddings,
                "mean_doc_embeddings": mean_doc_embeddings,
            }

        return repo_dataset

    def postprocess(self, repo_dataset):
        """Return the embeddings produced by `_forward` unchanged."""
        return repo_dataset
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec359ccef197b85f9cea791cc2af5728aafd1adcd06713e2a3fb8290c43df3e3
3
+ size 503791405
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": {
4
+ "__type": "AddedToken",
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false
10
+ },
11
+ "cls_token": {
12
+ "__type": "AddedToken",
13
+ "content": "<s>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false
18
+ },
19
+ "eos_token": {
20
+ "__type": "AddedToken",
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ "errors": "replace",
28
+ "mask_token": {
29
+ "__type": "AddedToken",
30
+ "content": "<mask>",
31
+ "lstrip": true,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false
35
+ },
36
+ "name_or_path": "Lazyhope/unixcoder-nine-advtest",
37
+ "pad_token": {
38
+ "__type": "AddedToken",
39
+ "content": "<pad>",
40
+ "lstrip": false,
41
+ "normalized": true,
42
+ "rstrip": false,
43
+ "single_word": false
44
+ },
45
+ "sep_token": {
46
+ "__type": "AddedToken",
47
+ "content": "</s>",
48
+ "lstrip": false,
49
+ "normalized": true,
50
+ "rstrip": false,
51
+ "single_word": false
52
+ },
53
+ "special_tokens_map_file": null,
54
+ "tokenizer_class": "RobertaTokenizer",
55
+ "trim_offsets": true,
56
+ "unk_token": {
57
+ "__type": "AddedToken",
58
+ "content": "<unk>",
59
+ "lstrip": false,
60
+ "normalized": true,
61
+ "rstrip": false,
62
+ "single_word": false
63
+ }
64
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff