kevinlu1248 committed on
Commit
d596fb5
·
1 Parent(s): f749736

made app.py better

Browse files
Files changed (1) hide show
  1. app.py +155 -4
app.py CHANGED
@@ -1,7 +1,158 @@
 
 
 
 
 
 
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
5
 
6
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- iface.launch()
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ import requests
5
+ from dataclasses import dataclass
6
+
7
  import gradio as gr
8
+ from tree_sitter import Tree, Node
9
+ from tree_sitter_languages import get_parser
10
+
11
def non_whitespace_len(s: str) -> int:
    """Return the number of characters in *s* that are not whitespace.

    This is a stand-in for ``len`` that ignores spaces, tabs, newlines and
    every other character matched by the regex whitespace class.
    """
    # Raw string: in a plain literal the backslash-s escape is invalid and
    # triggers a DeprecationWarning (a SyntaxWarning on Python 3.12+).
    return len(re.sub(r"\s", "", s))
13
+
14
def get_line_number(index: int, source_code: str | bytes) -> int:
    """Return the zero-based number of the line containing offset *index*.

    Args:
        index: Offset into *source_code*, in the same unit as the sequence
            (characters for ``str``, bytes for ``bytes``).
        source_code: The full text to scan; both ``str`` and ``bytes`` work
            because only ``splitlines`` and ``len`` are used.

    Returns:
        The zero-based line number whose span covers *index*. If *index* is
        at or past the end of *source_code*, the total number of lines is
        returned (0 for empty input).
    """
    total_chars = 0
    # Pre-initialised so that empty input returns 0 instead of raising
    # UnboundLocalError on the final return.
    line_count = 0
    for line_count, line in enumerate(source_code.splitlines(keepends=True), start=1):
        total_chars += len(line)
        if total_chars > index:
            # line_count is one-based here; callers want zero-based.
            return line_count - 1
    return line_count
21
+
22
@dataclass
class Span:
    """A half-open ``[start, end)`` slice of a string, byte string or line list."""

    # Inclusive start offset of the slice.
    start: int = 0
    # Exclusive end offset; passing None explicitly collapses the span to
    # an empty one located at ``start`` (see __post_init__).
    end: int = 0

    def __post_init__(self):
        # If end is None, set it to start
        if self.end is None:
            self.end = self.start

    def extract(self, s: str) -> str:
        """Return ``s[start:end]``, treating the span as element offsets into *s*."""
        return s[self.start: self.end]

    def extract_lines(self, s: str) -> str:
        """Return lines ``start`` to ``end`` (exclusive) of *s*, joined by newlines."""
        return "\n".join(s.splitlines()[self.start:self.end])

    def __add__(self, other: Span | int) -> Span:
        """Shift the span by an int, or concatenate it with another span.

        ``Span(1, 2) + 3`` is ``Span(4, 5)``.
        ``Span(1, 2) + Span(2, 4)`` is ``Span(1, 4)``.

        There are no safety checks when concatenating: ``Span(a, b) +
        Span(c, d)`` is always ``Span(a, d)``, with no requirement that
        ``b == c``.
        """
        if isinstance(other, int):
            return Span(self.start + other, self.end + other)
        if isinstance(other, Span):
            return Span(self.start, other.end)
        # Returning NotImplemented (rather than raising NotImplementedError)
        # follows the binary-operator protocol: Python then tries the
        # reflected operation and finally raises a TypeError.
        return NotImplemented

    def __len__(self) -> int:
        """Return ``end - start``, treating an inverted span as empty."""
        # len() raises ValueError on a negative result, so clamp at zero.
        return max(0, self.end - self.start)
55
+
56
def chunk_tree(
    tree: Tree,
    source_code: bytes,
    MAX_CHARS=512 * 3,
    coalesce=50,  # Any chunk less than 50 characters long gets coalesced with the next chunk
) -> list[Span]:
    """Split a parsed syntax tree into line-based chunks of bounded size.

    Args:
        tree: Parsed tree; only ``root_node`` and each node's ``start_byte``,
            ``end_byte`` and ``children`` are read.
        source_code: The bytes that were parsed to produce *tree*.
        MAX_CHARS: Upper bound on the byte length of the sibling groups
            formed in step 1.
        coalesce: A chunk is only emitted once it holds more than this many
            non-whitespace characters and at least one newline; smaller
            chunks are merged with the chunks that follow.

    Returns:
        Non-empty ``Span`` objects whose ``start``/``end`` are zero-based
        line numbers (end exclusive), suitable for ``Span.extract_lines``.
    """

    # 1. Recursively form chunks based on the last post (https://docs.sweep.dev/blogs/chunking-2m-files)
    def chunk_node(node: Node) -> list[Span]:
        chunks: list[Span] = []
        current_chunk: Span = Span(node.start_byte, node.start_byte)
        for child in node.children:
            child_size = child.end_byte - child.start_byte
            if child_size > MAX_CHARS:
                # Too big to fit anywhere: flush what we have and recurse.
                chunks.append(current_chunk)
                current_chunk = Span(child.end_byte, child.end_byte)
                chunks.extend(chunk_node(child))
            elif child_size + len(current_chunk) > MAX_CHARS:
                # Would overflow the current chunk: start a new one.
                chunks.append(current_chunk)
                current_chunk = Span(child.start_byte, child.end_byte)
            else:
                current_chunk += Span(child.start_byte, child.end_byte)
        chunks.append(current_chunk)
        return chunks

    # chunk_node always appends a final chunk, so this list is never empty.
    chunks = chunk_node(tree.root_node)

    # 2. Filling in the gaps: each chunk runs up to the start of the next,
    # and the last chunk runs to the end of the tree. Indexing chunks[-1]
    # (instead of reusing the loop variable) keeps this working when there
    # is only a single chunk.
    for prev, curr in zip(chunks, chunks[1:]):
        prev.end = curr.start
    chunks[-1].end = tree.root_node.end_byte

    # 3. Combining small chunks with bigger ones
    new_chunks: list[Span] = []
    current_chunk = Span(0, 0)
    for chunk in chunks:
        current_chunk += chunk
        # Spans hold byte offsets, so slice the bytes first and decode only
        # that slice; slicing the decoded string would drift on non-ASCII.
        text = source_code[current_chunk.start:current_chunk.end].decode(
            "utf-8", errors="ignore"
        )
        if non_whitespace_len(text) > coalesce and "\n" in text:
            new_chunks.append(current_chunk)
            current_chunk = Span(chunk.end, chunk.end)
    if len(current_chunk) > 0:
        new_chunks.append(current_chunk)

    # 4. Changing line numbers (byte offsets -> zero-based line numbers)
    line_chunks = [
        Span(
            get_line_number(chunk.start, source_code),
            get_line_number(chunk.end, source_code),
        )
        for chunk in new_chunks
    ]

    # 5. Eliminating empty chunks
    return [chunk for chunk in line_chunks if len(chunk) > 0]
112
+
113
# Custom stylesheet handed to gr.Blocks; the .code_container rule (applied
# to the input editor via elem_classes) is currently an empty placeholder.
css = """
.code_container {
}
"""
117
+
118
def chunk_code(
    code: str,
    language: str,
    MAX_CHARS: int,
    coalesce: int
) -> str:
    """Parse *code* and return its chunks joined by a visual separator.

    Args:
        code: Source text to chunk.
        language: Language name passed to ``get_parser``.
        MAX_CHARS: Forwarded to ``chunk_tree`` as its size limit.
        coalesce: Forwarded to ``chunk_tree`` as its merge threshold.

    Returns:
        The chunk texts separated by a line of ``=`` characters. If anything
        fails, a ``"<ExceptionType>: <message>"`` string is returned instead,
        so the UI shows the problem rather than crashing.
    """
    try:
        # Encode once and reuse for both parsing and chunking.
        source = code.encode("utf-8")
        parser = get_parser(language)
        tree = parser.parse(source)
        spans = chunk_tree(tree, source, MAX_CHARS=MAX_CHARS, coalesce=coalesce)
        return "\n\n====================\n\n".join(
            span.extract_lines(code) for span in spans
        )
    except Exception as e:
        # Deliberately broad: this is the UI boundary. Include the exception
        # type because str(e) alone is blank for message-less exceptions.
        return f"{type(e).__name__}: {e}"
132
+
133
# Build the Gradio UI: a code editor on the left, the chunked result on the
# right, and controls for language, maximum chunk size and coalesce threshold.
with gr.Blocks(css=css) as demo:
    gr.Markdown("Start typing below and the chunked output will automatically show up.")

    default_file = "https://raw.githubusercontent.com/sweepai/sweep/b267b613d4c706eaf959fe6789f11e9a856521d1/sweepai/handlers/on_check_suite.py"
    # Bounded timeout and graceful fallback: a slow or unreachable host must
    # not hang or crash app startup; the editor simply starts empty.
    try:
        response = requests.get(default_file, timeout=10)
        response.raise_for_status()
        default_code = response.text
    except requests.RequestException:
        default_code = ""

    with gr.Row():
        language = gr.Dropdown(["python", "javascript", "go", "ruby", "java", "php", "c", "cpp", "rust", "haskell"], label="Language", value="python")
        max_chars = gr.Slider(100, 3000, 1500, label="Max Characters")
        coalesce = gr.Slider(0, 300, 100, label="Coalesce")
    with gr.Row():
        inp = gr.Code(placeholder="Enter the code here", label="Code to Chunk", language=language.value, lines=60, elem_classes="code_container", value=default_code)
        out = gr.Code(label="Chunked Code", language=language.value, lines=60, value=chunk_code(default_code, language.value, max_chars.value, coalesce.value))

    def update_language(code_text, language_name, max_chars_value, coalesce_value):
        """Switch both editors to the new language and re-chunk the code."""
        # Gradio passes the components' current values, so code_text is
        # already the editor's string (it has no .value attribute).
        return (
            gr.update(language=language_name),
            gr.update(
                language=language_name,
                value=chunk_code(code_text, language_name, max_chars_value, coalesce_value),
            ),
        )

    # Recompute the output whenever any input changes.
    language.change(fn=update_language, inputs=[inp, language, max_chars, coalesce], outputs=[inp, out])
    max_chars.change(fn=chunk_code, inputs=[inp, language, max_chars, coalesce], outputs=out)
    coalesce.change(fn=chunk_code, inputs=[inp, language, max_chars, coalesce], outputs=out)
    inp.change(fn=chunk_code, inputs=[inp, language, max_chars, coalesce], outputs=out)

demo.launch()