bahakizil commited on
Commit
23e3a6c
·
verified ·
1 Parent(s): 9e17941

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +211 -1
app.py CHANGED
@@ -1,3 +1,213 @@
 
1
  import gradio as gr
 
 
 
2
 
3
- gr.load("models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B").launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
  import gradio as gr
3
+ import tiktoken
4
+ import docx
5
+ import PyPDF2
6
 
7
#######################################
# 1) MODEL LOADING
#######################################
# A model hosted on Hugging Face can be wrapped as a callable interface
# with "gr.load". E.g.:
#   model_iface = gr.load("models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")

model_iface = gr.load("models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
14
+
15
def call_model(prompt: str) -> str:
    """Invoke the loaded model interface once and return its output as text.

    Non-string results (some gradio-loaded models return lists/dicts) are
    coerced with str() so callers always receive a string.
    """
    response = model_iface(prompt)
    return response if isinstance(response, str) else str(response)
23
+
24
#######################################
# 2) FILE READING (PDF/DOCX/TXT)
#######################################
def read_file_to_text(file_obj) -> str:
    """
    Extract plain text from an uploaded file.

    Args:
        file_obj: The upload coming from Gradio — either an object exposing a
            ``.name`` attribute (classic ``gr.File`` behavior) or a plain path
            string (newer gradio versions pass the temp-file path directly).
            ``None`` when the user uploaded nothing.

    Returns:
        str: The extracted text, or "" when there is no file or the
        extension is unsupported.
    """
    if file_obj is None:
        return ""

    # Accept both a bare path string and a file-like object with .name,
    # so the function works across gradio versions.
    file_path = file_obj if isinstance(file_obj, str) else file_obj.name

    # Dispatch on the lower-cased file extension.
    _, ext = os.path.splitext(file_path)
    ext = ext.lower()

    if ext == ".pdf":
        return read_pdf(file_path)
    if ext == ".docx":
        return read_docx(file_path)
    if ext == ".txt":
        return read_txt(file_path)
    # Unknown format: return empty text rather than raising.
    return ""
49
+
50
def read_pdf(file_path: str) -> str:
    """Extract the text of every page of a PDF, one page per line."""
    pages = []
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            # extract_text() may return None (e.g. image-only pages);
            # guard so the concatenation never raises TypeError.
            pages.append((page.extract_text() or "") + "\n")
    return "".join(pages)
57
+
58
def read_docx(file_path: str) -> str:
    """Extract the text of a .docx file, one line per paragraph."""
    document = docx.Document(file_path)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)
64
+
65
def read_txt(file_path: str) -> str:
    """Read a plain-text file as UTF-8, silently dropping undecodable bytes."""
    with open(file_path, mode="r", encoding="utf-8", errors="ignore") as handle:
        content = handle.read()
    return content
68
+
69
#######################################
# 3) TIKTOKEN CHUNKING
#######################################
def chunk_text_with_tiktoken(text: str, chunk_size=500, model_name="gpt-3.5-turbo"):
    """
    Split *text* into pieces of at most *chunk_size* tokens (token-based).

    Args:
        text: Input text to split.
        chunk_size: Maximum tokens per chunk; coerced to an int >= 1 so a
            float (e.g. from a UI slider) or a non-positive value cannot
            break range().
        model_name: Model whose tokenizer should be used.

    Returns:
        list[str]: The decoded chunk strings, in order.
    """
    try:
        encoding = tiktoken.encoding_for_model(model_name)
    except KeyError:
        # Unknown model name: fall back to a widely used default encoding
        # instead of crashing the whole request.
        encoding = tiktoken.get_encoding("cl100k_base")

    chunk_size = max(1, int(chunk_size))

    tokens = encoding.encode(text)
    chunks = []
    for start in range(0, len(tokens), chunk_size):
        chunks.append(encoding.decode(tokens[start:start + chunk_size]))
    return chunks
84
+
85
#######################################
# 4) 11 CHUNKS: 4 HEADINGS + 3 VALIDATION
#######################################
def generate_4_headings_3_validation(full_text: str) -> str:
    """
    Run the 11-chunk pipeline over *full_text*:
    4 headings x (production call + control call) = 8 model calls,
    followed by 3 validation passes over the combined text = 11 calls total.

    Args:
        full_text: The source text the headings are generated from.

    Returns:
        str: The text after the final validation pass.
    """
    # (title, character-range) per heading; heading 1 is the long overview.
    heading_specs = [
        ("Heading 1: Introductory overview", "3000-6000"),
        ("Heading 2: Detailed explanation of common risks", "500-1200"),
        ("Heading 3: Practical examples and solutions", "500-1200"),
        ("Heading 4: Summary and next steps for students", "500-1200"),
    ]

    final_output = ""
    for idx, (title, char_range) in enumerate(heading_specs, start=1):
        # Production call: draft the heading content.
        produced = call_model(
            f"[HEADING {idx} PRODUCTION]\n"
            f"Input:\n{full_text}\n"
            f"Task: '{title}' with {char_range} chars."
        )
        # Control call: verify/repair the draft's length constraint.
        controlled = call_model(
            f"[HEADING {idx} CONTROL]\n"
            f"H{idx} Production:\n{produced}\n"
            f"Check {char_range} chars, fix if needed."
        )
        final_output += f"<b>HEADING {idx} (Corrected)</b><hr>\n{controlled}\n\n"

    # ========== 3 VALIDATION CHUNKS ==========
    # Each pass feeds the previous pass's output back into the model.
    current_text = final_output
    for i in range(1, 4):
        current_text = call_model(
            f"[VALIDATION #{i}]\n"
            f"Current text:\n{current_text}\n"
            "Check headings' constraints. If fixes needed, do them. Otherwise 'No changes needed.'"
        )

    return current_text
166
+
167
#######################################
# 5) GRADIO INTERFACE FUNCTION
#######################################
def main_interface(file, manual_text, chunk_size):
    """
    Glue function wired to the Gradio UI.

    Args:
        file: Uploaded file (PDF/DOCX/TXT) or None.
        manual_text: Raw text typed by the user (may be empty or None).
        chunk_size: Tiktoken chunk length from the slider (may arrive
            as a float depending on the gradio version).

    Returns:
        str: HTML-formatted pipeline output, or a notice when no input
        was provided.
    """
    # 1) Pull text out of the uploaded file, if any.
    doc_text = read_file_to_text(file)

    # 2) Combine file text with the manually entered text.
    #    Guard against manual_text being None (cleared textbox).
    combined_text = (doc_text + "\n" + (manual_text or "")).strip()
    if not combined_text:
        return "No input text found."

    # 3) Token-based chunking; cast because sliders can deliver floats
    #    and range() downstream requires an int step.
    chunks = chunk_text_with_tiktoken(combined_text, chunk_size=int(chunk_size))

    # 4) Re-join the chunks and run the 11-chunk heading/validation pipeline.
    full_text = "\n".join(chunks)
    final_output = generate_4_headings_3_validation(full_text)

    # The output component is "html": convert newlines for display.
    return final_output.replace("\n", "<br>")
194
+
195
#######################################
# 6) GRADIO INTERFACE DEFINITION
#######################################
# Three inputs: optional file upload, a free-text box, and the token
# chunk size; the pipeline's HTML result is rendered as the output.
demo = gr.Interface(
    fn=main_interface,
    inputs=[
        gr.File(label="Upload PDF/DOCX/TXT (optional)"),
        gr.Textbox(lines=5, label="Or Paste Some Text"),
        gr.Slider(minimum=100, maximum=2000, step=100, value=500, label="Chunk Size (tokens)")
    ],
    outputs="html",
    title="PDF/DOCX + Tiktoken + 4 Heading + 3 Validation (11 Chunk)"
)
208
+
209
def run():
    """Launch the Gradio demo server."""
    demo.launch()

if __name__ == "__main__":
    run()