Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,3 +1,213 @@
|
|
|
|
1 |
import gradio as gr
|
|
|
|
|
|
|
2 |
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
import gradio as gr
|
3 |
+
import tiktoken
|
4 |
+
import docx
|
5 |
+
import PyPDF2
|
6 |
|
7 |
+
#######################################
|
8 |
+
# 1) MODEL LOADING
|
9 |
+
#######################################
|
10 |
+
# A model hosted on Hugging Face can be loaded and called through "gr.load".
|
11 |
+
# E.g.: model_iface = gr.load("models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
|
12 |
+
|
13 |
+
# Module-level model handle: created once at import time via gr.load and
# invoked by call_model() for every prompt.
model_iface = gr.load("models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
|
14 |
+
|
15 |
+
def call_model(prompt: str) -> str:
    """Send *prompt* to the module-level ``model_iface`` and return the reply as text.

    Args:
        prompt: The complete prompt, passed to ``model_iface`` as its only
            argument.

    Returns:
        The model's reply. A reply that is not already a ``str`` is
        converted with ``str()``.
    """
    reply = model_iface(prompt)
    # Real strings pass through untouched; anything else is stringified.
    return reply if isinstance(reply, str) else str(reply)
|
23 |
+
|
24 |
+
#######################################
|
25 |
+
# 2) FILE READING (PDF/DOCX/TXT)
|
26 |
+
#######################################
|
27 |
+
def read_file_to_text(file_obj) -> str:
    """Extract plain text from an uploaded PDF, DOCX, or TXT file.

    Args:
        file_obj: The value delivered by the Gradio file input. Either a
            filesystem path (``str`` / ``os.PathLike``) or an object whose
            ``name`` attribute holds the path. ``None`` means nothing was
            uploaded.

    Returns:
        The extracted text, or ``""`` when nothing was uploaded or the file
        extension is not ``.pdf``, ``.docx`` or ``.txt``.
    """
    if file_obj is None:
        return ""

    # Accept a bare path as well as a file wrapper: reading ``.name`` on a
    # plain string would raise AttributeError.
    if isinstance(file_obj, (str, os.PathLike)):
        file_path = os.fspath(file_obj)
    else:
        file_path = file_obj.name

    # Dispatch on the lower-cased extension so ".PDF" and ".pdf" both match.
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        return read_pdf(file_path)
    if ext == ".docx":
        return read_docx(file_path)
    if ext == ".txt":
        return read_txt(file_path)

    # Unknown format: there is nothing we know how to extract.
    return ""
|
49 |
+
|
50 |
+
def read_pdf(file_path: str) -> str:
    """Return the text of every page of the PDF at *file_path*.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The extracted text of all pages in order, each page followed by a
        newline. A page that yields no text contributes only the newline.
    """
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        # Build the result with a single join instead of repeated ``+=``.
        # The ``or ""`` is defensive: if extraction ever yields a falsy
        # value for a page, concatenating it with "\n" must not fail.
        # The join runs inside the ``with`` so pages are read while the
        # file is still open.
        return "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )
|
57 |
+
|
58 |
+
def read_docx(file_path: str) -> str:
    """Return the paragraph texts of the DOCX at *file_path*, joined by newlines.

    Args:
        file_path: Path of the ``.docx`` document to open.

    Returns:
        The ``text`` of every entry in the document's ``paragraphs``, in
        order, separated by ``"\\n"``.
    """
    document = docx.Document(file_path)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)
|
64 |
+
|
65 |
+
def read_txt(file_path: str) -> str:
    """Return the contents of the text file at *file_path*, decoded as UTF-8.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file contents. A leading UTF-8 byte-order mark is
        removed, and bytes that are not valid UTF-8 are silently dropped.
    """
    # "utf-8-sig" decodes exactly like "utf-8" but also strips a leading BOM,
    # which would otherwise show up as "\ufeff" at the start of the text.
    # errors="ignore" keeps the best-effort behaviour for malformed bytes.
    with open(file_path, "r", encoding="utf-8-sig", errors="ignore") as f:
        return f.read()
|
68 |
+
|
69 |
+
#######################################
|
70 |
+
# 3) TIKTOKEN CHUNK
|
71 |
+
#######################################
|
72 |
+
def chunk_text_with_tiktoken(text: str, chunk_size=500, model_name="gpt-3.5-turbo"):
    """Split *text* into consecutive pieces of at most *chunk_size* tokens.

    Args:
        text: The text to split.
        chunk_size: Maximum number of tokens per chunk. Float values (for
            example from a UI slider) are truncated to ``int``.
        model_name: Model whose tiktoken encoding is used. Names tiktoken
            does not recognise fall back to the ``cl100k_base`` encoding.

    Returns:
        A list of decoded chunk strings in original order; ``[]`` when
        *text* is empty.

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
    """
    # range() below needs a positive int step; a float would raise TypeError
    # and a negative step would silently produce no chunks at all.
    chunk_size = int(chunk_size)
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    if not text:
        return []

    try:
        encoding = tiktoken.encoding_for_model(model_name)
    except KeyError:
        # Unknown model name: use a general-purpose encoding instead of failing.
        encoding = tiktoken.get_encoding("cl100k_base")

    # disallowed_special=() makes special-token look-alikes in user documents
    # (e.g. "<|endoftext|>") encode as ordinary text instead of raising.
    tokens = encoding.encode(text, disallowed_special=())

    # NOTE(review): a chunk boundary can fall inside a multi-byte character,
    # in which case decode() may emit a replacement character there - confirm
    # whether callers care.
    return [
        encoding.decode(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]
|
84 |
+
|
85 |
+
#######################################
|
86 |
+
# 4) 11 CHUNK: 4 HEADING + 3 VALIDATION
|
87 |
+
#######################################
|
88 |
+
def _is_no_change_reply(reply: str) -> bool:
    """Return True when *reply* is empty or only the 'No changes needed.' verdict."""
    verdict = reply.strip(" \t\r\n'\".").casefold()
    return verdict in ("", "no changes needed")


def generate_4_headings_3_validation(full_text: str) -> str:
    """Produce a four-heading document from *full_text* and validate it three times.

    The function makes 11 model calls through ``call_model``: for each of
    the 4 headings one "production" call and one "control" call (8 in
    total), followed by 3 validation calls over the assembled document.

    Args:
        full_text: Source text inserted into every heading production prompt.

    Returns:
        The assembled document after the validation passes. A validation
        reply replaces the document only when it carries actual content; a
        bare "No changes needed." (or empty) reply leaves it untouched.
    """
    # (heading number, heading title, required character range)
    heading_specs = (
        (1, "Introductory overview", "3000-6000"),
        (2, "Detailed explanation of common risks", "500-1200"),
        (3, "Practical examples and solutions", "500-1200"),
        (4, "Summary and next steps for students", "500-1200"),
    )

    sections = []
    for number, title, char_range in heading_specs:
        # Production call: draft the section from the source text.
        produced = call_model(
            f"[HEADING {number} PRODUCTION]\n"
            f"Input:\n{full_text}\n"
            f"Task: 'Heading {number}: {title}' with {char_range} chars."
        )
        # Control call: ask the model to check and correct the draft length.
        checked = call_model(
            f"[HEADING {number} CONTROL]\n"
            f"H{number} Production:\n{produced}\n"
            f"Check {char_range} chars, fix if needed."
        )
        sections.append(f"<b>HEADING {number} (Corrected)</b><hr>\n{checked}\n\n")

    current_text = "".join(sections)

    # Three validation passes, each working on the previous pass's result.
    for i in range(1, 4):
        validation_out = call_model(
            f"[VALIDATION #{i}]\n"
            f"Current text:\n{current_text}\n"
            "Check headings' constraints. If fixes needed, do them. Otherwise 'No changes needed.'"
        )
        # The prompt tells the model to answer 'No changes needed.' when the
        # text is fine; adopting that reply would discard the whole document.
        if _is_no_change_reply(validation_out):
            continue
        current_text = validation_out

    return current_text
|
166 |
+
|
167 |
+
#######################################
|
168 |
+
# 5) GRADIO INTERFACE FUNCTION
|
169 |
+
#######################################
|
170 |
+
def main_interface(file, manual_text, chunk_size):
    """Gradio callback: turn an uploaded file and/or pasted text into the final HTML.

    Args:
        file: Uploaded file (PDF/DOCX/TXT), or ``None`` when nothing was
            uploaded.
        manual_text: Raw text typed by the user; ``None`` is treated as empty.
        chunk_size: Token length of each tiktoken chunk; coerced to ``int``.

    Returns:
        The generated document with every newline replaced by ``<br>``, or
        the message ``"No input text found."`` when neither input has text.
    """
    # 1) Pull text out of the upload, if there is one.
    doc_text = read_file_to_text(file) if file is not None else ""

    # 2) Combine file text and typed text. ``manual_text`` may be None, which
    #    would make the string concatenation raise TypeError.
    typed_text = manual_text or ""
    combined_text = (doc_text + "\n" + typed_text).strip()
    if not combined_text:
        return "No input text found."

    # 3) Token-based chunking. The slider value may arrive as a float, so it
    #    is converted to int before being used as a chunk length.
    chunks = chunk_text_with_tiktoken(combined_text, chunk_size=int(chunk_size))

    # 4) Re-join all chunks (a newline is inserted at every chunk boundary)
    #    and run the 11-call heading/validation pipeline on the whole text.
    full_text = "\n".join(chunks)
    final_output = generate_4_headings_3_validation(full_text)

    # The result is rendered as HTML, so newlines become explicit line breaks.
    return final_output.replace("\n", "<br>")
|
194 |
+
|
195 |
+
#######################################
|
196 |
+
# 6) GRADIO INTERFACE DEFINITION
|
197 |
+
#######################################
|
198 |
+
# The web UI: three inputs feeding main_interface, one HTML output.
demo = gr.Interface(
    title="PDF/DOCX + Tiktoken + 4 Heading + 3 Validation (11 Chunk)",
    fn=main_interface,
    # Input order must match main_interface(file, manual_text, chunk_size).
    inputs=[
        gr.File(label="Upload PDF/DOCX/TXT (optional)"),
        gr.Textbox(label="Or Paste Some Text", lines=5),
        gr.Slider(
            label="Chunk Size (tokens)",
            minimum=100,
            maximum=2000,
            value=500,
            step=100,
        ),
    ],
    outputs="html",
)
|
208 |
+
|
209 |
+
def run():
    """Launch the module-level ``demo`` Gradio interface."""
    app = demo
    app.launch()
|
211 |
+
|
212 |
+
# Start the app only when this file is executed as a script, not on import.
if "__main__" == __name__:
    run()
|