Spaces:

bahakizil
/

Transcript_Creater

Sleeping

App Files Community

bahakizil committed on Jan 30

Commit

23e3a6c

verified ·

1 Parent(s): 9e17941

Update app.py

Browse files

Files changed (1) hide show

app.py +211 -1

app.py CHANGED Viewed

@@ -1,3 +1,213 @@
 import gradio as gr
-gr.load("models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B").launch()

+import os
 import gradio as gr
+import tiktoken
+import docx
+import PyPDF2
+#######################################
+# 1) MODEL YÜKLEME
+#######################################
+# A model hosted on Hugging Face can be loaded with "gr.load".
+# e.g. model_iface = gr.load("models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
+# NOTE: this statement is at module top level, so the load happens at import time.
+# The returned object is invoked elsewhere in this file as model_iface(prompt).
+model_iface = gr.load("models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
+def call_model(prompt: str) -> str:
+    """
+    Model arayüzünü (model_iface) tek satırda çağırarak sonuç döndürür.
+    """
+    result = model_iface(prompt)
+    if isinstance(result, str):
+        return result
+    return str(result)
+#######################################
+# 2) DOSYA OKUMA (PDF/DOCX/TXT)
+#######################################
+def read_file_to_text(file_obj) -> str:
+    """
+    file_obj: gradio'dan gelen dosya (pdf/docx/txt).
+    Returns: metin (str)
+    """
+    if file_obj is None:
+        return ""
+    file_path = file_obj.name
+    # Uzantı kontrolü
+    _, ext = os.path.splitext(file_path)
+    ext = ext.lower()
+    if ext == ".pdf":
+        return read_pdf(file_path)
+    elif ext == ".docx":
+        return read_docx(file_path)
+    elif ext == ".txt":
+        return read_txt(file_path)
+    else:
+        # Bilinmeyen format - basitçe hata ya da boş dönebilir
+        return ""
+def read_pdf(file_path: str) -> str:
+    text = ""
+    with open(file_path, "rb") as f:
+        reader = PyPDF2.PdfReader(f)
+        for page in reader.pages:
+            text += page.extract_text() + "\n"
+    return text
+def read_docx(file_path: str) -> str:
+    doc = docx.Document(file_path)
+    full_text = []
+    for para in doc.paragraphs:
+        full_text.append(para.text)
+    return "\n".join(full_text)
+def read_txt(file_path: str) -> str:
+    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
+        return f.read()
+#######################################
+# 3) TIKTOKEN CHUNK
+#######################################
+def chunk_text_with_tiktoken(text: str, chunk_size=500, model_name="gpt-3.5-turbo"):
+    """
+    text'i 'chunk_size' token uzunluklarında parçalara böler (token bazlı).
+    """
+    encoding = tiktoken.encoding_for_model(model_name)
+    tokens = encoding.encode(text)
+    chunks = []
+    for i in range(0, len(tokens), chunk_size):
+        sub_tokens = tokens[i:i+chunk_size]
+        chunk_str = encoding.decode(sub_tokens)
+        chunks.append(chunk_str)
+    return chunks
+#######################################
+# 4) 11 CHUNK: 4 HEADING + 3 VALIDATION
+#######################################
+def generate_4_headings_3_validation(full_text: str) -> str:
+    """
+    4 heading (her heading 2 chunk: üretici + kontrol = 8) + 3 validation = 11 chunk
+    """
+    final_output = ""
+    # ========== HEADING 1 ==========
+    # 1) üretici
+    h1_prod = call_model(
+        f"[HEADING 1 PRODUCTION]\n"
+        f"Input:\n{full_text}\n"
+        "Task: 'Heading 1: Introductory overview' with 3000-6000 chars."
+    )
+    # 2) kontrol
+    h1_ctrl = call_model(
+        f"[HEADING 1 CONTROL]\n"
+        f"H1 Production:\n{h1_prod}\n"
+        "Check 3000-6000 chars, fix if needed."
+    )
+    final_output += f"<b>HEADING 1 (Corrected)</b><hr>\n{h1_ctrl}\n\n"
+    # ========== HEADING 2 ==========
+    # 3) üretici
+    h2_prod = call_model(
+        f"[HEADING 2 PRODUCTION]\n"
+        f"Input:\n{full_text}\n"
+        "Task: 'Heading 2: Detailed explanation of common risks' with 500-1200 chars."
+    )
+    # 4) kontrol
+    h2_ctrl = call_model(
+        f"[HEADING 2 CONTROL]\n"
+        f"H2 Production:\n{h2_prod}\n"
+        "Check 500-1200 chars, fix if needed."
+    )
+    final_output += f"<b>HEADING 2 (Corrected)</b><hr>\n{h2_ctrl}\n\n"
+    # ========== HEADING 3 ==========
+    # 5) üretici
+    h3_prod = call_model(
+        f"[HEADING 3 PRODUCTION]\n"
+        f"Input:\n{full_text}\n"
+        "Task: 'Heading 3: Practical examples and solutions' with 500-1200 chars."
+    )
+    # 6) kontrol
+    h3_ctrl = call_model(
+        f"[HEADING 3 CONTROL]\n"
+        f"H3 Production:\n{h3_prod}\n"
+        "Check 500-1200 chars, fix if needed."
+    )
+    final_output += f"<b>HEADING 3 (Corrected)</b><hr>\n{h3_ctrl}\n\n"
+    # ========== HEADING 4 ==========
+    # 7) üretici
+    h4_prod = call_model(
+        f"[HEADING 4 PRODUCTION]\n"
+        f"Input:\n{full_text}\n"
+        "Task: 'Heading 4: Summary and next steps for students' with 500-1200 chars."
+    )
+    # 8) kontrol
+    h4_ctrl = call_model(
+        f"[HEADING 4 CONTROL]\n"
+        f"H4 Production:\n{h4_prod}\n"
+        "Check 500-1200 chars, fix if needed."
+    )
+    final_output += f"<b>HEADING 4 (Corrected)</b><hr>\n{h4_ctrl}\n\n"
+    # ========== 3 VALIDATION CHUNK ==========
+    current_text = final_output
+    for i in range(1, 4):
+        validation_out = call_model(
+            f"[VALIDATION #{i}]\n"
+            f"Current text:\n{current_text}\n"
+            "Check headings' constraints. If fixes needed, do them. Otherwise 'No changes needed.'"
+        )
+        current_text = validation_out
+    return current_text
+#######################################
+# 5) GRADIO ARAYÜZ FONKSİYONU
+#######################################
+def main_interface(file, manual_text, chunk_size):
+    """
+    file: Yüklenen dosya (PDF/DOCX/TXT)
+    manual_text: Kullanıcının girdiği ham metin
+    chunk_size: Tiktoken chunk uzunluğu
+    """
+    # 1) Dosya varsa, ondan metin çekelim
+    doc_text = read_file_to_text(file)
+    # 2) Metni oluştur -> file metni + manual_text
+    combined_text = (doc_text + "\n" + manual_text).strip()
+    if not combined_text:
+        return "No input text found."
+    # 3) Tiktoken chunk
+    chunks = chunk_text_with_tiktoken(combined_text, chunk_size=chunk_size)
+    # 4) Tüm chunk'ları birleştirip (veya isterseniz parça parça da işleyebilirsiniz),
+    #    11-chunk mantığına sokalım
+    full_text = "\n".join(chunks)
+    final_output = generate_4_headings_3_validation(full_text)
+    return final_output.replace("\n", "<br>")
+#######################################
+# 6) GRADIO ARAYÜZ TANIMI
+#######################################
+demo = gr.Interface(
+    fn=main_interface,
+    inputs=[
+        gr.File(label="Upload PDF/DOCX/TXT (optional)"),
+        gr.Textbox(lines=5, label="Or Paste Some Text"),
+        gr.Slider(minimum=100, maximum=2000, step=100, value=500, label="Chunk Size (tokens)")
+    ],
+    outputs="html",
+    title="PDF/DOCX + Tiktoken + 4 Heading + 3 Validation (11 Chunk)"
+)
+def run() -> None:
+    """Launch the Gradio app held in the module-level `demo` interface."""
+    demo.launch()
+# Start the app only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    run()