OCRonos-TextCorrect

Sleeping

App Files Files Community

Pclanglais committed on Aug 4, 2024

Commit

b6cc9e1

verified ·

1 Parent(s): 1fca231

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -178

app.py CHANGED Viewed

@@ -1,201 +1,66 @@
 import spaces
 import transformers
 import re
-from transformers import AutoConfig, AutoTokenizer, AutoModel, AutoModelForCausalLM, pipeline
-from vllm import LLM, SamplingParams
 import torch
 import gradio as gr
-import json
 import os
-import shutil
-import requests
-import pandas as pd
-import difflib
 from concurrent.futures import ThreadPoolExecutor
 # Define the device
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# OCR Correction Model
-ocr_model_name = "PleIAs/OCRonos-Vintage"
-import torch
-from transformers import GPT2LMHeadModel, GPT2Tokenizer
-# Load pre-trained model and tokenizer
-model_name = "PleIAs/OCRonos-Vintage"
-model = GPT2LMHeadModel.from_pretrained(model_name)
-tokenizer = GPT2Tokenizer.from_pretrained(model_name)
-# Set the device to GPU if available, otherwise use CPU
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model.to(device)
-# CSS for formatting
 css = """
 <style>
-.generation {
-    margin-left: 2em;
-    margin-right: 2em;
-    font-size: 1.2em;
-}
-:target {
-    background-color: #CCF3DF;
-}
-.source {
-    float: left;
-    max-width: 17%;
-    margin-left: 2%;
-}
-.tooltip {
-    position: relative;
-    cursor: pointer;
-    font-variant-position: super;
-    color: #97999b;
-}
-.tooltip:hover::after {
-    content: attr(data-text);
-    position: absolute;
-    left: 0;
-    top: 120%;
-    white-space: pre-wrap;
-    width: 500px;
-    max-width: 500px;
-    z-index: 1;
-    background-color: #f9f9f9;
-    color: #000;
-    border: 1px solid #ddd;
-    border-radius: 5px;
-    padding: 5px;
-    display: block;
-    box-shadow: 0 4px 8px rgba(0,0,0,0.1);
-}
-.deleted {
-    background-color: #ffcccb;
-    text-decoration: line-through;
-}
-.inserted {
-    background-color: #90EE90;
-}
-.manuscript {
-    display: flex;
-    margin-bottom: 10px;
-    align-items: baseline;
-}
-.annotation {
-    width: 15%;
-    padding-right: 20px;
-    color: grey !important;
-    font-style: italic;
-    text-align: right;
-}
-.content {
-    width: 80%;
-}
-h2 {
-    margin: 0;
-    font-size: 1.5em;
-}
-.title-content h2 {
-    font-weight: bold;
-}
-.bibliography-content {
-    color: darkgreen !important;
-    margin-top: -5px;
-}
-.paratext-content {
-    color: #a4a4a4 !important;
-    margin-top: -5px;
-}
 </style>
 """
 # Helper functions
 def generate_html_diff(old_text, new_text):
-    d = difflib.Differ()
-    diff = list(d.compare(old_text.split(), new_text.split()))
-    html_diff = []
-    for word in diff:
-        if word.startswith(' '):
-            html_diff.append(word[2:])
-        elif word.startswith('+ '):
-            html_diff.append(f'<span style="background-color: #90EE90;">{word[2:]}</span>')
-    return ' '.join(html_diff)
 def preprocess_text(text):
-    text = re.sub(r'<[^>]+>', '', text)
-    text = re.sub(r'\n', ' ', text)
-    text = re.sub(r'\s+', ' ', text)
-    return text.strip()
-def split_text(text, max_tokens=500):
-    parts = text.split("\n")
-    chunks = []
-    current_chunk = ""
-    for part in parts:
-        if current_chunk:
-            temp_chunk = current_chunk + "\n" + part
-        else:
-            temp_chunk = part
-        num_tokens = len(tokenizer.tokenize(temp_chunk))
-        if num_tokens <= max_tokens:
-            current_chunk = temp_chunk
-        else:
-            if current_chunk:
-                chunks.append(current_chunk)
-            current_chunk = part
-    if current_chunk:
-        chunks.append(current_chunk)
-    if len(chunks) == 1 and len(tokenizer.tokenize(chunks[0])) > max_tokens:
-        long_text = chunks[0]
-        chunks = []
-        while len(tokenizer.tokenize(long_text)) > max_tokens:
-            split_point = len(long_text) // 2
-            while split_point < len(long_text) and not re.match(r'\s', long_text[split_point]):
-                split_point += 1
-            if split_point >= len(long_text):
-                split_point = len(long_text) - 1
-            chunks.append(long_text[:split_point].strip())
-            long_text = long_text[split_point:].strip()
-        if long_text:
-            chunks.append(long_text)
-    return chunks
-# Function to generate text
-def ocr_correction(prompt, max_new_tokens=600, num_threads=os.cpu_count()):
-    prompt = f"""### Text ###\n{prompt}\n\n\n### Correction ###\n"""
-    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
-    # Set the number of threads for PyTorch
-    torch.set_num_threads(num_threads)
-    # Generate text
-    with ThreadPoolExecutor(max_workers=num_threads) as executor:
-        future = executor.submit(
-            model.generate,
-            input_ids,
-            max_new_tokens=max_new_tokens,
-            pad_token_id=tokenizer.eos_token_id,
-            top_k=50,
-            num_return_sequences=1,
-            do_sample=True,
-            temperature=0.7
-        )
-        output = future.result()
-    # Decode and return the generated text
-    result = tokenizer.decode(output[0], skip_special_tokens=True)
-    print(result)
-    result = result.split("### Correction ###")[1]
-    return result
 # OCR Correction Class
 class OCRCorrector:
@@ -214,7 +79,7 @@ class TextProcessor:
     @spaces.GPU(duration=120)
     def process(self, user_message):
-        #OCR Correction
         corrected_text, html_diff = self.ocr_corrector.correct(user_message)
         # Combine results

 import spaces
 import transformers
 import re
 import torch
 import gradio as gr
 import os
+import ctranslate2
 from concurrent.futures import ThreadPoolExecutor
 # Select the compute device: "cuda" when torch reports CUDA is available, otherwise "cpu"
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# Load CTranslate2 model and tokenizer. NOTE(review): confirm Generator accepts a Hub repo id as its path
+model_path = "PleIAs/OCRonos-Vintage-CT2"
+generator = ctranslate2.Generator(model_path, device=device)
+tokenizer = transformers.AutoTokenizer.from_pretrained("PleIAs/OCRonos-Vintage")
+# CSS for formatting. NOTE(review): the "..." line below is literal text inside the string, not CSS rules
 css = """
 <style>
+... (your existing CSS)
 </style>
 """
 # Helper functions
 def generate_html_diff(old_text, new_text):
     """Return *new_text* as HTML with the words absent from *old_text* highlighted.

     Both texts are split on whitespace and compared word by word with
     ``difflib.Differ``. Unchanged words are emitted as-is, inserted words are
     wrapped in a green-background ``<span>``, and removed words (as well as
     Differ's ``? `` hint lines) are left out. Every word is HTML-escaped
     before being placed in the markup.

     Args:
         old_text: The original text.
         new_text: The corrected text.

     Returns:
         A single string of space-joined words and highlight spans.
     """
     # Imported locally: the file-level import block no longer provides difflib.
     import difflib
     import html

     pieces = []
     for entry in difflib.Differ().compare(old_text.split(), new_text.split()):
         # Differ prefixes each item with a two-character code: "  ", "+ ", "- " or "? ".
         code, word = entry[:2], html.escape(entry[2:], quote=False)
         if code == "  ":
             pieces.append(word)
         elif code == "+ ":
             pieces.append(f'<span style="background-color: #90EE90;">{word}</span>')
     return " ".join(pieces)
 def preprocess_text(text):
     """Strip markup-like tags from *text* and normalise its whitespace.

     Anything matching ``<...>`` is removed, every run of whitespace
     (newlines included) is collapsed to a single space, and leading and
     trailing whitespace is trimmed.

     Args:
         text: The raw input string.

     Returns:
         The cleaned, single-line string.
     """
     without_tags = re.sub(r'<[^>]+>', '', text)
     # \s+ already matches newlines, so one pass collapses all whitespace.
     return re.sub(r'\s+', ' ', without_tags).strip()
+def split_text(text, max_tokens=400):
+    encoded = tokenizer.encode(text)
+    splits = []
+    for i in range(0, len(encoded), max_tokens):
+        split = encoded[i:i+max_tokens]
+        splits.append(tokenizer.decode(split))
+    return splits
+# Function to generate text using CTranslate2
+def ocr_correction(prompt, max_new_tokens=600):
+    splits = split_text(prompt, max_tokens=400)
+    corrected_splits = []
+    for split in splits:
+        full_prompt = f"### Text ###\n{split}\n\n\n### Correction ###\n"
+        encoded = tokenizer.encode(full_prompt)
+        prompt_tokens = tokenizer.convert_ids_to_tokens(encoded)
+        result = generator.generate_batch(
+            [prompt_tokens],
+            max_length=max_new_tokens,
+            sampling_temperature=0.7,
+            sampling_topk=20,
+            include_prompt_in_result=False
+        )[0]
+        corrected_text = tokenizer.decode(result.sequences_ids[0])
+        corrected_splits.append(corrected_text)
+    return " ".join(corrected_splits)
 # OCR Correction Class
 class OCRCorrector:
     @spaces.GPU(duration=120)
     def process(self, user_message):
+        # OCR Correction
         corrected_text, html_diff = self.ocr_corrector.correct(user_message)
         # Combine results