Pclanglais committed on
Commit
eed441d
·
verified ·
1 Parent(s): 8482186

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -5
app.py CHANGED
@@ -16,20 +16,102 @@ generator = ctranslate2.Generator(model_path, device=device)
16
  tokenizer = transformers.AutoTokenizer.from_pretrained("PleIAs/OCRonos-Vintage")
17
 
18
  # CSS for formatting (unchanged)
 
19
  css = """
20
  <style>
21
- ... (your existing CSS)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  </style>
23
  """
24
 
25
  # Helper functions
26
  def generate_html_diff(old_text, new_text):
27
- # (unchanged)
28
- ...
 
 
 
 
 
 
 
29
 
30
  def preprocess_text(text):
31
- # (unchanged)
32
- ...
 
 
33
 
34
  def split_text(text, max_tokens=400):
35
  encoded = tokenizer.encode(text)
 
16
  tokenizer = transformers.AutoTokenizer.from_pretrained("PleIAs/OCRonos-Vintage")
17
 
18
  # CSS for formatting (unchanged)
19
+ # CSS for formatting
20
  css = """
21
  <style>
22
+ .generation {
23
+ margin-left: 2em;
24
+ margin-right: 2em;
25
+ font-size: 1.2em;
26
+ }
27
+ :target {
28
+ background-color: #CCF3DF;
29
+ }
30
+ .source {
31
+ float: left;
32
+ max-width: 17%;
33
+ margin-left: 2%;
34
+ }
35
+ .tooltip {
36
+ position: relative;
37
+ cursor: pointer;
38
+ font-variant-position: super;
39
+ color: #97999b;
40
+ }
41
+ .tooltip:hover::after {
42
+ content: attr(data-text);
43
+ position: absolute;
44
+ left: 0;
45
+ top: 120%;
46
+ white-space: pre-wrap;
47
+ width: 500px;
48
+ max-width: 500px;
49
+ z-index: 1;
50
+ background-color: #f9f9f9;
51
+ color: #000;
52
+ border: 1px solid #ddd;
53
+ border-radius: 5px;
54
+ padding: 5px;
55
+ display: block;
56
+ box-shadow: 0 4px 8px rgba(0,0,0,0.1);
57
+ }
58
+ .deleted {
59
+ background-color: #ffcccb;
60
+ text-decoration: line-through;
61
+ }
62
+ .inserted {
63
+ background-color: #90EE90;
64
+ }
65
+ .manuscript {
66
+ display: flex;
67
+ margin-bottom: 10px;
68
+ align-items: baseline;
69
+ }
70
+ .annotation {
71
+ width: 15%;
72
+ padding-right: 20px;
73
+ color: grey !important;
74
+ font-style: italic;
75
+ text-align: right;
76
+ }
77
+ .content {
78
+ width: 80%;
79
+ }
80
+ h2 {
81
+ margin: 0;
82
+ font-size: 1.5em;
83
+ }
84
+ .title-content h2 {
85
+ font-weight: bold;
86
+ }
87
+ .bibliography-content {
88
+ color: darkgreen !important;
89
+ margin-top: -5px;
90
+ }
91
+ .paratext-content {
92
+ color: #a4a4a4 !important;
93
+ margin-top: -5px;
94
+ }
95
  </style>
96
  """
97
 
98
  # Helper functions
99
  def generate_html_diff(old_text, new_text):
100
+ d = difflib.Differ()
101
+ diff = list(d.compare(old_text.split(), new_text.split()))
102
+ html_diff = []
103
+ for word in diff:
104
+ if word.startswith(' '):
105
+ html_diff.append(word[2:])
106
+ elif word.startswith('+ '):
107
+ html_diff.append(f'<span style="background-color: #90EE90;">{word[2:]}</span>')
108
+ return ' '.join(html_diff)
109
 
110
  def preprocess_text(text):
111
+ text = re.sub(r'<[^>]+>', '', text)
112
+ text = re.sub(r'\n', ' ', text)
113
+ text = re.sub(r'\s+', ' ', text)
114
+ return text.strip()
115
 
116
  def split_text(text, max_tokens=400):
117
  encoded = tokenizer.encode(text)