Spaces:
Running
Running
Commit
·
3243581
1
Parent(s):
92873c0
Update code files
Browse files
app.py
CHANGED
@@ -37,26 +37,6 @@ def extract_text(image, query):
|
|
37 |
generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
|
38 |
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
39 |
|
40 |
-
# def post_process_text(text):
|
41 |
-
# # Split the text into lines
|
42 |
-
# lines = text.split('. ')
|
43 |
-
|
44 |
-
# processed_lines = []
|
45 |
-
# for line in lines:
|
46 |
-
# # Separate Hindi and English text
|
47 |
-
# parts = re.split(r'([^\u0900-\u097F\s]+:)', line, 1)
|
48 |
-
# if len(parts) > 1:
|
49 |
-
# processed_lines.append(f"{parts[0]}{parts[1]}\n {parts[2]}")
|
50 |
-
# else:
|
51 |
-
# processed_lines.append(line)
|
52 |
-
|
53 |
-
# # Join the lines with double line breaks
|
54 |
-
# text = '\n\n'.join(processed_lines)
|
55 |
-
|
56 |
-
# # Remove repeated phrases
|
57 |
-
# unique_phrases = list(dict.fromkeys(text.split('\n\n')))
|
58 |
-
# text = '\n\n'.join(unique_phrases)
|
59 |
-
# return text
|
60 |
|
61 |
def ocr(image):
|
62 |
queries = [
|
|
|
37 |
generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
|
38 |
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
def ocr(image):
|
42 |
queries = [
|