intuitive262 committed on
Commit
3243581
·
1 Parent(s): 92873c0

Update code files

Browse files
Files changed (1) hide show
  1. app.py +0 -20
app.py CHANGED
@@ -37,26 +37,6 @@ def extract_text(image, query):
37
  generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
38
  return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
39
 
40
- # def post_process_text(text):
41
- # # Split the text into lines
42
- # lines = text.split('. ')
43
-
44
- # processed_lines = []
45
- # for line in lines:
46
- # # Separate Hindi and English text
47
- # parts = re.split(r'([^\u0900-\u097F\s]+:)', line, 1)
48
- # if len(parts) > 1:
49
- # processed_lines.append(f"{parts[0]}{parts[1]}\n {parts[2]}")
50
- # else:
51
- # processed_lines.append(line)
52
-
53
- # # Join the lines with double line breaks
54
- # text = '\n\n'.join(processed_lines)
55
-
56
- # # Remove repeated phrases
57
- # unique_phrases = list(dict.fromkeys(text.split('\n\n')))
58
- # text = '\n\n'.join(unique_phrases)
59
- # return text
60
 
61
  def ocr(image):
62
  queries = [
 
37
  generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
38
  return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
  def ocr(image):
42
  queries = [