Spaces:
Sleeping
Sleeping
| import numpy as np | |
| import easyocr | |
| from transformers import AutoModel, AutoTokenizer | |
| from PIL import Image | |
| import warnings | |
| from transformers import logging | |
| import re | |
# Suppress known-noisy warnings before the model is loaded.
_IGNORED_WARNING_MESSAGES = (
    "The attention mask and the pad token id were not set.",
    "Setting `pad_token_id` to `eos_token_id`",
    "The `seen_tokens` attribute is deprecated",
)
for _ignored_message in _IGNORED_WARNING_MESSAGES:
    warnings.filterwarnings("ignore", message=_ignored_message)

# Only let error-level messages from transformers through.
logging.set_verbosity_error()

# Tokenizer and model used by the English OCR path; both come from the same
# checkpoint, and the model is switched to eval mode right after loading.
_MODEL_ID = 'srimanth-d/GOT_CPU'
tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID, trust_remote_code=True)
model = AutoModel.from_pretrained(
    _MODEL_ID,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    use_safetensors=True,
    pad_token_id=tokenizer.eos_token_id,
).eval()

# EasyOCR reader used by the Hindi OCR path ('hi' language list, GPU disabled).
easyocr_reader = easyocr.Reader(['hi'], gpu=False)
| # Function to perform OCR based on selected language | |
def perform_ocr(image, language):
    """Extract text from an image with the OCR engine for the chosen language.

    Args:
        image: Image object. The Hindi path converts it with ``np.array`` and
            the English path calls ``image.save(path)`` on it -- presumably a
            PIL image; confirm against the caller.
        language: ``"Hindi"`` or ``"English"`` (exact, case-sensitive match).

    Returns:
        For ``"Hindi"``, the EasyOCR results joined with single spaces.
        For ``"English"``, whatever ``model.chat`` returns for the image.
        For any other value, a fixed error-message string.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import block being changed.
    import os
    import tempfile

    if language == "Hindi":
        # detail=0 is passed through to EasyOCR; the result is joined as text.
        fragments = easyocr_reader.readtext(np.array(image), detail=0)
        return ' '.join(fragments)

    if language == "English":
        # model.chat is given a file path, so the image has to be written to
        # disk first. A unique temporary file (instead of a fixed
        # 'temp_image.png' in the working directory) keeps overlapping calls
        # from overwriting each other's input, and the file is always removed.
        fd, image_path = tempfile.mkstemp(suffix='.png')
        os.close(fd)  # only the path is needed; image.save reopens it
        try:
            image.save(image_path)
            return model.chat(tokenizer, image_path, ocr_type='ocr')
        finally:
            os.remove(image_path)

    return "Invalid language selection. Please choose Hindi or English."
def process_keyword(image, language, keyword):
    """Run OCR on the image and highlight occurrences of a keyword.

    Args:
        image: Image passed straight through to ``perform_ocr``.
        language: Language selector passed straight through to ``perform_ocr``.
        keyword: Text to search for, matched literally and case-insensitively.
            A falsy value (``None`` or ``""``) disables highlighting.

    Returns:
        The OCR text unchanged when no keyword is given; the OCR text with
        every match wrapped in a yellow ``<mark>`` tag when the keyword
        occurs; otherwise a message saying the keyword was not found.
    """
    text = perform_ocr(image, language)

    # Nothing to search for: hand back the raw OCR output.
    if not keyword:
        return text

    # Escape the keyword so it is matched literally, and capture it so the
    # replacement keeps the casing found in the text.
    pattern = f'({re.escape(keyword)})'
    marked, hits = re.subn(
        pattern,
        r'<mark style="background-color: yellow">\1</mark>',
        text,
        flags=re.IGNORECASE,
    )

    if hits == 0:
        return f"No keyword '{keyword}' found in the text."
    return marked