| import pdfplumber | |
| from transformers import AutoModelForTokenClassification, AutoTokenizer | |
| import torch | |
| import gradio as gr | |
def load_model_and_tokenizer(model_name="dbmdz/bert-large-cased-finetuned-conll03-english"):
    """Load the pretrained tokenizer and token-classification model.

    Args:
        model_name: Identifier handed unchanged to both ``from_pretrained``
            calls.

    Returns:
        A ``(tokenizer, model)`` tuple, in that order.
    """
    # Tuple elements are evaluated left to right, so the tokenizer is still
    # loaded before the model.
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForTokenClassification.from_pretrained(model_name),
    )
def named_entity_recognition(text, tokenizer, model):
    """Label every token of *text* and return the ones tagged as entities.

    The text is encoded with truncation enabled and ``max_length=512``, so
    tokens beyond that limit are not classified.

    Args:
        text: The string to analyse.
        tokenizer: Callable tokenizer that also provides
            ``convert_ids_to_tokens``.
        model: Token-classification model whose output exposes ``logits`` and
            whose ``config.id2label`` maps class ids to label strings.

    Returns:
        A list of ``(token, label)`` tuples, in token order, for every token
        whose predicted label is not ``'O'``.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    pieces = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0])

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**encoded).logits

    # Highest-scoring class per token; [0] selects the single input sequence.
    label_ids = torch.argmax(logits, dim=-1)[0]
    id2label = model.config.id2label

    tagged = []
    for piece, label_id in zip(pieces, label_ids):
        label = id2label[label_id.item()]
        if label != 'O':
            tagged.append((piece, label))
    return tagged
def extract_text_from_pdf(pdf):
    """Return the text of every page of *pdf* joined by single spaces.

    Args:
        pdf: Whatever ``pdfplumber.open`` accepts for the document to read.

    Returns:
        The page texts separated by one space each, with leading and
        trailing whitespace stripped. Pages whose ``extract_text()`` result
        is falsy are skipped, so the result is ``""`` when no page yields
        text.
    """
    # Extract while the document is still open.
    with pdfplumber.open(pdf) as document:
        page_texts = [page.extract_text() for page in document.pages]

    return " ".join(chunk for chunk in page_texts if chunk).strip()
def process_pdf(pdf):
    """Extract the text of an uploaded PDF and run entity recognition on it.

    Used as the Gradio callback. It relies on the module-level ``tokenizer``
    and ``model`` objects.

    Args:
        pdf: The uploaded file as delivered by the ``gr.File`` input, or
            ``None`` when the form is submitted without a file.

    Returns:
        The list of ``(token, label)`` tuples produced by
        ``named_entity_recognition``, or a short explanatory message when no
        file was uploaded, no text could be extracted, or no entities were
        found.
    """
    # Submitting without a file passes None; answer with a message instead of
    # letting the PDF reader fail on it.
    if pdf is None:
        return "No PDF file was uploaded."

    text = extract_text_from_pdf(pdf)
    if not text:
        return "No text found in the PDF."

    entities = named_entity_recognition(text, tokenizer, model)
    return entities if entities else "No named entities found."
# Load the model once at start-up; process_pdf reads these module-level names.
tokenizer, model = load_model_and_tokenizer()

# Build the single-input web UI around process_pdf, then start serving it.
demo = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs="text",
    title="Named Entity Recognition from PDF",
    description="Upload a PDF file to extract text and perform Named Entity Recognition using a pre-trained BERT model.",
)
demo.launch()