import io
import os

import pdfplumber
import pytesseract
import streamlit as st
import torch
import transformers
from dotenv import load_dotenv
from PIL import Image
from streamlit import session_state as ss
from streamlit_pdf_viewer import pdf_viewer
# SwedishBeagle-dare
from transformers import AutoTokenizer
# from transformers import pipeline

from groqSummarizer import GroqSummarizer


class Summarizer:
    """Streamlit app that extracts text from an image or PDF and summarizes it.

    The summarization backend is selected by ``model``; the names handled
    by :meth:`summarize` are ``"groq"`` and ``"SwedishBeagle-dare"``.
    """

    def __init__(self, model="groq"):
        """Store the name of the summarization backend to use."""
        self.model = model

    def run_app(self):
        """Render the upload widget, then the summary, text and PDF preview."""
        # Session slot holding the uploaded PDF so it can be displayed.
        if "pdf_ref" not in ss:
            ss.pdf_ref = None

        uploaded_file = st.file_uploader(
            "Upload an Image or PDF",
            type=["jpg", "jpeg", "png", "pdf"],
            key="file",
        )

        if uploaded_file is not None:
            if uploaded_file.type == "application/pdf":
                with st.spinner("Extracting text from PDF..."):
                    text = self.extract_text_from_pdf(uploaded_file)
                ss.pdf_ref = uploaded_file
            else:
                # Drop any previously stored PDF so the preview below never
                # shows a document unrelated to the current upload.
                ss.pdf_ref = None
                image = Image.open(uploaded_file)
                with st.spinner("Extracting text from image..."):
                    text = self.extract_text_from_image(image)

            if text:
                with st.spinner("Summarizing text..."):
                    # Go through the dispatcher so the backend chosen in
                    # __init__ is honoured.
                    summary = self.summarize(text)
                st.subheader("Summary")
                st.write(summary)
                with st.expander("Extracted Text", expanded=False):
                    st.write(text)

        if ss.pdf_ref:
            st.subheader("Original pdf")
            binary_data = ss.pdf_ref.getvalue()
            pdf_viewer(input=binary_data, width=700)

    def extract_text_from_image(self, image):
        """Return the text pytesseract recognises in ``image``."""
        return pytesseract.image_to_string(image)

    def extract_text_from_pdf(self, pdf):
        """Return the text of every page of ``pdf``, one page per line break.

        Pages for which pdfplumber yields no text (``None`` or an empty
        string, e.g. scanned pages without a text layer) contribute an
        empty string instead of raising ``TypeError``.
        """
        with pdfplumber.open(pdf) as pdf_file:
            # The newline keeps the last word of one page from fusing with
            # the first word of the next.
            return "\n".join(
                page.extract_text() or "" for page in pdf_file.pages
            )

    def shorten_text(self, text, max_tokens):
        """Truncate ``text`` to at most ``max_tokens`` space-separated words.

        "Tokens" here are a rough approximation: the text is split on single
        spaces, which is not how a model tokenizer counts.
        """
        tokens = text.split(" ")
        if len(tokens) > max_tokens:
            text = " ".join(tokens[:max_tokens])
            print("Shortened text to " + str(max_tokens) + " tokens")
        return text

    @staticmethod
    def _status_code_of(exc):
        """Return the HTTP status code attached to ``exc``, or ``None``."""
        status = getattr(getattr(exc, "response", None), "status_code", None)
        if status is None:
            status = getattr(exc, "status_code", None)
        return status

    def summarize_using_groq(self, text):
        """Summarize ``text`` with ``GroqSummarizer``, shrinking it on HTTP 429.

        Returns the summary, or a string starting with ``"Error: "`` when the
        request fails for any other reason or when the text cannot be
        shortened any further.
        """
        # Decrease the number of tokens if the response is 429, i.e. too many
        # tokens in the request.
        #
        # https://context.ai/compare/llama3-70b-instruct-v1/gpt-4
        # ^^ Says the max is 8000 tokens, but after testing it seems to be
        # closer to 2000 when tokens are counted as space-separated words.
        # (This is not an entirely accurate way of counting them.)
        # max_tokens = 8000
        max_tokens = 2000

        while True:
            try:
                gs = GroqSummarizer()
                return gs.summarize(text)
            except Exception as e:
                # Exceptions without an HTTP response (connection errors,
                # configuration errors, ...) must not crash the handler.
                # Stop once the budget is exhausted instead of resending an
                # empty text forever.
                if self._status_code_of(e) != 429 or max_tokens <= 0:
                    return "Error: " + str(e)
                text = self.shorten_text(text, max_tokens)
                max_tokens = int(max_tokens * 0.9)

    def summarize_using_swedishbeagle(self, text):
        """Summarize ``text`` with the FredrikBL/SwedishBeagle-dare model."""
        # https://huggingface.co/FredrikBL/SwedishBeagle-dare
        model_id = "FredrikBL/SwedishBeagle-dare"
        messages = [
            {
                "role": "system",
                "content": "You summarize texts that the users sends",
            },
            {
                "role": "user",
                "content": text,
            },
        ]

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        generator = transformers.pipeline(
            "text-generation",
            model=model_id,
            torch_dtype=torch.float16,
            device_map="auto",
        )

        # NOTE(review): text-generation pipelines presumably return the
        # prompt followed by the completion here; confirm whether the prompt
        # should be stripped before display.
        outputs = generator(
            prompt,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
        )
        return outputs[0]["generated_text"]

    def summarize(self, text):
        """Summarize ``text`` with the backend named by ``self.model``.

        Returns ``None`` when ``self.model`` is not a recognised name.
        """
        if self.model == "groq":
            return self.summarize_using_groq(text)
        if self.model == "SwedishBeagle-dare":
            return self.summarize_using_swedishbeagle(text)
        return None


# Streamlit app
def main():
    """Create the summarizer and run the Streamlit app."""
    # Models:
    # - groq
    # - SwedishBeagle-dare
    summarizer = Summarizer(model="groq")
    summarizer.run_app()


if __name__ == "__main__":
    main()