# Summarize / summarize.py
# Source: Hugging Face Space by IsakNordgren — commit 79b4e95 ("working locally"), 4.01 kB
import streamlit as st
import pdfplumber
from PIL import Image
import pytesseract
#from transformers import pipeline
import io
import os
from dotenv import load_dotenv
# groq
from groq import Groq
# SwedishBeagle-dare
from transformers import AutoTokenizer
import transformers
import torch
class Summarizer:
    """Streamlit app that extracts text from an uploaded image or PDF and summarizes it.

    Two summarization backends are supported, selected by the ``model``
    constructor argument: "groq" (remote API) and "SwedishBeagle-dare"
    (local transformers pipeline).
    """

    def __init__(self, model="groq"):
        # Backend name that `summarize` dispatches on.
        self.model = model
        # Groq client is created eagerly even for the local backend,
        # matching the original behavior.
        self.client = self.load_groq()

    def run_app(self):
        """Render the uploader, extract text from the upload, and display a summary."""
        uploaded_file = st.file_uploader("Upload an Image or PDF", type=["jpg", "jpeg", "png", "pdf"])

        if uploaded_file is not None:
            if uploaded_file.type == "application/pdf":
                with st.spinner("Extracting text from PDF..."):
                    text = self.extract_text_from_pdf(uploaded_file)
            else:
                image = Image.open(uploaded_file)
                with st.spinner("Extracting text from image..."):
                    text = self.extract_text_from_image(image)

            if text:
                with st.spinner("Summarizing text..."):
                    # FIX: previously this always called summarize_using_groq(),
                    # silently ignoring the model selected in __init__.
                    # Dispatch through summarize() so the configured backend is used.
                    summary = self.summarize(text)
                st.subheader("Summary")
                st.write(summary)
                st.subheader("Extracted Text")
                st.write(text)

    def extract_text_from_image(self, image):
        """OCR a PIL image with pytesseract and return the recognized text."""
        text = pytesseract.image_to_string(image)
        return text

    def extract_text_from_pdf(self, pdf):
        """Concatenate the extractable text of every page in the PDF.

        Pages with no extractable text (e.g. scanned/image-only pages) are
        skipped rather than crashing.
        """
        text = ""
        with pdfplumber.open(pdf) as pdf_file:
            for page in pdf_file.pages:
                # FIX: page.extract_text() returns None for image-only pages;
                # the original `text += page.extract_text()` raised TypeError.
                page_text = page.extract_text()
                if page_text:
                    text += page_text
        return text

    def load_groq(self):
        """Create a Groq client using GROQ_API_KEY from the environment / .env file."""
        load_dotenv()
        GROQ_API_KEY = os.getenv("GROQ_API_KEY")
        client = Groq(
            api_key=GROQ_API_KEY
        )
        return client

    def summarize_using_groq(self, text):
        """Summarize *text* via the Groq chat-completions API and return the summary."""
        chat_completion = self.client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You summarize texts that the users sends"
                },
                {
                    "role": "user",
                    "content": text,
                }
            ],
            model="mixtral-8x7b-32768",
        )
        return chat_completion.choices[0].message.content

    def summarize_using_swedishbeagle(self, text):
        """Summarize *text* locally with the SwedishBeagle-dare model.

        See https://huggingface.co/FredrikBL/SwedishBeagle-dare
        NOTE(review): the tokenizer and pipeline are rebuilt on every call,
        which is slow; caching them on the instance would help — left as-is
        to keep this change minimal.
        """
        model = "FredrikBL/SwedishBeagle-dare"
        messages = [
            {
                "role": "system",
                "content": "You summarize texts that the users sends"
            },
            {
                "role": "user",
                "content": text
            }
        ]
        tokenizer = AutoTokenizer.from_pretrained(model)
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # Renamed from `pipeline` to avoid shadowing transformers.pipeline.
        generator = transformers.pipeline(
            "text-generation",
            model=model,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        outputs = generator(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
        return outputs[0]["generated_text"]

    def summarize(self, text):
        """Dispatch to the backend selected in __init__.

        Raises:
            ValueError: if ``self.model`` names an unknown backend
                (previously this silently returned None).
        """
        if self.model == "groq":
            return self.summarize_using_groq(text)
        if self.model == "SwedishBeagle-dare":
            return self.summarize_using_swedishbeagle(text)
        raise ValueError(f"Unknown model: {self.model!r}")
# Streamlit app
def main():
    """Streamlit entry point: build the app with the chosen backend and run it.

    Available models: "groq", "SwedishBeagle-dare".
    """
    app = Summarizer(model="SwedishBeagle-dare")
    app.run_app()


if __name__ == "__main__":
    main()