# Streamlit app: extract text from an uploaded image (OCR) or PDF and
# summarize it with an LLM (Groq-hosted Mixtral or local SwedishBeagle-dare).
import streamlit as st | |
import pdfplumber | |
from PIL import Image | |
import pytesseract | |
#from transformers import pipeline | |
import io | |
import os | |
from dotenv import load_dotenv | |
# groq | |
from groq import Groq | |
# SwedishBeagle-dare | |
from transformers import AutoTokenizer | |
import transformers | |
import torch | |
class Summarizer:
    """Streamlit app that extracts text from images/PDFs and summarizes it.

    The summarization backend is selected by the ``model`` constructor
    argument: ``"groq"`` (hosted chat-completion API) or
    ``"SwedishBeagle-dare"`` (local transformers pipeline).
    """

    def __init__(self, model="groq"):
        # Backend name used by summarize() for dispatch.
        self.model = model
        # Groq client is created eagerly even for the local backend;
        # it only needs GROQ_API_KEY at request time.
        self.client = self.load_groq()

    def run_app(self):
        """Render the Streamlit UI: upload a file, extract text, show a summary."""
        uploaded_file = st.file_uploader("Upload an Image or PDF", type=["jpg", "jpeg", "png", "pdf"])
        if uploaded_file is not None:
            if uploaded_file.type == "application/pdf":
                with st.spinner("Extracting text from PDF..."):
                    text = self.extract_text_from_pdf(uploaded_file)
            else:
                image = Image.open(uploaded_file)
                with st.spinner("Extracting text from image..."):
                    text = self.extract_text_from_image(image)
            if text:
                with st.spinner("Summarizing text..."):
                    # BUG FIX: dispatch through summarize() so the model chosen
                    # at construction time is honored. Previously this called
                    # summarize_using_groq() directly, silently ignoring
                    # self.model (e.g. "SwedishBeagle-dare" in main()).
                    summary = self.summarize(text)
                st.subheader("Summary")
                st.write(summary)
                st.subheader("Extracted Text")
                st.write(text)

    def extract_text_from_image(self, image):
        """Return text OCR'd from a PIL image via Tesseract."""
        return pytesseract.image_to_string(image)

    def extract_text_from_pdf(self, pdf):
        """Return the concatenated text of every page of *pdf* (a file-like object)."""
        text = ""
        with pdfplumber.open(pdf) as pdf_file:
            for page in pdf_file.pages:
                # BUG FIX: extract_text() returns None for pages with no text
                # layer (e.g. scanned pages); guard so += never raises TypeError.
                text += page.extract_text() or ""
        return text

    def load_groq(self):
        """Create a Groq client using GROQ_API_KEY from the environment / .env file."""
        load_dotenv()
        GROQ_API_KEY = os.getenv("GROQ_API_KEY")
        # NOTE(review): Groq() will be constructed even if the key is missing;
        # the failure then surfaces on the first API call.
        client = Groq(
            api_key=GROQ_API_KEY
        )
        return client

    def summarize_using_groq(self, text):
        """Summarize *text* via Groq's chat-completion API and return the summary string."""
        chat_completion = self.client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You summarize texts that the users sends"
                },
                {
                    "role": "user",
                    "content": text,
                }
            ],
            # NOTE(review): verify this model is still served by Groq;
            # hosted model names are rotated over time.
            model="mixtral-8x7b-32768",
        )
        return chat_completion.choices[0].message.content

    def summarize_using_swedishbeagle(self, text):
        """Summarize *text* locally with SwedishBeagle-dare and return the generated text.

        Model card: https://huggingface.co/FredrikBL/SwedishBeagle-dare
        Loads the tokenizer and pipeline on every call, so the first (and each
        subsequent) invocation is expensive.
        """
        model = "FredrikBL/SwedishBeagle-dare"
        messages = [
            {
                "role": "system",
                "content": "You summarize texts that the users sends"
            },
            {
                "role": "user",
                "content": text
            }
        ]
        tokenizer = AutoTokenizer.from_pretrained(model)
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # Renamed from `pipeline` to avoid shadowing transformers.pipeline.
        generator = transformers.pipeline(
            "text-generation",
            model=model,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        outputs = generator(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
        return outputs[0]["generated_text"]

    def summarize(self, text):
        """Dispatch *text* to the backend selected at construction time.

        Raises:
            ValueError: if ``self.model`` names no known backend (previously
                this case silently returned ``None``).
        """
        if self.model == "groq":
            return self.summarize_using_groq(text)
        elif self.model == "SwedishBeagle-dare":
            return self.summarize_using_swedishbeagle(text)
        raise ValueError(f"Unknown summarization model: {self.model!r}")
# Streamlit entry point.
def main():
    """Build the summarizer app and launch the Streamlit UI.

    Available models: "groq", "SwedishBeagle-dare".
    """
    app = Summarizer(model="SwedishBeagle-dare")
    app.run_app()


if __name__ == "__main__":
    main()