File size: 7,730 Bytes
ff81e3f 3911265 ff81e3f 65b1bb3 ff81e3f 3911265 ff81e3f acd3106 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 |
import streamlit as st
import gradio as gr
import numpy as np
import whisper
import os
import streamlit.components.v1 as components
import tempfile
import io
import requests
import json
import openai
from transformers import AutoConfig, AutoTokenizer, AutoModel
from summarizer import Summarizer
# NOTE(review): raising the upload limit must be done via Streamlit config
# (server.maxUploadSize), not st.set_option at runtime — hence commented out.
# st.set_option('server.maxUploadSize', 500)

# Initialize session state once per session. The original assigned these
# unconditionally, which re-ran on every Streamlit rerun and wiped any
# previously extracted claims each time a widget was touched.
if 'claims_extraction' not in st.session_state:
    st.session_state.claims_extraction = ""
if 'userinput' not in st.session_state:
    st.session_state.userinput = ""
def chunk_text(text, chunk_size=2000):
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    The last piece may be shorter when len(text) is not an exact multiple of
    chunk_size; an empty string yields an empty list.
    """
    return [text[offset:offset + chunk_size]
            for offset in range(0, len(text), chunk_size)]
# Streamlit Session State
# Persist learning objectives across reruns (created once per session).
if 'learning_objectives' not in st.session_state:
    st.session_state.learning_objectives = ""
# Initialize the Whisper model outside the button so the (slow) model load
# happens only once per session instead of on every click.
if 'whisper_model' not in st.session_state:
    st.session_state.whisper_model = whisper.load_model("base")
# Streamlit Interface
# Landing copy rendered at the top of the page.
markdown_text = """
# 👋🏻Welcome to [Team](https://huggingface.co/TeamTonic) [Tonic](https://huggingface.co/Tonic) 's Patentable Claims Extractor.
Here you can input audio and text and extract patentable claims from these conversational inputs using [LegalBert](nlpaueb/legal-bert-base-uncased).
- Save time and effort when ideating for your future business. Expect latency upwards of 2.5 hours !
"""
# Render the Markdown content
st.markdown(markdown_text)
# API Key Input
api_key = st.text_input("Enter your OpenAI API Key:", type="password")

# Audio Upload
st.write("Upload an audio file (supported formats: mp3, wav, ogg)")
audio_file = st.file_uploader("Choose an audio file", type=["mp3", "wav", "ogg"], key="audio_file")

# Read the uploaded bytes once and show an inline player as upload feedback.
# The misleading "Transcribing..."/"Transcription complete" messages were
# removed from this section: no transcription runs until the button below
# is pressed, so showing them on upload was false status reporting.
audio_data = None
if audio_file is not None:
    audio_data = audio_file.read()
    st.audio(audio_data, format="audio/wav")
# Transcription is triggered explicitly so uploading alone does no heavy work.
if st.button('Start Transcription'):
    model = st.session_state.whisper_model
    if audio_data:
        # Whisper's transcribe() takes a file path, so spill the uploaded
        # bytes to a temp file. Named tmp_audio — the original re-used the
        # name `audio_file`, shadowing the st.file_uploader variable above.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio:
            tmp_audio.write(audio_data)
            audio_file_path = tmp_audio.name
        st.audio(audio_file_path, format="audio/wav")
        st.info("Transcribing...")
        result = model.transcribe(audio_file_path)
        # Success is reported only after transcription actually finished
        # (the original showed "Transcription complete" before transcribing).
        st.success("Transcription complete")
        transcript = result['text']  # Whisper returns the full text under 'text'
        with st.expander("See transcript"):
            st.markdown(transcript)

# Display the Whisper transcription and seed the editable input box with it.
if 'transcript' in locals():
    st.text("Transcription:")
    st.text(transcript)
    # Update the user input field with the transcription
    st.session_state.userinput = st.text_area("Input Text:", transcript)
# Model Selection Dropdown
# Lets the user pick which OpenAI chat model performs the claim extraction.
model_choice = st.selectbox(
    "Select the model you want to use:",
    ["gpt-3.5-turbo-0301", "gpt-3.5-turbo-0613", "gpt-3.5-turbo", "gpt-4-0314", "gpt-4-0613", "gpt-4"]
)
# Context, Subject, and Level
# NOTE(review): `context` is defined but never passed to the ChatCompletion
# call below — presumably it was meant to be a system message; confirm and
# wire it in, or remove it.
context = "You are a patent claims identifier and extractor. You will freeform text, identify any claims contained therein that may be patentable. You identify, extract, print such claims, briefly explain why each claim is patentable."
# Initialize OpenAI API (pre-1.0 openai SDK style: module-level api_key)
if api_key:
    openai.api_key = api_key
# Learning Objectives
st.write("### Patentable Claims:")
# Initialize autogenerated objectives
claims_extraction = ""
# Initialize status placeholder — a single slot reused for progress messages.
learning_status_placeholder = st.empty()
disable_button_bool = False
# Extract-claims action: enabled only when there is input text and an API key.
# The button's widget key must NOT be 'claims_extraction': that session-state
# key is assigned below, and Streamlit forbids writing to a widget-owned key.
if st.session_state.userinput and api_key and st.button("Extract Claims", key="extract_claims_button", disabled=disable_button_bool):
    # Split the user input into chunks. The original called
    # chunk_text(userinput) with an undefined bare name — NameError.
    input_chunks = chunk_text(st.session_state.userinput)
    # Accumulates the extracted claims across all chunks.
    all_extracted_claims = ""
    # enumerate avoids the O(n^2) input_chunks.index(chunk) of the original
    # (which also misbehaves when two chunks have identical text).
    for chunk_number, chunk in enumerate(input_chunks, start=1):
        # Display status message for the current chunk
        learning_status_placeholder.text(f"Extracting Patentable Claims for chunk {chunk_number}...")
        # One chat-completion call per chunk; the prompt asks for claims only.
        claims_extraction_response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[
                {"role": "user", "content": f"Extract any patentable claims from the following: \n {chunk}. \n Extract each claim. Briefly explain why you extracted this word phrase. Exclude any additional commentary."}
            ]
        )
        # Extract the generated claims text from the API response.
        claims_extraction = claims_extraction_response['choices'][0]['message']['content']
        # Append the extracted claims from the current chunk to the overall results
        all_extracted_claims += claims_extraction.strip()
    # Persist across reruns so the display code below can read it.
    st.session_state.claims_extraction = all_extracted_claims
    # Display generated claims for all chunks
    learning_status_placeholder.text(f"Patentable Claims Extracted!\n{all_extracted_claims.strip()}")
# Get the extracted claims from Streamlit's session state
claims_extracted = st.session_state.claims_extraction

# Display the Extracted Claims. The original tested for a 'claims_extracted'
# session-state key that is never written (the real key is
# 'claims_extraction') and then read st.session_state.claims_extracted,
# which would raise AttributeError. Test the value we just fetched instead.
if claims_extracted:
    st.text("Extracted Claims:")
    st.text(claims_extracted)
# Define the BERT-based model name (legal-domain BERT encoder)
model_name = 'nlpaueb/legal-bert-base-uncased'
# Initialize BERT-based model and tokenizer.
# output_hidden_states=True is required by the extractive Summarizer, which
# pools hidden states to score sentences.
custom_config = AutoConfig.from_pretrained(model_name)
custom_config.output_hidden_states = True
custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
# Extractive summarizer wrapping the legal-domain encoder.
bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
print('Using model {}\n'.format(model_name))
# The extracted claims were already fetched from session state above.
# Fixed-size character window: keeps each piece small enough for the
# summarizer to handle comfortably.
chunk_size = 350

# Break the extracted claims into chunk_size-character slices. A non-string
# value (nothing extracted yet) produces no chunks at all.
chunks = []
if isinstance(claims_extracted, str):
    for start in range(0, len(claims_extracted), chunk_size):
        chunks.append(claims_extracted[start:start + chunk_size])
# Process each chunk with the BERT-based extractive summarizer.
summaries = []
for chunk in chunks:
    summary = bert_legal_model(chunk, min_length=20, ratio=0.9)
    summaries.append(summary)

# Display each summary exactly once. The original rendered every summary
# twice: first via st.write "### Summary N" headers, then again in a
# second "BERT Summaries" st.text loop.
if summaries:
    st.text("BERT Summaries:")
    for i, summary in enumerate(summaries):
        st.write(f"### Summary {i + 1}")
        st.write(summary)
# Citation / credits footer, rendered as raw HTML (hence unsafe_allow_html).
st.markdown("<sub>This app was created by [Tonic](https://huggingface.co/tonic) with help from [MIND INTERFACES](https://huggingface.co/MIND-INTERFACES) & [Taylor](https://huggingface.co/Cloudfaith) [join us on discord](https://discord.gg/5RmtZVVfgQ) </sub>", unsafe_allow_html=True)
|