|
import tempfile

import openai
import streamlit as st
import whisper
from summarizer import Summarizer
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
|
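# Streamlit app: transcribe uploaded audio with Whisper, extract patentable
# claims from the transcript via the OpenAI chat API, and summarize the
# extracted claims with a LegalBERT-based extractive summarizer.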
|
|
|
|
|
|
|
|
|
|
|
# Initialize session state keys so values persist across Streamlit reruns.
if 'claims_extraction' not in st.session_state:
    st.session_state.claims_extraction = ""

if 'userinput' not in st.session_state:
    st.session_state.userinput = ""
|
|
|
|
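# Split long text into fixed-size character chunks so each chunk fits
# comfortably within the language model's context window.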
|
def chunk_text(text, chunk_size=2000):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start = end
    return chunks
|
|
|
|
|
if 'learning_objectives' not in st.session_state:
    st.session_state.learning_objectives = ""
|
|
|
|
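# Load the Whisper model once and cache it in session state so it is not
# reloaded on every script rerun.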
|
if 'whisper_model' not in st.session_state:
    st.session_state.whisper_model = whisper.load_model("base")
|
|
|
|
|
|
|
markdown_text = """
# 👋🏻 Welcome to [Team](https://huggingface.co/TeamTonic) [Tonic](https://huggingface.co/Tonic)'s Patentable Claims Extractor.

Here you can input audio and text and extract patentable claims from these conversational inputs using [LegalBert](https://huggingface.co/nlpaueb/legal-bert-base-uncased).

- Save time and effort when ideating for your future business. Expect latency upwards of 2.5 hours!
"""

st.markdown(markdown_text)
|
|
|
|
|
api_key = st.text_input("Enter your OpenAI API Key:", type="password")
|
|
|
|
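# Audio upload: accept a recording and preview it in the app.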
|
st.write("Upload an audio file (supported formats: mp3, wav, ogg)")
audio_file = st.file_uploader("Choose an audio file", type=["mp3", "wav", "ogg"], key="audio_file")

audio_data = None

if audio_file is not None:
    audio_data = audio_file.read()
    st.audio(audio_data, format="audio/wav")
|
|
|
|
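# Transcription: write the uploaded bytes to a temporary file and run Whisper on it.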
|
if st.button('Start Transcription'):
    model = st.session_state.whisper_model

    if audio_data:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio_file:
            tmp_audio_file.write(audio_data)
            audio_file_path = tmp_audio_file.name

        st.info("Transcribing...")
        result = model.transcribe(audio_file_path)
        transcript = result['text']
        st.success("Transcription complete")

        with st.expander("See transcript"):
            st.markdown(transcript)
|
|
|
|
|
if 'transcript' in locals():
    st.text("Transcription:")
    st.text(transcript)
    st.session_state.userinput = st.text_area("Input Text:", transcript)
else:
    st.session_state.userinput = st.text_area("Input Text:", st.session_state.userinput)
|
|
|
|
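# Model selection and the instruction given to the claims-extraction model.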
|
model_choice = st.selectbox(
    "Select the model you want to use:",
    ["gpt-3.5-turbo-0301", "gpt-3.5-turbo-0613", "gpt-3.5-turbo", "gpt-4-0314", "gpt-4-0613", "gpt-4"]
)

context = "You are a patent claims identifier and extractor. You will be given freeform text and will identify any claims contained therein that may be patentable. You identify, extract, and print such claims, and briefly explain why each claim is patentable."
|
|
|
|
|
if api_key:
    openai.api_key = api_key
|
|
|
|
|
st.write("### Patentable Claims:")

claims_extraction = ""

learning_status_placeholder = st.empty()

disable_button_bool = False
|
|
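# Claims extraction: chunk the user input and send each chunk to the OpenAI
# chat API, accumulating the extracted claims in session state.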
|
if st.session_state.userinput and api_key and st.button("Extract Claims", key="extract_claims_button", disabled=disable_button_bool):

    input_chunks = chunk_text(st.session_state.userinput)

    all_extracted_claims = ""

    for chunk_index, chunk in enumerate(input_chunks, start=1):

        learning_status_placeholder.text(f"Extracting Patentable Claims for chunk {chunk_index}...")

        claims_extraction_response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[
                {"role": "system", "content": context},
                {"role": "user", "content": f"Extract any patentable claims from the following: \n {chunk}. \n Extract each claim. Briefly explain why you extracted this word phrase. Exclude any additional commentary."}
            ]
        )

        claims_extraction = claims_extraction_response['choices'][0]['message']['content']

        all_extracted_claims += claims_extraction.strip() + "\n"

    st.session_state.claims_extraction = all_extracted_claims

    learning_status_placeholder.text(f"Patentable Claims Extracted!\n{all_extracted_claims.strip()}")
|
|
|
|
|
claims_extracted = st.session_state.claims_extraction

if claims_extracted:
    st.text("Extracted Claims:")
    st.text(claims_extracted)
|
|
|
|
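# Summarization: load a LegalBERT model and wrap it in an extractive summarizer.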
|
model_name = 'nlpaueb/legal-bert-base-uncased'

custom_config = AutoConfig.from_pretrained(model_name)
custom_config.output_hidden_states = True
custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
print('Using model {}\n'.format(model_name))
|
|
|
|
|
|
|
|
|
|
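# Split the extracted claims into small chunks and summarize each one.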
|
chunk_size = 350

if isinstance(claims_extracted, str):
    chunks = [claims_extracted[i:i + chunk_size] for i in range(0, len(claims_extracted), chunk_size)]
else:
    chunks = []
|
|
|
|
|
summaries = []
for chunk in chunks:
    summary = bert_legal_model(chunk, min_length=20, ratio=0.9)
    summaries.append(summary)
|
|
|
|
|
|
|
|
|
if summaries:
    st.text("BERT Summaries:")
    for i, summary in enumerate(summaries):
        st.write(f"### Summary {i + 1}")
        st.write(summary)
|
|
|
|
|
st.markdown("<sub>This app was created by [Tonic](https://huggingface.co/tonic) with help from [MIND INTERFACES](https://huggingface.co/MIND-INTERFACES) & [Taylor](https://huggingface.co/Cloudfaith). [Join us on Discord](https://discord.gg/5RmtZVVfgQ)</sub>", unsafe_allow_html=True)
|
|