import streamlit as st
import torch
from datasets import concatenate_datasets
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import pipeline
# Load HUPD dataset
dataset_dict = load_dataset(
"HUPD/hupd",
name="sample",
data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
icpr_label=None,
train_filing_start_date="2016-01-01",
train_filing_end_date="2016-01-21",
val_filing_start_date="2016-01-22",
val_filing_end_date="2016-01-31",
)
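# The HUPD "sample" config builds its train/validation splits from the filing-date
# windows passed above, so the "validation" split used below holds applications
# filed between 2016-01-22 and 2016-01-31.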
# Process data
filtered_dataset = dataset_dict["validation"].filter(
    lambda e: e["decision"] == "ACCEPTED" or e["decision"] == "REJECTED"
)
seed = 88
accepted = filtered_dataset.filter(lambda e: e["decision"] == "ACCEPTED").shuffle(seed).select(range(5))
rejected = filtered_dataset.filter(lambda e: e["decision"] == "REJECTED").shuffle(seed).select(range(5))
dataset = concatenate_datasets([accepted, rejected])
dataset = dataset.sort("patent_number")
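# The working dataset now holds 5 accepted and 5 rejected sample applications,
# sorted by patent number so the selectbox order is stable across reruns.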
# Load the classifier model trained on Colab and its tokenizer
model = torch.load("patent_classifier_v4.pt", map_location=torch.device("cpu"))
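# torch.load here assumes the checkpoint stores the full fine-tuned nn.Module
# (saved with torch.save(model)), not just a state_dict; switch to eval mode so
# dropout is disabled during inference.
model.eval()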
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer_kwargs = {"padding": True, "truncation": True}  # truncate long inputs to the model's max length
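# Callback for the selectbox below: copy the chosen application's fields into
# session state so the widgets keyed on them refresh.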
def load_data():
    selected_application = dataset.select([applications[st.session_state.id]])
    st.session_state.abstract = selected_application["abstract"][0]
    st.session_state.claims = selected_application["claims"][0]
    st.session_state.title = selected_application["title"][0]
    st.session_state.decision = selected_application["decision"][0]
st.title("CS-GY-6613 Project Milestone 3")
# List patent numbers for select box
applications = {}
for ds_index, example in enumerate(dataset):
applications.update({example["patent_number"]: ds_index})
st.selectbox(
"Select a sample patent application:", applications, on_change=load_data, key="id"
)
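# The selectbox lists the patent numbers (the dict keys); picking one triggers
# load_data, which looks up the row index and refreshes the keyed widgets below.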
# Sample title/decision are displayed for additional context only; they are not fed to the model
st.text_input("Sample Title", key="title", value=dataset[0]["title"],)
st.text_input("Sample Decision", key="decision", value=dataset[0]["decision"])
# Classifier input form
with st.form("Input Form"):
    abstract = st.text_area(
        "Abstract", key="abstract", value=dataset[0]["abstract"], height=200
    )
    claims = st.text_area(
        "Claims", key="claims", value=dataset[0]["claims"], height=200
    )
    submitted = st.form_submit_button("Get Patentability Score")
if submitted:
    # Encode abstract and claims together as a sentence pair for the classifier
    tokens = tokenizer(abstract, claims, return_tensors="pt", **tokenizer_kwargs)
    with torch.no_grad():
        output = model(**tokens)
    logits = output.logits
    pred = torch.softmax(logits, dim=1)
    score = pred[0][1]  # index 1 of the softmax output is the probability that decision = ACCEPTED
    st.markdown(
        "This application's patentability score is **{}**.".format(round(score.item(), 4))
    )