DrishtiSharma committed on
Commit
11f12ef
·
verified ·
1 Parent(s): 986681b

Create temp1.py

Browse files
Files changed (1) hide show
  1. mylab/temp1.py +73 -0
mylab/temp1.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import torch
3
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
+ from datasets import load_dataset
5
+ import pandas as pd
6
+
7
# Hub identifier of the fine-tuned checkpoint used for patent classification.
model_name = "juliaannjose/finetuned_model"


@st.cache_resource
def load_model(model_name):
    """Fetch the tokenizer and sequence-classification model for a checkpoint.

    Decorated with ``st.cache_resource`` so Streamlit can reuse the loaded
    objects instead of rebuilding them on each script rerun.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # Tuple elements are evaluated left to right: tokenizer first, then model.
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForSequenceClassification.from_pretrained(model_name),
    )


tokenizer, model = load_model(model_name)
17
+
18
@st.cache_data(show_spinner=False)
def load_patent_frame():
    """Load the HUPD "sample" configuration as a single tidy DataFrame.

    The train and validation splits are each converted to pandas,
    concatenated with a fresh index, and reduced to the columns the app reads.

    Cached with ``st.cache_data`` because Streamlit re-executes the whole
    script on every widget interaction; without the cache the dataset would
    be re-loaded and re-converted on each form submit.

    Returns:
        A DataFrame with the columns ``patent_number``, ``decision``,
        ``abstract``, ``claims`` and ``filing_date``.
    """
    # NOTE(review): training covers January 2016 while the validation window
    # is in January 2017 -- confirm the "sample" configuration actually has
    # filings in that range, otherwise the validation split may be empty.
    dataset_dict = load_dataset(
        "HUPD/hupd",
        name="sample",
        data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
        icpr_label=None,
        train_filing_start_date="2016-01-01",
        train_filing_end_date="2016-01-31",
        val_filing_start_date="2017-01-22",
        val_filing_end_date="2017-01-31",
    )
    df_train = pd.DataFrame(dataset_dict["train"])
    df_val = pd.DataFrame(dataset_dict["validation"])
    combined = pd.concat([df_train, df_val], ignore_index=True)

    # Keep only the fields the UI displays or feeds to the model.
    return combined[["patent_number", "decision", "abstract", "claims", "filing_date"]]


# The built-in cache spinner is disabled above so this message is the only one shown.
with st.spinner("Loading patent dataset..."):
    df = load_patent_frame()

# Unique identifiers offered in the selection box.
PAN = df["patent_number"].drop_duplicates()
37
+
38
# ---- Streamlit page ----
st.title("Harvard USPTO Patentability Predictor")

with st.form("patent-form"):
    make_choice = st.selectbox("Select the Patent Application Number:", PAN)
    submitted = st.form_submit_button(label="Submit")

# Everything below runs only when the form's submit button reports a click.
if submitted:
    # Build the row mask once; the first matching row supplies every field.
    is_selected = df["patent_number"] == make_choice
    abstract = df.loc[is_selected, "abstract"].iloc[0]
    claims = df.loc[is_selected, "claims"].iloc[0]
    decision = df.loc[is_selected, "decision"].iloc[0]

    st.subheader(":blue[Patent Abstract]")
    st.info(abstract)
    st.subheader(":blue[Patent Claims]")
    st.info(claims)

    # The model sees the abstract and the claims joined by a single space.
    input_text = " ".join([abstract, claims])
    inputs = tokenizer(input_text, truncation=True, padding=True, return_tensors="pt")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.softmax(outputs.logits, dim=1)

    # Label order assumed by this app.
    # NOTE(review): confirm it matches the checkpoint's own id2label mapping.
    id2label = {0: "REJECTED", 1: "ACCEPTED"}
    predicted_class_id = probabilities.argmax().item()
    pred_label = id2label[predicted_class_id]
    # Probability of the predicted class for the single input row.
    score = probabilities[0][predicted_class_id].item()

    st.subheader(":green[Prediction Result]")
    if pred_label != "ACCEPTED":
        st.error(f"The patent is likely to be **REJECTED** with a score of {score:.2f}.")
    else:
        st.success(f"The patent is likely to be **ACCEPTED** with a score of {score:.2f}.")

    st.write(f"**Decision Summary:** {decision}")