from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
import streamlit as st

# load tokenizer and fine-tuned model from the saved checkpoint directory
save_directory = "saved"
model = DistilBertForSequenceClassification.from_pretrained(save_directory)
model.eval()
tokenizer = DistilBertTokenizerFast.from_pretrained(save_directory)
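# NOTE: "saved" is assumed to be a local directory holding a DistilBERT
# checkpoint fine-tuned for binary acceptance prediction (num_labels=2);
# this script does not create it.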

# convert the dataset into parallel lists (patent_numbers, abstracts, claims, texts, labels); a label is 1 if the decision is ACCEPTED, else 0
def dataset_to_lists(dataset):
    patent_numbers = []
    abstracts = []
    claims = []
    texts = []
    labels = []
    for data in dataset:
        patent_number = data['patent_number']
        abstract = data['abstract']
        claim = data['claims']
        text = data['abstract'] + ' ' + data['claims']  # space keeps the two sections from running together
        label = 1 if data['decision'] == 'ACCEPTED' else 0
        patent_numbers.append(patent_number)
        abstracts.append(abstract)
        claims.append(claim)
        texts.append(text)
        labels.append(label)
    return patent_numbers, abstracts, claims, texts, labels

# Dataset wrapper so tokenized encodings and labels can be batched by a DataLoader
class TextEncodeDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
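
# The TextEncodeDataset above is not exercised by the interactive flow below.
# A minimal sketch of how it could drive batch evaluation with a DataLoader
# (the helper name and batch size are our own, not part of the original app):
def batch_predict(texts, labels, batch_size=8):
    encodings = tokenizer(texts, truncation=True, padding=True)
    loader = DataLoader(TextEncodeDataset(encodings, labels), batch_size=batch_size)
    scores = []
    with torch.no_grad():
        for batch in loader:
            output = model(batch['input_ids'], attention_mask=batch['attention_mask'])
            # probability of class 1 (ACCEPTED) for each example in the batch
            scores.extend(F.softmax(output.logits, dim=1)[:, 1].tolist())
    return scores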


# load the HUPD sample split; the train range below covers January 2016 filings, and only "patent_number", "abstract", "claims", and "decision" are used
dataset_dict = load_dataset('HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather", 
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-31',
    val_filing_start_date='2016-02-01',
    val_filing_end_date='2016-02-01',
)
dataset = dataset_dict['train']
dataset.set_format(type="torch", columns=["patent_number", "abstract", "claims", "decision"])

# transform dataset to lists of data
patent_numbers, abstracts, claims, texts, labels = dataset_to_lists(dataset)
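
# NOTE (suggestion, not part of the original app): Streamlit re-executes this
# whole script on every widget interaction, so the dataset above is reloaded
# and re-flattened each time. Wrapping the loading in a function decorated
# with st.cache_data (Streamlit >= 1.18; earlier releases use st.cache) would
# avoid the repeated work.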


# select a patent_number and get relevant info
patent_number = str(st.selectbox('select a patent', patent_numbers))
selected_idx = patent_numbers.index(patent_number)
abstract = abstracts[selected_idx]
claim = claims[selected_idx]
text = texts[selected_idx]
label = labels[selected_idx]

# display abstract and claim
st.write('abstract: ' + abstract)
st.write('claim: ' + claim)

# tokenize the selected text as a batch of one; truncation=True caps the input at DistilBERT's 512-token limit
encoding = tokenizer(text, truncation=True, padding=True, return_tensors='pt')

# click to run the prediction
if st.button('Run'):
    with torch.no_grad():
        output = model(
            encoding['input_ids'],
            attention_mask=encoding['attention_mask'],
            labels=torch.tensor([label]),  # passing labels makes the output include the loss
        )
        predictions = F.softmax(output.logits, dim=1)
        score = predictions[0][1].item()  # probability of class 1 (ACCEPTED)
        st.write('predicted probability of acceptance: ' + str(score))
        st.write('actual decision (1 = ACCEPTED): ' + str(label))
        st.write('output: ' + str(output))