File size: 3,821 Bytes
562ccd6
 
 
9232135
562ccd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf27007
562ccd6
 
a8b401c
 
 
 
 
 
 
 
 
 
 
 
 
 
10692da
a8b401c
 
 
 
06e7238
a8b401c
 
562ccd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
625231b
bf27007
562ccd6
 
 
 
be2a07b
9232135
 
 
 
 
 
 
 
2b313b9
9232135
 
994d177
562ccd6
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import gradio as gr
import os
import gc
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import Counter

from sklearn.preprocessing import LabelEncoder

from keras.models import Model
from keras.regularizers import l2
from keras.constraints import max_norm
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.callbacks import EarlyStopping
from keras.layers import Input, Dense, Dropout, Flatten, Activation
from keras.layers import Conv1D, Add, MaxPooling1D, BatchNormalization
from keras.layers import Embedding, Bidirectional, LSTM, CuDNNLSTM, GlobalMaxPooling1D

import tensorflow as tf
from huggingface_hub import hf_hub_url, cached_download


class Sequence:
    """Convert a raw amino-acid string into the one-hot tensor fed to the model."""

    # Integer code of each of the 20 amino-acid letters handled here. Codes
    # start at 1; 0 is left for padding and for any character not listed.
    codes = {c: i+1 for i, c in enumerate(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'])}

    # Every sequence is padded or truncated to this many residues.
    MAX_LENGTH = 100
    # Width of the one-hot axis: the 20 letter codes plus the shared 0 slot.
    NUM_CLASSES = len(codes) + 1

    @classmethod
    def integer_encoding(cls, data):
        """
        Encode a sequence of amino-acid letters as integers.

        - Each of the 20 letters in `codes` maps to its code (1..20).
        - Any other character maps to 0.

        data: iterable of single-character strings (typically a str)
        returns: 1-D numpy array with one integer per character
        """
        return np.array([cls.codes.get(code, 0) for code in data])

    @classmethod
    def prepare(cls, sequence):
        """
        Build the model input for a single sequence.

        The text is stripped and upper-cased, integer-encoded, padded with
        zeros at the end (or truncated at the end) to MAX_LENGTH, and then
        one-hot encoded with NUM_CLASSES columns.

        sequence: amino-acid string as typed by the user
        returns: one-hot array holding a batch of exactly one sequence
        """
        sequence = sequence.strip().upper()
        ie = cls.integer_encoding(sequence)
        padded_ie = pad_sequences([ie], maxlen=cls.MAX_LENGTH, padding='post', truncating='post')
        # Fix the one-hot width explicitly: without num_classes, to_categorical
        # sizes the axis from the largest code present in this one sequence.
        return to_categorical(padded_ie, num_classes=cls.NUM_CLASSES)


def residual_block(data, filters, d_rate):
    """
    Two BatchNorm -> ReLU -> Conv1D stages followed by a skip connection.

    data: input tensor; it is also added back onto the block's output
    filters: number of filters used by both convolutions
    d_rate: dilation rate handed to the first convolution
    returns: element-wise sum of the second convolution's output and `data`
    """
    # Stage 1: normalise, activate, then a kernel-size-1 convolution.
    # NOTE(review): with kernel size 1 the dilation rate probably has no
    # effect -- confirm before changing, the saved weights depend on this shape.
    out = BatchNormalization()(data)
    out = Activation('relu')(out)
    out = Conv1D(
        filters,
        1,
        dilation_rate=d_rate,
        padding='same',
        kernel_regularizer=l2(0.001),
    )(out)

    # Stage 2: normalise, activate, then a kernel-size-3 convolution.
    out = BatchNormalization()(out)
    out = Activation('relu')(out)
    out = Conv1D(filters, 3, padding='same', kernel_regularizer=l2(0.001))(out)

    # Skip connection: add the untouched input back onto the result.
    return Add()([out, data])

def get_model():
    """
    Assemble the convolutional classifier, compile it and load its weights.

    The weights file 'model2.h5' is fetched from the
    "jonathang/Protein_Family_CNN" Hugging Face repository.

    returns: the compiled Keras Model with the downloaded weights loaded
    """
    inputs = Input(shape=(100, 21))

    # Initial kernel-size-1 convolution producing 128 channels.
    features = Conv1D(128, 1, padding='same')(inputs)

    # Two residual blocks with dilation rates 2 and 3.
    features = residual_block(features, 128, 2)
    features = residual_block(features, 128, 3)

    # Downsample and regularise before the classifier head.
    features = MaxPooling1D(3)(features)
    features = Dropout(0.5)(features)

    # Softmax classifier over 1000 output classes.
    features = Flatten()(features)
    probabilities = Dense(1000, activation='softmax', kernel_regularizer=l2(0.0001))(features)

    classifier = Model(inputs=inputs, outputs=probabilities)
    classifier.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    weights_url = hf_hub_url("jonathang/Protein_Family_CNN", 'model2.h5')
    classifier.load_weights(cached_download(weights_url))
    return classifier


# Build the network and fetch its label mappings once, at import time, so that
# every call to greet() reuses them.
model = get_model()
mappings_path = cached_download(hf_hub_url("jonathang/Protein_Family_CNN", 'prot_mappings.json'))
# JSON text is UTF-8; pin the encoding instead of relying on the locale default.
with open(mappings_path, encoding="utf-8") as f:
    # greet() reads the 'id2fam_asc' and 'fam_asc2fam_id' tables from this dict.
    prot_mappings = json.load(f)

def greet(Amino_Acid_Sequence):
    """
    Classify one amino-acid sequence and describe the result as text.

    Amino_Acid_Sequence: sequence string entered by the user
    returns: a report containing the input, the encoded input, the predicted
        family accession and ID, and the raw model output
    """
    processed_seq = Sequence.prepare(Amino_Acid_Sequence)
    # The batch holds a single sequence, so keep only its output row.
    raw_prediction = model.predict(processed_seq)[0]

    # The mapping tables are keyed by the class index as a string.
    fam_asc = prot_mappings['id2fam_asc'][str(raw_prediction.argmax())]
    fam_id = prot_mappings['fam_asc2fam_id'][fam_asc]

    report = f"Input is {Amino_Acid_Sequence}.\nProcessed input is:\n{processed_seq}\n\nModel makes prediction which maps to:\nFamily Accession={fam_asc} and ID={fam_id}\n\nRaw Prediction:\n{raw_prediction}\n"
    # Run a garbage-collection pass after each request, as the original did.
    gc.collect()
    return report

# Serve greet() through a plain text-in / text-out Gradio interface.
iface = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="text",
)
iface.launch()