Pranjal-psytech committed on
Commit
4f137b2
1 Parent(s): 275476a

"Commit_code"

Browse files
Files changed (3) hide show
  1. corona_pred.py +68 -0
  2. corona_train.py +39 -0
  3. sars_mers_cov_other_train.csv +0 -0
corona_pred.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.feature_extraction.text import CountVectorizer
4
+ from sklearn.naive_bayes import MultinomialNB
5
+ import pickle
6
+ import sys
7
+
8
# Read the input CSV whose path is given as the first command-line argument.
# The rest of the script tokenizes the 'SEQ' column of this table.
if len(sys.argv) < 2:
    # Exit with a usage message instead of an opaque IndexError.
    sys.exit('Usage: {} <input_csv>'.format(sys.argv[0]))
infile = sys.argv[1]
covid19df = pd.read_csv(infile)
11
+
12
# Tokenization settings.
# kmer_size: length of each k-mer "word"; 6 gives hexamer words.
kmer_size = 6
# NGram: not used by this script directly (the fitted vectorizer is loaded
# from disk); the training script uses the same value as its n-gram size.
NGram = 4


def getKmers(sequence, size=kmer_size):
    """Split *sequence* into overlapping, lower-cased k-mer words.

    A window of ``size`` characters slides over the sequence one position
    at a time, so a sequence of length ``n`` yields ``n - size + 1`` words.

    Args:
        sequence: The sequence string to tokenize.
        size: Window length; defaults to ``kmer_size``.

    Returns:
        A list of lower-cased substrings in positional order. The list is
        empty when the sequence is shorter than ``size``.

    Raises:
        ValueError: If ``size`` is smaller than 1.
    """
    if size < 1:
        # A zero or negative window would silently produce empty/garbage words.
        raise ValueError('size must be a positive integer, got {!r}'.format(size))
    # Lower-case once up front instead of once per window.
    lowered = sequence.lower()
    return [lowered[start:start + size] for start in range(len(lowered) - size + 1)]
18
+
19
# Tokenize every sequence into k-mers and join them into one space-separated
# "sentence" per row, which is the text form handed to the vectorizer.
covid_texts = [' '.join(getKmers(seq)) for seq in covid19df['SEQ']]

# Load the vectorizer fitted by the training script and encode the texts.
# NOTE: unpickling can execute arbitrary code; only load trusted artifacts.
with open('countVectTrain.pkl', 'rb') as vectorizer_file:
    cv = pickle.load(vectorizer_file)
X = cv.transform(covid_texts)

# Load the trained model from disk and score every row.
filename = 'corona_pred.pkl'
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)
test_pred = model.predict(X)
pred_prob = model.predict_proba(X)
# Highest per-row class probability, expressed as a percentage.
test_pred_prob = pred_prob.max(1) * 100

# Append the predictions to the input rows; both frames use a fresh
# 0..n-1 index so the column-wise concat lines up row by row.
predictions = pd.DataFrame({
    'pred_label': test_pred,
    'pred_prob_percentage': test_pred_prob,
})
covid19df.reset_index(inplace=True, drop=True)
df_out = pd.concat([covid19df, predictions], axis=1)
df_out.to_csv('corona_pred_out.csv', index=False)

# Human-readable header labels for the HTML report, applied in this order.
_HEADER_LABELS = (
    ('PID', 'Sequence ID'),
    ('pred_label', 'Predicted Class'),
    ('pred_prob_percentage', 'Probability (in %)'),
)


def _header_label(column_name):
    """Return the display label for one output column name."""
    label = str(column_name)
    for raw, shown in _HEADER_LABELS:
        label = label.replace(raw, shown)
    return label


# The HTML report omits the raw sequences. Only column headers are relabelled,
# so cell values that happen to contain e.g. "PID" are left untouched
# (a text substitution over the rendered HTML would rewrite them too).
df_out = df_out.drop('SEQ', axis=1)
df_out_html = df_out.rename(columns=_header_label).to_html(index=False, justify='center')
print(df_out_html)
corona_train.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.feature_extraction.text import CountVectorizer
4
+ from sklearn.naive_bayes import MultinomialNB
5
+ import pickle
6
+
7
# Tokenization settings.
# kmer_size: length of each k-mer "word"; 6 gives hexamer words.
kmer_size = 6
# NGram: number of consecutive k-mer words per feature; passed to the
# CountVectorizer below as both ends of its ngram_range.
NGram = 4


def getKmers(sequence, size=kmer_size):
    """Split *sequence* into overlapping, lower-cased k-mer words.

    A window of ``size`` characters slides over the sequence one position
    at a time, so a sequence of length ``n`` yields ``n - size + 1`` words.

    Args:
        sequence: The sequence string to tokenize.
        size: Window length; defaults to ``kmer_size``.

    Returns:
        A list of lower-cased substrings in positional order. The list is
        empty when the sequence is shorter than ``size``.

    Raises:
        ValueError: If ``size`` is smaller than 1.
    """
    if size < 1:
        # A zero or negative window would silently produce empty/garbage words.
        raise ValueError('size must be a positive integer, got {!r}'.format(size))
    # Lower-case once up front instead of once per window.
    lowered = sequence.lower()
    return [lowered[start:start + size] for start in range(len(lowered) - size + 1)]
13
+
14
# Training pipeline: read labelled sequences, turn each into a k-mer
# "sentence", fit a count vectorizer and a Naive Bayes classifier, and
# persist both so the prediction script can reuse them.
print('Reading file...')
covid19df = pd.read_csv('sars_mers_cov_other_train.csv')

print('Creating token using K_Mer...')
kmer_lists = [getKmers(seq) for seq in covid19df['SEQ']]
# The raw sequences are not needed once they have been tokenized.
covid19df = covid19df.drop('SEQ', axis=1)

print('Converting token to list...')
# One space-separated string of k-mer words per training row.
covid_texts = [' '.join(kmers) for kmers in kmer_lists]
y_data = covid19df["CLASS"].values

print('Performing Count Vectorization...')
# Each feature is a run of exactly NGram consecutive k-mer words.
cv = CountVectorizer(ngram_range=(NGram, NGram))
X = cv.fit_transform(covid_texts)
# Use a context manager so the file is flushed and closed deterministically.
with open('countVectTrain.pkl', 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)

print('Creating Classifiers...')
NB_classifier = MultinomialNB(alpha=0.1)
NB_classifier.fit(X, y_data)

# Save the model to disk.
filename = 'corona_pred.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(NB_classifier, model_file)
sars_mers_cov_other_train.csv ADDED
The diff for this file is too large to render. See raw diff