From ade29e7005941eed5a7f549c1c759f6b35365d6a Mon Sep 17 00:00:00 2001
From: hakangunturkun
Date: Mon, 13 Apr 2020 12:20:33 -0500
Subject: move nlp and cnn helper functions into server.py
---
server.py | 119 ++++++++++++++++++++++++++++++++++++++++++++++----------------
1 file changed, 89 insertions(+), 30 deletions(-)
diff --git a/server.py b/server.py
index f36a4b3..8ccd5ee 100755
--- a/server.py
+++ b/server.py
@@ -10,12 +10,36 @@ import tempfile
import random
import string
from ratspub import *
-from nlp import *
import time
import os
import re
import pytz
+
+from os import listdir
+from nltk.corpus import stopwords
+from nltk.stem.porter import PorterStemmer
+from collections import Counter
+import numpy as np
+from numpy import array
+import keras
+from keras.models import Model
+from keras.preprocessing.text import Tokenizer
+from keras.preprocessing.sequence import pad_sequences
+from keras.models import Sequential
+from keras.layers import Dense
+from keras.layers import Flatten
+from keras.layers import Embedding
+from keras.layers.convolutional import Conv1D
+from keras.layers.convolutional import MaxPooling1D
+from keras import metrics
+from keras import optimizers
+import pickle
+import tensorflow as tf
+
app=Flask(__name__)
datadir="/export/ratspub/"
app.config['SECRET_KEY'] = '#DtfrL98G5t1dC*4'
@@ -31,6 +55,39 @@ class users(db.Model):
password = db.Column(db.String(128), nullable=False)
date_created = db.Column(db.DateTime, default=datetime.utcnow)
+def clean_doc(doc, vocab):
+    # lowercase and split on whitespace
+    doc = doc.lower()
+    tokens = doc.split()
+    # strip punctuation from every token
+    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
+    tokens = [re_punc.sub('', w) for w in tokens]
+    # drop single-character tokens and English stop words
+    tokens = [word for word in tokens if len(word) > 1]
+    stop_words = set(stopwords.words('english'))
+    tokens = [w for w in tokens if w not in stop_words]
+    return tokens
+
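
(For orientation, a minimal sanity check of clean_doc as run in a Python
shell. The input sentence is hypothetical; the output assumes NLTK's
English stop-word list has been downloaded via nltk.download('stopwords').)

    >>> clean_doc("The mice were restrained, and corticosterone levels rose.", vocab)
    ['mice', 'restrained', 'corticosterone', 'levels', 'rose']
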
+# load tokenizer
+with open('./nlp/tokenizer.pickle', 'rb') as handle:
+ tokenizer = pickle.load(handle)
+
+# load vocabulary as a set of words, so the `w in vocab` checks below
+# are exact membership tests rather than substring searches in one string
+with open('./nlp/vocabulary.txt', 'r') as f:
+    vocab = set(f.read().split())
+
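
(The pickle loaded above must contain a Tokenizer fitted on the training
corpus. A sketch of how such a file is typically produced; `train_docs`,
an assumed list of cleaned training sentences, is not part of this patch:)

    from keras.preprocessing.text import Tokenizer
    import pickle

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_docs)  # train_docs: hypothetical list of strings
    with open('./nlp/tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
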
+# create the CNN model
+def create_model(vocab_size, max_length):
+ model = Sequential()
+ model.add(Embedding(vocab_size, 32, input_length=max_length))
+ model.add(Conv1D(filters=16, kernel_size=4, activation='relu'))
+ model.add(MaxPooling1D(pool_size=2))
+ model.add(Flatten())
+ model.add(Dense(10, activation='relu'))
+ model.add(Dense(1, activation='sigmoid'))
+ opt = keras.optimizers.Adamax(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
+ model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[keras.metrics.AUC()])
+ return model
+
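
(The layer stack is fixed here, so vocab_size and max_length must match
the values used at training time. A quick way to confirm the wiring,
using the constants that appear below in the /sentences route:)

    model = create_model(23154, 64)
    model.summary()
    # Embedding(23154, 32) -> Conv1D(16, 4) -> MaxPooling1D(2)
    #   -> Flatten -> Dense(10, relu) -> Dense(1, sigmoid)
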
@app.route("/")
def root():
return render_template('index.html')
@@ -537,18 +594,33 @@ def cytoscape():
@app.route("/sentences")
def sentences():
+    def predict_sent(sent_for_pred):
+        max_length = 64
+        # keep only tokens the model's vocabulary knows about
+        tokens = clean_doc(sent_for_pred, vocab)
+        tokens = [w for w in tokens if w in vocab]
+        # re-join and encode with the tokenizer the model was trained with
+        line = ' '.join(tokens)
+        tokenized_sent = tokenizer.texts_to_sequences([line])
+        tokenized_sent = pad_sequences(tokenized_sent, maxlen=max_length, padding='post')
+        # predict() returns a (1, 1) array holding the sigmoid output
+        pred = model.predict(tokenized_sent, verbose=0)
+        percent_sent = pred[0, 0]
+        if round(percent_sent) == 0:
+            return 'neg'
+        else:
+            return 'pos'
pmid_list=[]
edge=request.args.get('edgeID')
(tf_name, gene0, cat0)=edge.split("|")
+    if cat0 == 'stress':
+        # rebuild the CNN with the training-time vocabulary size and
+        # sentence length, then restore the trained weights
+        model = create_model(23154, 64)
+        model.load_weights("./nlp/weights.ckpt")
out3=""
-# out5_pl=""
-# out5_sn=""
out_pos = ""
out_neg = ""
num_abstract = 0
-    stress_systemic = "…"
-
## show the cytoscape graph for one gene from the top gene list
@app.route("/showTopGene")
def showTopGene():