author     hakangunturkun    2020-04-09 12:15:15 -0500
committer  hakangunturkun    2020-04-09 12:15:15 -0500
commit     23f31dcdabe7688ff7bd757110f3adecc21db312 (patch)
tree       3a2a800957f5933f4ca2eb6d338aefe2fe466404
parent     0e8df2be7f0ce86e393cfc32346de32e1931b2a3 (diff)
download   genecup-23f31dcdabe7688ff7bd757110f3adecc21db312.tar.gz
Functions for NLP and CNN model
-rw-r--r--    nlp.py    62
1 file changed, 62 insertions, 0 deletions
@@ -0,0 +1,62 @@
+import pickle
+import re
+import string
+
+import keras
+from keras.layers import Conv1D, Dense, Embedding, Flatten, MaxPooling1D
+from keras.models import Sequential
+from keras.preprocessing.sequence import pad_sequences
+from nltk.corpus import stopwords
+
+def clean_doc(doc):
+    """Lowercase, tokenize, and filter a document for the classifier."""
+    doc = doc.lower()
+    # split into tokens by whitespace
+    tokens = doc.split()
+    # remove punctuation from each token
+    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
+    tokens = [re_punc.sub('', w) for w in tokens]
+    # filter out short tokens
+    tokens = [word for word in tokens if len(word) > 1]
+    # filter out stop words
+    stop_words = set(stopwords.words('english'))
+    tokens = [w for w in tokens if w not in stop_words]
+    return tokens
+
+# load the tokenizer fitted on the training corpus
+with open('./nlp/tokenizer.pickle', 'rb') as handle:
+    tokenizer = pickle.load(handle)
+
+# load the vocabulary as a set of words for exact membership tests
+with open('./nlp/vocabulary.txt', 'r') as vocab_file:
+    vocab = set(vocab_file.read().split())
+
+# create the CNN model
+def create_model(vocab_size, max_length):
+    model = Sequential()
+    model.add(Embedding(vocab_size, 32, input_length=max_length))
+    model.add(Conv1D(filters=16, kernel_size=4, activation='relu'))
+    model.add(MaxPooling1D(pool_size=2))
+    model.add(Flatten())
+    model.add(Dense(10, activation='relu'))
+    model.add(Dense(1, activation='sigmoid'))
+    opt = keras.optimizers.Adamax(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
+    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[keras.metrics.AUC()])
+    return model
+
+def predict_sent(sent_for_pred):
+    """Classify a sentence as 'pos' or 'neg' with the trained CNN."""
+    max_length = 64
+    tokens = clean_doc(sent_for_pred)
+    # keep only in-vocabulary tokens
+    tokens = [w for w in tokens if w in vocab]
+    # join back into one line and encode it as a padded sequence
+    line = ' '.join(tokens)
+    tokenized_sent = tokenizer.texts_to_sequences([line])
+    tokenized_sent = pad_sequences(tokenized_sent, maxlen=max_length, padding='post')
+    # rebuild the network (23154 = training vocabulary size) and load the trained weights
+    model = create_model(23154, max_length)
+    model.load_weights("./nlp/weights.ckpt")
+    prediction = model.predict(tokenized_sent, verbose=0)
+    # the sigmoid output is the probability of the positive class
+    return 'pos' if prediction[0, 0] >= 0.5 else 'neg'
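
For orientation, a minimal usage sketch, assuming the ./nlp/ directory (tokenizer.pickle, vocabulary.txt, and the weights.ckpt checkpoint) sits next to nlp.py; the input sentence is illustrative only, not taken from the training data:

    # hypothetical usage of the new module
    from nlp import predict_sent

    label = predict_sent("Bdnf expression was significantly increased after chronic stress.")
    print(label)  # prints 'pos' or 'neg', depending on the trained weights

Since predict_sent rebuilds the network and reloads the checkpoint on every call, scoring many sentences would be faster if the model were created once at import time and reused.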