author | hakangunturkun | 2020-04-13 15:17:53 -0500 |
---|---|---|
committer | hakangunturkun | 2020-04-13 15:17:53 -0500 |
commit | 4d4c37e7c8e9d0e85a937c538fba530879a3f4e9 (patch) | |
tree | 4c90bc95f83a43d4b47aeb7bbbd5343b1eb9f8bb | |
parent | ade29e7005941eed5a7f549c1c759f6b35365d6a (diff) | |
download | genecup-4d4c37e7c8e9d0e85a937c538fba530879a3f4e9.tar.gz |
remove nlp.py
-rw-r--r-- | nlp.py | 83 |
1 files changed, 0 insertions, 83 deletions
```diff
@@ -1,83 +0,0 @@
-import string
-import re
-import os
-from os import listdir
-from nltk.corpus import stopwords
-from nltk.stem.porter import PorterStemmer
-from collections import Counter
-import numpy as np
-from numpy import array
-import keras
-from keras.models import Model
-from keras.preprocessing.text import Tokenizer
-from keras.preprocessing.sequence import pad_sequences
-from keras.models import Sequential
-from keras.layers import Dense
-from keras.layers import Flatten
-from keras.layers import Embedding
-from keras.layers.convolutional import Conv1D
-from keras.layers.convolutional import MaxPooling1D
-from keras import metrics
-from keras import optimizers
-import pickle
-
-def clean_doc(doc, vocab):
-    doc = doc.lower()
-    # split into tokens by white space
-    tokens = doc.split()
-    # remove punctuation from each word
-    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
-    tokens = [re_punc.sub('' , w) for w in tokens]
-    # filter out short tokens
-    tokens = [word for word in tokens if len(word) > 1]
-    # filter out stop words
-    stop_words = set(stopwords.words('english'))
-    tokens = [w for w in tokens if not w in stop_words]
-    # stemming of words
-    porter = PorterStemmer()
-    stemmed = [porter.stem(word) for word in tokens]
-    #print(stemmed[:100])
-    return tokens
-
-# load tokenizer
-with open('./nlp/tokenizer.pickle', 'rb') as handle:
-    tokenizer = pickle.load(handle)
-
-# load vocabulary
-with open('./nlp/vocabulary.txt', 'r') as vocab:
-    vocab = vocab.read()
-
-# create the CNN model
-#def create_model(vocab_size, max_length):
-model = Sequential()
-model.add(Embedding(vocab_size, 32, input_length=max_length))
-model.add(Conv1D(filters=16, kernel_size=4, activation='relu'))
-model.add(MaxPooling1D(pool_size=2))
-model.add(Flatten())
-model.add(Dense(10, activation='relu'))
-model.add(Dense(1, activation='sigmoid'))
-opt = keras.optimizers.Adamax(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
-model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[keras.metrics.AUC()])
-model = create_model(23154, 64)
-# load the weights
-## this is done for every prediction??
-checkpoint_path = "./nlp/weights.ckpt"
-model.load_weights(checkpoint_path)
-
-#return model
-
-def predict_sent(sent_for_pred):
-    max_length = 64
-    tokens = clean_doc(sent_for_pred, vocab)
-    tokens = [w for w in tokens if w in vocab]
-    # convert to line
-    line = ' '.join(tokens)
-    line = [line]
-    tokenized_sent = tokenizer.texts_to_sequences(line)
-    tokenized_sent = pad_sequences(tokenized_sent, maxlen=max_length, padding='post')
-    predict_sent = model.predict(tokenized_sent, verbose=0)
-    percent_sent = predict_sent[0,0]
-    if round(percent_sent) == 0:
-        return 'neg'
-    else:
-        return 'pos'
```
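For context, the removed module built the CNN at import time while referencing `vocab_size` and `max_length` that were never defined at that scope, and then called a `create_model()` whose definition was commented out. The sketch below shows one way that construction could be made self-contained; the `tensorflow.keras` import paths, the wrapper function, and the `__main__` guard are assumptions for illustration, not part of this commit.

```python
# Hypothetical sketch: wraps the removed model-building code in the
# create_model(vocab_size, max_length) function that nlp.py still called.
# Import paths assume TF 2.x-era tensorflow.keras; the deleted file used
# standalone keras imports.
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Conv1D, MaxPooling1D


def create_model(vocab_size, max_length):
    """Build the small CNN text classifier the removed module defined inline."""
    model = Sequential()
    model.add(Embedding(vocab_size, 32, input_length=max_length))
    model.add(Conv1D(filters=16, kernel_size=4, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    opt = keras.optimizers.Adamax(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
    model.compile(loss='binary_crossentropy', optimizer=opt,
                  metrics=[keras.metrics.AUC()])
    return model


if __name__ == '__main__':
    # Same vocabulary size and sequence length as the removed call site.
    model = create_model(23154, 64)
    # Expects the checkpoint shipped with the repository (path from the removed file).
    model.load_weights('./nlp/weights.ckpt')
```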