path: root/nlp
author    hakangunturkun    2020-09-07 22:35:17 -0500
committer hakangunturkun    2020-09-07 22:35:17 -0500
commit    a637889f13c2151303998e411dc81f3196a974f6 (patch)
tree      f593bb635e759889109504405d2abc3e31af9008 /nlp
parent    35179dec9aa1926102ad2ddbbd5b8ad2882be92e (diff)
download  genecup-a637889f13c2151303998e411dc81f3196a974f6.tar.gz
last version
Diffstat (limited to 'nlp')
-rw-r--r--  nlp/RatsPub_CNN.py          167
-rw-r--r--  nlp/RatsPub_CNN_predict.py  181
2 files changed, 348 insertions, 0 deletions
diff --git a/nlp/RatsPub_CNN.py b/nlp/RatsPub_CNN.py
new file mode 100644
index 0000000..432d749
--- /dev/null
+++ b/nlp/RatsPub_CNN.py
@@ -0,0 +1,167 @@
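+# RatsPub_CNN.py: trains a Conv1D sentence classifier on the labelled sentences in
+# sentences/yes_10000 and sentences/no_10000, builds and saves the vocabulary and the
+# Keras tokenizer, checkpoints the weights after every epoch, and plots AUC and loss.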
+import string
+import re
+import os
+from os import listdir
+import matplotlib.pyplot as plt
+from nltk.corpus import stopwords
+from nltk.stem.porter import PorterStemmer
+from collections import Counter
+import numpy as np
+from numpy import array
+import tensorflow as tf
+import keras
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.utils import plot_model
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Dense, Flatten, Dropout, Embedding, Conv1D, MaxPooling1D
+from tensorflow.keras.preprocessing.text import text_to_word_sequence
+from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
+from tensorflow.keras import metrics, optimizers
+import pickle
+
+def load_doc(filename):
+ file = open(filename, 'r')
+ text = file.read()
+ file.close()
+ return text
+
+def clean_doc(doc, vocab):
+ #tokens = text_to_word_sequence(doc, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',lower=True, split=' ')
+ doc = doc.lower()
+ # split into tokens by white space
+ tokens = doc.split()
+ # remove punctuation from each word
+ re_punc = re.compile('[%s]' % re.escape(string.punctuation))
+ tokens = [re_punc.sub('' , w) for w in tokens]
+ # filter out short tokens
+ tokens = [word for word in tokens if len(word) > 1]
+ # filter out stop words
+ stop_words = set(stopwords.words('english'))
+ tokens = [w for w in tokens if not w in stop_words]
+    # stemming of words (note: the stemmed tokens are computed but never used;
+    # the unstemmed tokens are what gets returned)
+    porter = PorterStemmer()
+    stemmed = [porter.stem(word) for word in tokens]
+    return tokens
+
+# load all docs in a directory
+def train_valid(directory, vocab, is_train):
+ documents = list()
+ for filename in listdir(directory):
+ if is_train and (filename.endswith('1.txt') or filename.endswith('2.txt')):
+ continue
+ if not is_train and not (filename.endswith('1.txt') or filename.endswith('2.txt')):
+ continue
+ path = directory + '/' + filename
+ doc = load_doc(path)
+ tokens_train_valid = clean_doc(doc, vocab)
+ tokens_train_valid = [w for w in tokens_train_valid if w in vocab]
+ tokens_train_valid = ' '.join(tokens_train_valid)
+ documents.append(tokens_train_valid)
+ return documents
+
+def add_doc_to_vocab(filename, vocab):
+ doc = load_doc(filename)
+ tokens = clean_doc(doc, vocab)
+ vocab.update(tokens)
+
+def form_vocabulary(directory, vocab):
+    for filename in listdir(directory):
+        # skip anything that is not a .txt file
+        if not filename.endswith(".txt"):
+            continue
+        path = directory + '/' + filename
+        add_doc_to_vocab(path, vocab)
+
+def load_dataset(vocab, is_train):
+ neg = train_valid('sentences/no_10000', vocab, is_train)
+ pos = train_valid('sentences/yes_10000', vocab, is_train)
+ docs = neg + pos
+ labels = array([0 for _ in range(len(neg))] + [1 for _ in range(len(pos))])
+ return docs, labels
+
+def tokenize_data(train_docs, valid_docs, maxlen):
+    # create the tokenizer
+    tokenizer = Tokenizer()
+    # fit the tokenizer on the vocabulary (each vocabulary word is treated as a text)
+    tokenizer.fit_on_texts(vocab)
+    # encode the training set
+    Xtrain = tokenizer.texts_to_sequences(train_docs)
+    Xtrain = pad_sequences(Xtrain, maxlen=maxlen, padding='post')
+    # encode the validation set
+    Xvalid = tokenizer.texts_to_sequences(valid_docs)
+    Xvalid = pad_sequences(Xvalid, maxlen=maxlen, padding='post')
+    return Xtrain, Xvalid, tokenizer
+
+def save_list(lines, filename):
+ data = '\n'.join(lines)
+ file = open(filename, 'w')
+ file.write(data)
+ file.close()
+
+vocab = Counter()
+# add all docs to vocab
+form_vocabulary('sentences/no_10000', vocab)
+form_vocabulary('sentences/yes_10000', vocab)
+save_list(vocab, 'vocab.txt')
+
+# load the vocabulary
+vocab_filename = 'vocab.txt'
+vocab = load_doc(vocab_filename)
+vocab = set(vocab.split())
+save_list(vocab, 'vocab_last.txt')
+# load training and validation data
+train_docs, ytrain = load_dataset(vocab, True)
+valid_docs, yvalid = load_dataset(vocab, False)
+max_length = max([len(s.split()) for s in train_docs])
+print("Maximum length:", max_length)
+Xtrain, Xvalid, tokenizer = tokenize_data(train_docs, valid_docs, max_length)
+
+# saving
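+# (the pickled tokenizer is reloaded by RatsPub_CNN_predict.py so that prediction
+# uses the same word-to-index mapping as training)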
+with open('tokenizer.pickle', 'wb') as handle:
+ pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
+
+print("Vocabulary size:", len(vocab))
+
+def create_model(vocab_size, max_length):
+ model = Sequential()
+ model.add(Embedding(vocab_size, 32, input_length=max_length))
+ model.add(Conv1D(filters=16, kernel_size=4, activation='relu'))
+ model.add(MaxPooling1D(pool_size=2))
+ model.add(Flatten())
+ model.add(Dense(10, activation='relu'))
+ model.add(Dense(1, activation='sigmoid'))
+ opt = tf.keras.optimizers.Adamax(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
+ model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[tf.keras.metrics.AUC()])
+# model.summary()
+ return model
+
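+# vocab_size is len(vocab)+1 because Keras Tokenizer word indices start at 1
+# and index 0 is reserved for padding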
+model = create_model(len(vocab)+1, max_length)
+
+checkpoint_path = "training/cp-{epoch:04d}.ckpt"
+checkpoint_dir = os.path.dirname(checkpoint_path)
+
+# Create a callback that saves the model's weights
+cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, save_weights_only=True, verbose=1)
+
+# Train the model with the new callback
+model_fit = model.fit(Xtrain, ytrain, epochs=20, batch_size=64, validation_data=(Xvalid, yvalid), callbacks=[cp_callback])
+
+# Plot training & validation AUC values (the model is compiled with an AUC metric)
+plt.plot(model_fit.history['auc'])
+plt.plot(model_fit.history['val_auc'])
+plt.title('Model AUC')
+plt.ylabel('AUC')
+plt.xlabel('Epoch')
+plt.legend(['Train', 'Validation'], loc='lower right')
+plt.savefig('model_accuracy.png')
+plt.show()
+
+# Plot training & validation loss values
+plt.plot(model_fit.history['loss'])
+plt.plot(model_fit.history['val_loss'])
+plt.title('Model loss')
+plt.ylabel('Loss')
+plt.xlabel('Epoch')
+plt.legend(['Train', 'Validation'], loc='upper right')
+plt.savefig('model_loss.png')
+plt.show()
diff --git a/nlp/RatsPub_CNN_predict.py b/nlp/RatsPub_CNN_predict.py
new file mode 100644
index 0000000..9b6a206
--- /dev/null
+++ b/nlp/RatsPub_CNN_predict.py
@@ -0,0 +1,181 @@
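+# RatsPub_CNN_predict.py: reloads the saved tokenizer, vocabulary, and checkpointed
+# weights, scores the held-out yes/no sentences (indices 30000-34999), reports
+# per-class errors, the confusion matrix, and ROC AUC, and saves a histogram of the
+# predicted probabilities.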
+import string
+import re
+import os
+from os import listdir
+import matplotlib.pyplot as plt
+from nltk.corpus import stopwords
+from nltk.stem.porter import PorterStemmer
+from collections import Counter
+import numpy as np
+from numpy import array
+import sklearn
+from sklearn import metrics
+from sklearn.metrics import confusion_matrix
+import tensorflow as tf
+import keras
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.utils import plot_model
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Dense, Flatten, Dropout, Embedding, Conv1D, MaxPooling1D
+from tensorflow.keras.preprocessing.text import text_to_word_sequence
+from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
+from tensorflow.keras import optimizers
+import pickle
+
+def clean_doc(doc, vocab):
+ doc = doc.lower()
+ # split into tokens by white space
+ tokens = doc.split()
+ # remove punctuation from each word
+ re_punc = re.compile('[%s]' % re.escape(string.punctuation))
+ tokens = [re_punc.sub('' , w) for w in tokens]
+ # filter out short tokens
+ tokens = [word for word in tokens if len(word) > 1]
+ # filter out stop words
+ stop_words = set(stopwords.words('english'))
+ tokens = [w for w in tokens if not w in stop_words]
+    # stemming of words (computed but never used; the unstemmed tokens are returned)
+    porter = PorterStemmer()
+    stemmed = [porter.stem(word) for word in tokens]
+    return tokens
+
+# load the tokenizer and vocabulary saved during training
+with open('./nlp/tokenizer.pickle', 'rb') as handle:
+    tokenizer = pickle.load(handle)
+with open('./nlp/vocabulary.txt', 'r') as vocab:
+    vocab = vocab.read()
+
+print("Vocabulary size:", len(vocab.split()))
+
+def create_model(vocab_size, max_length):
+ model = Sequential()
+ model.add(Embedding(vocab_size, 32, input_length=max_length))
+ model.add(Conv1D(filters=16, kernel_size=4, activation='relu'))
+ model.add(MaxPooling1D(pool_size=2))
+ model.add(Flatten())
+ model.add(Dense(10, activation='relu'))
+ model.add(Dense(1, activation='sigmoid'))
+ opt = tf.keras.optimizers.Adamax(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
+ model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[tf.keras.metrics.AUC()])
+ #plot_model(model, to_file='model.png', show_shapes=True)
+ return model
+
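+# the architecture and the (vocab_size, max_length) arguments must match the training
+# run exactly for load_weights() to succeed; 23154 and 64 are assumed to be the
+# vocabulary size and maximum sentence length from training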
+model = create_model(23154, 64)
+model.summary()
+
+checkpoint_path = "./nlp/weights.ckpt"
+model.load_weights(checkpoint_path)
+
+
+err = 0        # misclassified sentences
+pr = 0         # low-confidence predictions (score between 0.1 and 0.9)
+total = 0      # sentences scored
+max_length = 64
+pos_list = []  # predicted scores for the positive (yes) sentences
+neg_list = []  # predicted scores for the negative (no) sentences
+for k in range(30000, 35000):
+    file_name = "./sentences/yes_all/yes_" + str(k) + ".txt"
+    try:
+        with open(file_name, "r") as file:
+            sent = file.readline()
+        tokens = clean_doc(sent, vocab)
+        tokens = [w for w in tokens if w in vocab]
+        line = [' '.join(tokens)]
+        Xtest_ex = tokenizer.texts_to_sequences(line)
+        Xtest_ex = pad_sequences(Xtest_ex, maxlen=max_length, padding='post')
+        yhat_pos = model.predict(Xtest_ex, verbose=0)
+        percent_pos = yhat_pos[0, 0]
+        pos_list.append(percent_pos)
+        total = total + 1
+        if round(percent_pos) == 0:    # positive sentence predicted as negative
+            err = err + 1
+        if 0.1 < percent_pos < 0.9:    # low-confidence prediction
+            pr = pr + 1
+    except FileNotFoundError:
+        pass
+
+for t in range(30000, 35000):
+    file_name = "./sentences/no_all/no_" + str(t) + ".txt"
+    try:
+        with open(file_name, "r") as file:
+            sent = file.readline()
+        tokens = clean_doc(sent, vocab)
+        tokens = [w for w in tokens if w in vocab]
+        line = [' '.join(tokens)]
+        Xtest_ex = tokenizer.texts_to_sequences(line)
+        Xtest_ex = pad_sequences(Xtest_ex, maxlen=max_length, padding='post')
+        yhat_neg = model.predict(Xtest_ex, verbose=0)
+        percent_pos = yhat_neg[0, 0]
+        neg_list.append(percent_pos)
+        total = total + 1
+        if round(percent_pos) == 1:    # negative sentence predicted as positive
+            err = err + 1
+        if 0.1 < percent_pos < 0.9:    # low-confidence prediction
+            pr = pr + 1
+    except FileNotFoundError:
+        pass
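+# err, pr, and total are tallied above for inspection but are not reported below;
+# the per-class error counts are recomputed from pos_list and neg_list instead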
+
+err_pos = 0
+for i in range(len(pos_list)):
+    if pos_list[i] < 0.5:
+        err_pos += 1
+print("Errors for the system stress class:", err_pos)
+
+err_neg = 0
+for i in range(len(neg_list)):
+    if neg_list[i] >= 0.5:
+        err_neg += 1
+print("Errors for the cellular stress class:", err_neg)
+print("Overall error rate:", (err_pos + err_neg) / (len(pos_list) + len(neg_list)))
+
+# threshold the scores at 0.5 to get predicted class labels (1 = system stress)
+pos_list_int = [1 if p >= 0.5 else 0 for p in pos_list]
+neg_list_int = [1 if p >= 0.5 else 0 for p in neg_list]
+
+y_true = [1] * len(pos_list_int) + [0] * len(neg_list_int)
+y_pred_int = pos_list_int + neg_list_int
+print(confusion_matrix(y_true, y_pred_int))
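+# in scikit-learn's confusion matrix, rows are the true labels and columns are the
+# predicted labels, so the diagonal holds the correctly classified sentence counts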
+
+pos_list_np = np.array(pos_list)
+neg_list_np = np.array(neg_list)
+
+y_pred_np = np.array(pos_list+neg_list)
+y_true_np = np.array(y_true)
+
+
+fpr, tpr, thresholds = metrics.roc_curve(y_true_np, y_pred_np, pos_label=1)
+print("AUC from roc_curve:", metrics.auc(fpr, tpr))
+print("roc_auc_score:", sklearn.metrics.roc_auc_score(y_true_np, y_pred_np))
+
+plt.gcf().subplots_adjust(bottom=0.15)
+data1 = pos_list_np
+data2 = neg_list_np
+bins = np.arange(0, 1+1e-8, 0.1)
+plt.hist(data1, bins=bins, alpha=0.5, color='red')
+plt.hist(data2, bins=bins, alpha=0.5, color='blue')
+plt.xlabel("predicted probabilities\nblue: CS (cellular stress), red: SS (system stress)")
+plt.ylabel('number of sentences')
+plt.savefig('stress_pred.png', dpi=300)
+plt.show()