# Prediction for the Main Issues Data Set
import csv
from tensorflow.keras.preprocessing import text
from nltk.corpus import gutenberg
from string import punctuation
from tensorflow.keras.preprocessing.sequence import skipgrams
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
pd.options.display.max_colwidth = 200
%matplotlib inline
from nltk.stem.snowball import SnowballStemmer
englishStemmer = SnowballStemmer("english")
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Dot, Input, Dense, Reshape, LSTM, Conv2D, Flatten, MaxPooling1D, Dropout, MaxPooling2D
from tensorflow.keras.layers import Embedding, Multiply, Subtract
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Lambda
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping
#from IPython.display import SVG
#from keras.utils.vis_utils import model_to_dot
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.manifold import TSNE
from utils.read_data import Dynamic_Dataset, Processing_Dataset
from utils.vectorize_sentence import Embeddings
path = "../data/augmented_dataset/"
process_unit = Processing_Dataset(path)
ground_truth = process_unit.get_ground_truth()
dataset = Dynamic_Dataset(ground_truth, path, False)
test, train = process_unit.get_test_and_training(ground_truth, isZip=True)
#for elem in test:
# print(elem[0])
nltk.download('stopwords')
embeddings = Embeddings()
max_words = 5000 #<------- [Parameter]
pre_corpora_train = [doc for doc in train if len(doc[1]) < max_words]
pre_corpora_test = [doc for doc in test if len(doc[1]) < max_words]
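# Note: doc[1] is only decoded from bytes further below, so this filter
# appears to count bytes/characters rather than words, despite the max_words name.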
print(len(pre_corpora_train))
print(len(pre_corpora_test))
embed_path = '../data/word_embeddings-embed_size_100-epochs_100.csv'
embeddings_dict = embeddings.get_embeddings_dict(embed_path)
corpora_train = [embeddings.vectorize(doc[1].decode("utf-8"), embeddings_dict) for doc in pre_corpora_train] #vectorization: inputs
corpora_test = [embeddings.vectorize(doc[1].decode("utf-8"), embeddings_dict) for doc in pre_corpora_test] #vectorization: inputs
target_train = [[int(list(doc[0])[1]), int(list(doc[0])[3])] for doc in pre_corpora_train] #vectorization: outputs
target_test = [[int(list(doc[0])[1]), int(list(doc[0])[3])] for doc in pre_corpora_test] #vectorization: outputs
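# Assumption: doc[0] is a label string whose characters at indices 1 and 3 are
# the two class digits, e.g. '[1 0]' (or '(1,0)') -> [1, 0].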
#target_train
max_len_sentences_train = max([len(doc) for doc in corpora_train]) #<------- [Parameter]
max_len_sentences_test = max([len(doc) for doc in corpora_test]) #<------- [Parameter]
max_len_sentences = max(max_len_sentences_train, max_len_sentences_test)
print("Max. Sentence # words:", max_len_sentences)
min_len_sentences_train = min([len(doc) for doc in corpora_train]) #<------- [Parameter]
min_len_sentences_test = min([len(doc) for doc in corpora_test]) #<------- [Parameter]
min_len_sentences = min(min_len_sentences_train, min_len_sentences_test)
print("Min. Sentence # words:", min_len_sentences)
embed_size = np.size(corpora_train[0][0])
embeddings_cols = embed_size
input_sh = (max_len_sentences, embeddings_cols, 1)
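# Each document becomes a single-channel "image" of shape
# (max_len_sentences, embeddings_cols, 1), ready for the Conv2D stack below.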
#Selecting filters?
#https://stackoverflow.com/questions/48243360/how-to-determine-the-filter-parameter-in-the-keras-conv2d-function
#https://stats.stackexchange.com/questions/196646/what-is-the-significance-of-the-number-of-convolution-filters-in-a-convolutional
N_filters = 128 # <-------- [HyperParameter] Power of 2; number of feature maps
K = 2 # <-------- [HyperParameter] Number of Classess
input_sh
gram_input = Input(shape = input_sh)
conv_1_layer = Conv2D(filters=32, input_shape=input_sh, activation='relu',
                      kernel_size=(7, embeddings_cols), padding='valid')(gram_input)
conv_1_layer.shape
max_1_pooling = MaxPooling2D(pool_size=((max_len_sentences-7+1),1), strides=None, padding='valid')(conv_1_layer)
max_1_pooling.shape
fully_connected_1_gram = Flatten()(max_1_pooling)
fully_connected_1_gram.shape
fully_connected_1_gram = Reshape((32, 1, 1))(fully_connected_1_gram)
fully_connected_1_gram.shape
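# The Flatten + Reshape pair above turns the 32 pooled feature maps into a
# (32, 1, 1) single-channel "column" so the next Conv2D can convolve across
# features, emulating a stacked 1-D convolution.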
conv_2_layer = Conv2D(filters=64, kernel_size=(5,1), activation='relu',
                      padding='valid')(fully_connected_1_gram)
conv_2_layer.shape
max_2_pooling = MaxPooling2D(pool_size=((32-5+1),1), strides=None, padding='valid')(conv_2_layer)
max_2_pooling.shape
fully_connected_2_gram = Flatten()(max_2_pooling)
fully_connected_2_gram.shape
fully_connected_2_gram = Reshape((64, 1, 1))(fully_connected_2_gram)
fully_connected_2_gram.shape
conv_3_layer = Conv2D(filters=128, kernel_size=(3,1), activation='relu',
                      padding='valid')(fully_connected_2_gram)
conv_3_layer.shape
conv_4_layer = Conv2D(filters=128, kernel_size=(3,1), activation='relu',
                      padding='valid')(conv_3_layer)
conv_4_layer.shape
conv_5_layer = Conv2D(filters=64, kernel_size=(3,1), activation='relu',
                      padding='valid')(conv_4_layer)
conv_5_layer.shape
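# 58 below is the height remaining after the three (3,1) valid convolutions on
# the (64, 1, 1) reshape: 64 - 2 - 2 - 2 = 58, so this pooling collapses each
# of the 64 feature maps to a single value.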
max_5_pooling = MaxPooling2D(pool_size=(58,1), strides=None, padding='valid')(conv_5_layer)
max_5_pooling.shape
fully_connected = Flatten()(max_5_pooling)
fully_connected.shape
deep_dense_1_layer = Dense(32, activation='relu')(fully_connected)
deep_dense_1_layer = Dropout(0.2)(deep_dense_1_layer) # <-------- [HyperParameter]
deep_dense_1_layer.shape
deep_dense_2_layer = Dense(32, activation='relu')(deep_dense_1_layer)
deep_dense_2_layer = Dropout(0.2)(deep_dense_2_layer) # <-------- [HyperParameter]
deep_dense_2_layer.shape
deep_dense_3_layer = Dense(16, activation='relu')(deep_dense_2_layer)
deep_dense_3_layer = Dropout(0.2)(deep_dense_3_layer) # <-------- [HyperParameter]
deep_dense_3_layer.shape
predictions = Dense(K, activation='softmax')(deep_dense_3_layer)
criticality_network = Model(inputs=[gram_input],outputs=[predictions])
print(criticality_network.summary())
criticality_network.compile(optimizer='adam', loss='binary_crossentropy',
                            metrics=['accuracy'])
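# Note: with a 2-unit softmax and two-column targets, categorical_crossentropy
# is the more conventional pairing; binary_crossentropy instead averages the
# per-column binary losses.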
from tempfile import mkdtemp
import os.path as osp  # aliased so the `path` data-directory variable above is not shadowed
file_corpora_train_x = osp.join(mkdtemp(), 'alex-res-adapted-003_temp_corpora_train_x.dat') #Update per experiment
file_corpora_test_x = osp.join(mkdtemp(), 'alex-res-adapted-003_temp_corpora_test_x.dat')
shape_train_x = (len(corpora_train), max_len_sentences, embeddings_cols, 1)
shape_test_x = (len(corpora_test), max_len_sentences, embeddings_cols, 1)
corpora_train_x = np.memmap(
    filename=file_corpora_train_x,
    dtype='float32',
    mode='w+',
    shape=shape_train_x)
corpora_test_x = np.memmap(  #Test Corpora (for future evaluation)
    filename=file_corpora_test_x,
    dtype='float32',
    mode='w+',
    shape=shape_test_x)
target_train_y = np.array(target_train) #Train Target
target_test_y = np.array(target_test) #Test Target (for future evaluation)
corpora_train_x.shape
target_train_y.shape
corpora_test_x.shape
target_test_y.shape
for doc in range(len(corpora_train)):
    #print(corpora_train[doc].shape[1])
    for words_rows in range(corpora_train[doc].shape[0]):
        embed_flatten = np.array(corpora_train[doc][words_rows]).flatten() #<--- capture doc and word
        for embedding_cols in range(embed_flatten.shape[0]):
            corpora_train_x[doc, words_rows, embedding_cols, 0] = embed_flatten[embedding_cols]
for doc in range(len(corpora_test)):
    for words_rows in range(corpora_test[doc].shape[0]):
        embed_flatten = np.array(corpora_test[doc][words_rows]).flatten() #<--- capture doc and word
        for embedding_cols in range(embed_flatten.shape[0]):
            corpora_test_x[doc, words_rows, embedding_cols, 0] = embed_flatten[embedding_cols]
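# A vectorized alternative to the element-wise copy loops above (a sketch; it
# assumes every word entry flattens to an embeddings_cols-long vector, as the
# loops do):
def fill_memmap(corpus, memmap_x):
    for i, doc_words in enumerate(corpus):
        mat = np.asarray([np.asarray(w).flatten() for w in doc_words], dtype='float32')
        memmap_x[i, :mat.shape[0], :mat.shape[1], 0] = mat
# e.g. fill_memmap(corpora_train, corpora_train_x) would be equivalent to the
# first loop, with shorter documents left zero-padded.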
#csv_logger = CSVLogger(system+'_training.log')
# filepath changed from "alex-adapted-res-003/best_model.hdf5" for testing:
# the folder alex-adapted-res-003 does not exist yet in the repository, so RC
# manually created 08_test in the root folder.
filepath = "../08_test/best_model.hdf5"
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=100)
mc = ModelCheckpoint(filepath, monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)
callbacks_list = [es,mc]
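# Note: EarlyStopping also accepts restore_best_weights=True, which would keep
# the best weights in memory; the ModelCheckpoint file above remains useful for
# reloading the model later.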
history = criticality_network.fit(
    x=corpora_train_x,
    y=target_train_y,
    #batch_size=64,
    epochs=2000, #5 <------ [HyperParameter]
    validation_split=0.2,
    callbacks=callbacks_list
)
#Saving Training History
df_history = pd.DataFrame.from_dict(history.history)
df_history.to_csv('../08_test/history_training.csv', encoding='utf-8', index=False)
# Save the final weights under a separate name so they do not overwrite the
# best-model checkpoint that ModelCheckpoint wrote to `filepath`.
criticality_network.save('../08_test/final_model.hdf5')
df_history.head()
#Saving Test Data (paths changed from 'alex-adapted-res-003/...' for testing)
np.save('../08_test/corpora_test_x.npy', corpora_test_x)
np.save('../08_test/target_test_y.npy', target_test_y)
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs2 = range(len(acc))
plt.plot(epochs2, acc, 'b', label='Training')
plt.plot(epochs2, val_acc, 'r', label='Validation')
plt.title('Training and validation accuracy')
plt.ylabel('acc')
plt.xlabel('epoch')
plt.legend()
plt.figure()
plt.plot(epochs2, loss, 'b', label='Training')
plt.plot(epochs2, val_loss, 'r', label='Validation')
plt.title('Training and validation loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend()
plt.show()
from sklearn.metrics import average_precision_score,precision_recall_curve
# funcsigs provides signature(), replacing sklearn.utils.fixes.signature,
# which newer scikit-learn versions no longer ship.
from funcsigs import signature
#from sklearn.utils.fixes import signature
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from tensorflow.keras.models import load_model
model_path = '../08_test/best_model.hdf5'
criticality_network_load = load_model(model_path) #<----- The Model
score = criticality_network_load.evaluate(corpora_test_x, target_test_y, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
history_predict = criticality_network_load.predict(x=corpora_test_x)
history_predict
inferred_data = pd.DataFrame(history_predict,columns=list('AB'))
target_data = pd.DataFrame(target_test_y,columns=list('LN'))
data = target_data.join(inferred_data)
y_true = list(data['L'])
y_score = list(data['A'])
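# Assumption: column 'A' is the softmax probability of the first class and 'L'
# its ground-truth label, so the scores below treat that class as positive.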
average_precision = average_precision_score(y_true, y_score)
print('Average precision-recall score: {0:0.2f}'.format(average_precision))
auc = roc_auc_score(y_true, y_score)
print('AUC: %.3f' % auc)
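# The precision_recall_curve and roc_curve imports above are unused so far; a
# minimal sketch of plotting both curves, in the style of the classic sklearn
# example (the signature check makes fill_between's step argument optional):
precision, recall, _ = precision_recall_curve(y_true, y_score)
step_kwargs = ({'step': 'post'}
               if 'step' in signature(plt.fill_between).parameters
               else {})
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve: AP={0:0.2f}'.format(average_precision))
plt.figure()
fpr, tpr, _ = roc_curve(y_true, y_score)
plt.plot(fpr, tpr, 'b', label='ROC (AUC = %.3f)' % auc)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend()
plt.show()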