#Generates the embeddings used to train securereqnet models
import csv
#Implementing the Skip-Gram Model
from tensorflow.keras.preprocessing import text
from nltk.corpus import gutenberg
from string import punctuation
from tensorflow.keras.preprocessing.sequence import skipgrams
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
pd.options.display.max_colwidth = 200
%matplotlib inline
from nltk.stem.snowball import SnowballStemmer
englishStemmer=SnowballStemmer("english")
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Dot, Input, Dense, Reshape, LSTM, Conv2D, Flatten, MaxPooling1D, Dropout, MaxPooling2D
from tensorflow.keras.layers import Embedding, Multiply, Subtract
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Lambda
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint
from IPython.display import SVG
from tensorflow.keras.utils import model_to_dot
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.manifold import TSNE
# Preprocessing Part 0
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')
remove_terms = punctuation + '0123456789'

def normalize_document(doc):
    # lower case and remove special characters/whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # apply stemming
    filtered_tokens = [englishStemmer.stem(token) for token in filtered_tokens]
    # drop remove-terms and very short tokens
    filtered_tokens = [token for token in filtered_tokens if token not in remove_terms and len(token) > 2]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)
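# A quick, illustrative sanity check of the normalizer. The sample string below
# is a made-up, CVE-style description, not a row from the dataset. (If the NLTK
# stopword list is missing, nltk.download('stopwords') may be needed first.)
sample_doc = "Buffer overflow in the XYZ 2.0 parser allows remote attackers to execute arbitrary code."
print(normalize_document(sample_doc))  # prints the lower-cased, stemmed, stopword-free version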
remove_terms
stop_words
filename = '../data/cve/cve_dataset.tsv'
data = []
with open(filename, 'r') as tsv_file:
	tsv_reader = csv.reader(tsv_file, delimiter='\t')
	for line in tsv_reader:
		data.append((line[1], line[2]))

for d in data:
	print(d)
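# Note: the reader above assumes each row of cve_dataset.tsv has at least three
# tab-separated fields; only the second and third (line[1] and line[2]) are kept,
# and the third, assumed to be the free-text CVE description, becomes the corpus
# below. The exact column semantics are an assumption about the dataset layout.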
data[0][1]
# Build the corpus of CVE descriptions and normalize it
corpora = [sentence[1] for sentence in data]
norm_corpora = [sentence.split(' ') for sentence in corpora]
norm_corpora = [[word.lower() for word in sent if word not in remove_terms] 
                for sent in norm_corpora]
norm_corpora = [' '.join(tok_sent) for tok_sent in norm_corpora]
norm_corpora = filter(None, normalize_corpus(norm_corpora))
norm_corpora = [tok_sent for tok_sent in norm_corpora if len(tok_sent.split()) > 2] #Len of the sentence
norm_corpora
corpora
tokenizer_corpora = text.Tokenizer()
tokenizer_corpora.fit_on_texts(norm_corpora)
word2id = tokenizer_corpora.word_index
id2word = {v:k for k, v in word2id.items()}
word2id
vocab_size = len(word2id) + 1 
embed_size = 20 # <-------- [HyperParameter]
print('Vocabulary Size Source:', vocab_size)
print('Vocabulary Sample Source:', list(word2id.items())[:10])
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in norm_corpora] #Vector of IDs of words per sentence
# generate skip-grams
#Window SIZE!
w_size = 10 # <-------- [HyperParameter]
skip_grams = [skipgrams(wid, vocabulary_size=vocab_size, window_size=w_size) for wid in wids]
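# skipgrams() emits true (target, context) pairs labeled 1 plus an equal number
# of randomly drawn negative pairs labeled 0 (negative_samples defaults to 1.0),
# which is what the binary label printed below reflects.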
pairs, labels = skip_grams[0][0], skip_grams[0][1]
for i in range(10):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
          id2word[pairs[i][0]], pairs[i][0], 
          id2word[pairs[i][1]], pairs[i][1], 
          labels[i]))
# The functional API version of the skip-gram model
# Each input receives a single integer word id (between 1 and vocab_size - 1)
word_input = Input(shape=(1,))

# This embedding layer encodes each input word id
# into a dense embed_size-dimensional vector.
x_word = Embedding(vocab_size, embed_size,embeddings_initializer="glorot_uniform",input_length=1)(word_input)
x_word = Reshape((embed_size, ))(x_word)

context_input = Input(shape=(1,))

x_context = Embedding(vocab_size, embed_size,embeddings_initializer="glorot_uniform",input_length=1)(context_input)
x_context = Reshape((embed_size, ))(x_context)

x = Dot(axes=-1,normalize=True)([x_word, x_context])
x = Dense(1,kernel_initializer="glorot_uniform", activation="sigmoid")(x)
model = Model(inputs=[word_input,context_input], outputs=[x])
model.compile(loss="mean_squared_error", optimizer="rmsprop")
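# Note: mean_squared_error against the 0/1 skip-gram labels follows the original
# setup; binary_crossentropy is a common alternative, since the sigmoid output
# can be read as a relevance probability.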
print(model.summary())
SVG(model_to_dot(model, show_shapes=True, show_layer_names=False, 
                 rankdir='TB').create(prog='dot', format='svg'))
for epoch in range(1, 6):
    loss = 0
    for i, elem in enumerate(skip_grams):
        pair_first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
        pair_second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32')
        labels = np.array(elem[1], dtype='int32')
        X = [pair_first_elem, pair_second_elem]
        Y = labels
        if i % 10000 == 0:
            print('Processed Source {} (skip_first, skip_second, relevance) pairs'.format(i))
        loss += model.train_on_batch(X,Y)  
    print('Epoch:', epoch, 'Loss:', loss)
weights = model.layers[2].get_weights()[0][1:]  # first Embedding layer (assumed word-side); drop row 0, the unused padding index
df_embeddings = pd.DataFrame(weights, index=id2word.values())
df_embeddings.head(20)
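# Optionally persist the learned embeddings for downstream securereqnet training.
# The output path below is an assumption and may need to be adapted:
# df_embeddings.to_csv('../data/embeddings/cve_skipgram_embeddings.csv')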
df_embedding_trans = df_embeddings.T
df_embedding_trans.head()
np.array(df_embedding_trans['vulner'])
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:4]+1] 
                   for search_term in ['vulner', 'attack', 'window', 'via', 'remot', 'code', 'user','exploit']}
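# The -1/+1 offsets account for the fact that `weights` drops row 0 (the padding
# index), so row i of distance_matrix corresponds to word id i + 1.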
len(distance_matrix)
similar_words
words = sum([[k] + v for k, v in similar_words.items()], [])
words_ids = [word2id[w] for w in words]
word_vectors = np.array([weights[idx] for idx in words_ids])
tsne = TSNE(n_components=2, random_state=0, n_iter=10000, perplexity=3)
np.set_printoptions(suppress=True)
T = tsne.fit_transform(word_vectors)
labels = words
plt.figure(figsize=(8, 8))
plt.scatter(T[:, 0], T[:, 1], c='steelblue', edgecolors='k')
for label, x, y in zip(labels, T[:, 0], T[:, 1]):
    plt.annotate(label, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points')
print('Total words:', len(words), '\tWord Embedding shapes:', word_vectors.shape)
max_len_sentences = max([len(wpt.tokenize(doc)) for doc in norm_corpora]) #<------- [Parameter]
print("Max. Sentence # words:",max_len_sentences)
corpora_tensor = [[np.array(df_embedding_trans[word_]) for word_ in wpt.tokenize(doc) if word_ not in remove_terms] 
                  for doc in norm_corpora]
len(wpt.tokenize(norm_corpora[0]))
words_rows = max_len_sentences
embeddigs_cols = embed_size
input_sh = (words_rows,embeddigs_cols,1)
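# The Conv2D branches below expect every document as a fixed-size
# (words_rows, embeddigs_cols, 1) tensor, while corpora_tensor holds
# variable-length documents. A minimal padding sketch (zero-padding short
# documents, truncating long ones); `X_padded` is a name introduced here,
# not one used elsewhere in this notebook:
X_padded = np.zeros((len(corpora_tensor), words_rows, embeddigs_cols, 1))
for doc_idx, doc_vectors in enumerate(corpora_tensor):
    for word_idx, vec in enumerate(doc_vectors[:words_rows]):
        X_padded[doc_idx, word_idx, :, 0] = vec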
#Selecting filters? 
#https://stackoverflow.com/questions/48243360/how-to-determine-the-filter-parameter-in-the-keras-conv2d-function
#https://stats.stackexchange.com/questions/196646/what-is-the-significance-of-the-number-of-convolution-filters-in-a-convolutional

N_filters = 32 # <-------- [HyperParameter] Powers of 2, number of feature maps
K = 2 # <-------- [HyperParameter] Number of classes
input_sh
gram_input = Input(shape = input_sh)
conv_filter_1_gram = Conv2D(filters= N_filters, input_shape=input_sh, activation='relu', 
                       kernel_size=(1,embeddigs_cols), padding='valid',data_format="channels_last")(gram_input)
conv_filter_1_gram
conv_filter_3_gram = Conv2D(filters= N_filters, input_shape=input_sh, activation='relu', 
                       kernel_size=(3,embeddigs_cols), padding='valid')(gram_input)
conv_filter_5_gram = Conv2D(filters= N_filters, input_shape=input_sh, activation='relu', 
                       kernel_size=(5,embeddigs_cols), padding='valid')(gram_input)
max_pool_1_gram = MaxPooling2D(pool_size=((words_rows-1+1), 1), strides=None, padding='valid')(conv_filter_1_gram)
max_pool_3_gram = MaxPooling2D(pool_size=((words_rows-3+1), 1), strides=None, padding='valid')(conv_filter_3_gram)
max_pool_5_gram = MaxPooling2D(pool_size=((words_rows-5+1), 1), strides=None, padding='valid')(conv_filter_5_gram)
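# Each pooling window spans the full height of its feature map
# (words_rows - kernel_height + 1 rows), so every n-gram branch is collapsed
# to a single max activation per filter before flattening.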
fully_connected_1_gram = Flatten()(max_pool_1_gram)
fully_connected_3_gram = Flatten()(max_pool_3_gram)
fully_connected_5_gram = Flatten()(max_pool_5_gram)
merged_vector = layers.concatenate([fully_connected_1_gram, fully_connected_3_gram, 
                                    fully_connected_5_gram], axis=-1)

integration_layer = Dropout(0.4)(merged_vector)

predictions = Dense(K, activation='softmax')(integration_layer)
criticality_network = Model(inputs=[gram_input],outputs=[predictions]) 
criticality_network.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
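# Note: with K=2 softmax outputs the labels are expected as one-hot vectors;
# categorical_crossentropy would be the more conventional pairing, while
# binary_crossentropy here follows the original setup.
# A minimal fitting sketch, assuming X_padded from the padding step above and a
# hypothetical one-hot label array y_onehot of shape (len(X_padded), K):
# criticality_network.fit(X_padded, y_onehot, epochs=5, batch_size=64, validation_split=0.1)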
import csv
import random