# Reload the artefacts written to ../08_test and evaluate the saved model on
# the held-out test set.

# Training history as a DataFrame, handed to the plotting helper (defined
# elsewhere; presumably draws loss/accuracy curves -- confirm).
history = pd.read_csv('../08_test/history_training.csv')
show_loss_accurracy_plots(history)

# Location of the saved model checkpoint.
path = '../08_test/best_model.hdf5'
# You must be using tensorflow 2.3 or greater
criticality_network_load = load_model(path)  # the loaded model; reused by the SHAP analysis below

# Test inputs and their targets, stored as .npy arrays.
corpora_test_x = np.load('../08_test/corpora_test_x.npy')
target_test_y = np.load('../08_test/target_test_y.npy')

# evaluate_model is defined elsewhere; it receives the model, inputs and targets.
evaluate_model(criticality_network_load,corpora_test_x,target_test_y)
def clean_list(inp):
    """Return a list containing the first element of every item in ``inp``.

    Each item of ``inp`` must support indexing with ``[0]``.
    """
    return [item[0] for item in inp]
def clean_input(inp):
    """Turn ``inp`` into a hashable tuple of the first element of each row.

    ``inp`` must provide a ``tolist()`` method (e.g. a NumPy array); the
    resulting nested list is reduced with ``clean_list`` and frozen into a
    tuple so it can be used as a dictionary key.
    """
    rows = inp.tolist()
    first_elements = clean_list(rows)
    return tuple(first_elements)
def summarize(inp):
    """Add up the items of ``inp`` and return the sum in a one-element list.

    An empty ``inp`` yields ``[0]``.
    """
    return [sum(inp)]
from securereqnet.utils import Embeddings
import shap
# Explain the loaded model with SHAP on the first 200 test documents and
# build `shaps`: a list of (shap value, token) pairs ordered by decreasing
# absolute SHAP value.
model = criticality_network_load
embeddings = Embeddings()
embed_path = '../data/word_embeddings-embed_size_100-epochs_100.csv'
embeddings_dict = embeddings.get_embeddings_dict(embed_path)

# Invert the embedding table (vector -> token) so an input vector can be
# mapped back to its word. Vectors are cast to float32 before being turned
# into tuple keys; this presumably matches the dtype of corpora_test_x --
# TODO confirm, otherwise every lookup below misses.
reverse_embeddings = {
    tuple(np.array(value, dtype='float32').tolist()): key
    for key, value in embeddings_dict.items()
}

# Random background sample for the explainer. Capped at the number of
# available rows: sampling without replacement raises when asked for more
# rows than exist. The draw is unseeded, so results vary between runs.
background_size = min(400, corpora_test_x.shape[0])
background = corpora_test_x[
    np.random.choice(corpora_test_x.shape[0], background_size, replace=False)
]

# Explain the model's predictions on the first 200 test documents.
e = shap.DeepExplainer(model, background)
# ...or pass tensors directly
# e = shap.DeepExplainer((model.layers[0].input, model.layers[-1].output), background)
shap_values = e.shap_values(corpora_test_x[0:200])

# Collect one (shap, string) pair per word position.
# shap_values[0] is taken to be the attribution array of the model's first
# output, indexed as [doc, word, ...] like corpora_test_x -- TODO confirm
# against the installed SHAP version.
shaps = []
for doc in range(shap_values[0].shape[0]):
    for word in range(shap_values[0][doc].shape[0]):
        # Grab the word. Vectors that are not in the embedding table
        # (presumably padding) are skipped on purpose. The exception is not
        # bound to a name: `except KeyError as e` would rebind and then
        # delete the explainer `e` defined above.
        try:
            string = reverse_embeddings[clean_input(corpora_test_x[doc, word])]
        except KeyError:
            continue
        # Collapse the per-dimension attributions of this word into one score.
        shap_value = summarize(clean_list(shap_values[0][doc, word]))[0]
        shaps.append((shap_value, string))

# Strongest attributions (by magnitude) first.
shaps = sorted(shaps, key=lambda x: abs(x[0]), reverse=True)
import matplotlib.pyplot as plt
import math
import statistics
# Scatter-plot every SHAP value of the 25 highest-ranked distinct tokens.
# `shaps` is expected to be a list of (shap value, token) pairs ordered by
# decreasing absolute SHAP value.
shap_vals = []
token = []
data = {}

# Top 25 distinct tokens, each with ALL of its SHAP values.
# A single pass over the ranking: the first 25 distinct tokens encountered
# are selected, and every value belonging to a selected token is recorded
# exactly once (a second full pass would duplicate the leading entries and
# skew the statistics computed below).
uBound = 25
for curShap, curTok in shaps:
    if curTok in data:
        data[curTok].append(curShap)
    elif len(data) < uBound:
        data[curTok] = [curShap]

# Flatten to parallel lists: one (value, token) point per SHAP value.
for key, values in data.items():
    for item in values:
        shap_vals.append(item)
        token.append(key)

if not shap_vals:
    raise ValueError("no SHAP values available to plot (shaps is empty)")

# One figure only; `fig1` is kept as an alias so the name stays bound without
# creating a second, empty figure.
fig = plt.figure(figsize=(15, 10))
fig1 = fig

max_shap_val = max(shap_vals)
min_shap_val = min(shap_vals)
total_range = max_shap_val - min_shap_val
# stdev needs at least two data points; treat a single point as zero spread.
std_dev = statistics.stdev(shap_vals) if len(shap_vals) > 1 else 0.0
median = statistics.median(shap_vals)
mean = statistics.mean(shap_vals)


# define our gradient
# we want something less linear
def redness(x):
    """Red channel in [0, 1]: square root of x's position within [min, max]."""
    if total_range == 0:
        # All values identical: use a neutral mid colour.
        return 0.5
    # Subtract the minimum (not add its absolute value) so the result stays
    # within [0, 1] even when every SHAP value is positive.
    return math.sqrt((x - min_shap_val) / total_range)


def blueness(x):
    """Blue channel, the complement of ``redness``."""
    return 1 - redness(x)


def size(x):
    """Marker area: largest (535) at the mean, shrinking towards 40 away from it.

    NOTE(review): labelled "size as normal distribution" originally, but the
    expression raises (e / (std_dev * sqrt(pi))) to the Gaussian exponent
    rather than evaluating a normal density. Kept as-is to preserve the
    plot's appearance -- confirm the intended formula.
    """
    if std_dev == 0:
        # No spread: every point sits on the mean, so use the peak size.
        return 535.0
    base = 1 / (std_dev * math.sqrt(math.pi)) * math.e
    exponent = -1 * ((x - mean) ** 2) / (2 * std_dev ** 2)
    return 500 * math.ceil(100 * base ** exponent) / 100 + 35


plt.xlabel("Shap Value")
plt.ylabel("token")
plt.title("Shap Visualization for 200 Issues")
# Symmetric x-range around zero, wide enough for the largest magnitude on
# either side (negative values can exceed the maximum in magnitude).
x_bound = max(abs(max_shap_val), abs(min_shap_val)) + std_dev
plt.xlim([-x_bound, x_bound])
plt.gca().invert_yaxis()

# creating the scatter plot: one dot per SHAP value, one row per token
plt.scatter(
    shap_vals,
    token,
    c=[(redness(x), 0, blueness(x)) for x in shap_vals],
    marker='.',
    s=[size(x) for x in shap_vals],
)
plt.savefig("../images/shap_200_issues_alpha.png", transparent=False)
plt.show()