keras-anomaly-detection
Anomaly detection implemented in Keras
The source codes of the recurrent, convolutional and feedforward networks auto-encoders for anomaly detection can be found in keras_anomaly_detection/library/convolutional.py and keras_anomaly_detection/library/recurrent.py and keras_anomaly_detection/library/feedforward.py
The the anomaly detection is implemented using auto-encoder with convolutional, feedforward, and recurrent networks and can be applied to:
- timeseries data to detect timeseries time windows that have anomaly pattern
- LstmAutoEncoder in keras_anomaly_detection/library/recurrent.py
- Conv1DAutoEncoder in keras_anomaly_detection/library/convolutional.py
- CnnLstmAutoEncoder in keras_anomaly_detection/library/recurrent.py
- BidirectionalLstmAutoEncoder in keras_anomaly_detection/library/recurrent.py
- structured data (i.e., tabular data) to detect anomaly in data records
- Conv1DAutoEncoder in keras_anomaly_detection/library/convolutional.py
- FeedforwardAutoEncoder in keras_anomaly_detection/library/feedforward.py
Usage
Detect Anomaly within the ECG Data
The sample codes can be found in the demo/ecg_demo.
The following sample codes show how to fit and detect anomaly using Conv1DAutoEncoder:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras_anomaly_detection.library.plot_utils import visualize_reconstruction_error
from keras_anomaly_detection.library.convolutional import Conv1DAutoEncoder
def main():
data_dir_path = './data'
model_dir_path = './models'
# ecg data in which each row is a temporal sequence data of continuous values
ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
print(ecg_data.head())
ecg_np_data = ecg_data.as_matrix()
scaler = MinMaxScaler()
ecg_np_data = scaler.fit_transform(ecg_np_data)
print(ecg_np_data.shape)
ae = Conv1DAutoEncoder()
# fit the data and save model into model_dir_path
ae.fit(ecg_np_data[:23, :], model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)
# load back the model saved in model_dir_path detect anomaly
ae.load_model(model_dir_path)
anomaly_information = ae.anomaly(ecg_np_data[:23, :])
reconstruction_error = []
for idx, (is_anomaly, dist) in enumerate(anomaly_information):
print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
reconstruction_error.append(dist)
visualize_reconstruction_error(reconstruction_error, ae.threshold)
if __name__ == '__main__':
main()
The following sample codes show how to fit and detect anomaly using LstmAutoEncoder:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras_anomaly_detection.library.plot_utils import visualize_reconstruction_error
from keras_anomaly_detection.library.recurrent import LstmAutoEncoder
def main():
data_dir_path = './data'
model_dir_path = './models'
ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
print(ecg_data.head())
ecg_np_data = ecg_data.as_matrix()
scaler = MinMaxScaler()
ecg_np_data = scaler.fit_transform(ecg_np_data)
print(ecg_np_data.shape)
ae = LstmAutoEncoder()
# fit the data and save model into model_dir_path
ae.fit(ecg_np_data[:23, :], model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)
# load back the model saved in model_dir_path detect anomaly
ae.load_model(model_dir_path)
anomaly_information = ae.anomaly(ecg_np_data[:23, :])
reconstruction_error = []
for idx, (is_anomaly, dist) in enumerate(anomaly_information):
print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
reconstruction_error.append(dist)
visualize_reconstruction_error(reconstruction_error, ae.threshold)
if __name__ == '__main__':
main()
The following sample codes show how to fit and detect anomaly using CnnLstmAutoEncoder:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras_anomaly_detection.library.plot_utils import visualize_reconstruction_error
from keras_anomaly_detection.library.recurrent import CnnLstmAutoEncoder
def main():
data_dir_path = './data'
model_dir_path = './models'
ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
print(ecg_data.head())
ecg_np_data = ecg_data.as_matrix()
scaler = MinMaxScaler()
ecg_np_data = scaler.fit_transform(ecg_np_data)
print(ecg_np_data.shape)
ae = CnnLstmAutoEncoder()
# fit the data and save model into model_dir_path
ae.fit(ecg_np_data[:23, :], model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)
# load back the model saved in model_dir_path detect anomaly
ae.load_model(model_dir_path)
anomaly_information = ae.anomaly(ecg_np_data[:23, :])
reconstruction_error = []
for idx, (is_anomaly, dist) in enumerate(anomaly_information):
print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
reconstruction_error.append(dist)
visualize_reconstruction_error(reconstruction_error, ae.threshold)
if __name__ == '__main__':
main()
The following sample codes show how to fit and detect anomaly using BidirectionalLstmAutoEncoder:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras_anomaly_detection.library.plot_utils import visualize_reconstruction_error
from keras_anomaly_detection.library.recurrent import BidirectionalLstmAutoEncoder
def main():
data_dir_path = './data'
model_dir_path = './models'
ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
print(ecg_data.head())
ecg_np_data = ecg_data.as_matrix()
scaler = MinMaxScaler()
ecg_np_data = scaler.fit_transform(ecg_np_data)
print(ecg_np_data.shape)
ae = BidirectionalLstmAutoEncoder()
# fit the data and save model into model_dir_path
ae.fit(ecg_np_data[:23, :], model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)
# load back the model saved in model_dir_path detect anomaly
ae.load_model(model_dir_path)
anomaly_information = ae.anomaly(ecg_np_data[:23, :])
reconstruction_error = []
for idx, (is_anomaly, dist) in enumerate(anomaly_information):
print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
reconstruction_error.append(dist)
visualize_reconstruction_error(reconstruction_error, ae.threshold)
if __name__ == '__main__':
main()
The following sample codes show how to fit and detect anomaly using FeedForwardAutoEncoder:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras_anomaly_detection.library.plot_utils import visualize_reconstruction_error
from keras_anomaly_detection.library.feedforward import FeedForwardAutoEncoder
def main():
data_dir_path = './data'
model_dir_path = './models'
# ecg data in which each row is a temporal sequence data of continuous values
ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
print(ecg_data.head())
ecg_np_data = ecg_data.as_matrix()
scaler = MinMaxScaler()
ecg_np_data = scaler.fit_transform(ecg_np_data)
print(ecg_np_data.shape)
ae = FeedForwardAutoEncoder()
# fit the data and save model into model_dir_path
ae.fit(ecg_np_data[:23, :], model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)
# load back the model saved in model_dir_path detect anomaly
ae.load_model(model_dir_path)
anomaly_information = ae.anomaly(ecg_np_data[:23, :])
reconstruction_error = []
for idx, (is_anomaly, dist) in enumerate(anomaly_information):
print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
reconstruction_error.append(dist)
visualize_reconstruction_error(reconstruction_error, ae.threshold)
if __name__ == '__main__':
main()
Detect Fraud in Credit Card Transaction
The sample codes can be found in the demo/credit_card_demo.
The credit card sample data is from this repo
Below is the sample code using FeedforwardAutoEncoder:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras_anomaly_detection.library.feedforward import FeedForwardAutoEncoder
from keras_anomaly_detection.demo.credit_card_demo.unzip_utils import unzip
from keras_anomaly_detection.library.plot_utils import plot_confusion_matrix, plot_training_history, visualize_anomaly
from keras_anomaly_detection.library.evaluation_utils import report_evaluation_metrics
import numpy as np
DO_TRAINING = False
def preprocess_data(csv_data):
credit_card_data = csv_data.drop(labels=['Class', 'Time'], axis=1)
credit_card_data['Amount'] = StandardScaler().fit_transform(credit_card_data['Amount'].values.reshape(-1, 1))
# print(credit_card_data.head())
credit_card_np_data = credit_card_data.as_matrix()
y_true = csv_data['Class'].as_matrix()
return credit_card_np_data, y_true
def main():
seed = 42
np.random.seed(seed)
data_dir_path = './data'
model_dir_path = './models'
unzip(data_dir_path + '/creditcardfraud.zip', data_dir_path)
csv_data = pd.read_csv(data_dir_path + '/creditcard.csv')
estimated_negative_sample_ratio = 1 - csv_data['Class'].sum() / csv_data['Class'].count()
print(estimated_negative_sample_ratio)
X, Y = preprocess_data(csv_data)
print(X.shape)
ae = FeedForwardAutoEncoder()
training_history_file_path = model_dir_path + '/' + FeedForwardAutoEncoder.model_name + '-history.npy'
# fit the data and save model into model_dir_path
epochs = 100
history = None
if DO_TRAINING:
history = ae.fit(X, model_dir_path=model_dir_path,
estimated_negative_sample_ratio=estimated_negative_sample_ratio,
nb_epoch=epochs,
random_state=seed)
np.save(training_history_file_path, history)
else:
history = np.load(training_history_file_path).item()
# load back the model saved in model_dir_path
ae.load_model(model_dir_path)
# detect anomaly for the test data
Ypred = []
_, Xtest, _, Ytest = train_test_split(X, Y, test_size=0.2, random_state=seed)
reconstruction_error = []
adjusted_threshold = 14
anomaly_information = ae.anomaly(Xtest, adjusted_threshold)
for idx, (is_anomaly, dist) in enumerate(anomaly_information):
predicted_label = 1 if is_anomaly else 0
Ypred.append(predicted_label)
reconstruction_error.append(dist)
report_evaluation_metrics(Ytest, Ypred)
plot_training_history(history)
visualize_anomaly(Ytest, reconstruction_error, adjusted_threshold)
plot_confusion_matrix(Ytest, Ypred)
if __name__ == '__main__':
main()
The sample code below uses Conv1DAutoEncoder:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras_anomaly_detection.library.convolutional import Conv1DAutoEncoder
from keras_anomaly_detection.demo.credit_card_demo.unzip_utils import unzip
from keras_anomaly_detection.library.plot_utils import plot_confusion_matrix, plot_training_history, visualize_anomaly
from keras_anomaly_detection.library.evaluation_utils import report_evaluation_metrics
import numpy as np
import os
DO_TRAINING = False
def preprocess_data(csv_data):
credit_card_data = csv_data.drop(labels=['Class', 'Time'], axis=1)
credit_card_data['Amount'] = StandardScaler().fit_transform(credit_card_data['Amount'].values.reshape(-1, 1))
# print(credit_card_data.head())
credit_card_np_data = credit_card_data.as_matrix()
y_true = csv_data['Class'].as_matrix()
return credit_card_np_data, y_true
def main():
seed = 42
np.random.seed(seed)
data_dir_path = './data'
model_dir_path = './models'
unzip(data_dir_path + '/creditcardfraud.zip', data_dir_path)
csv_data = pd.read_csv(data_dir_path + '/creditcard.csv')
estimated_negative_sample_ratio = 1 - csv_data['Class'].sum() / csv_data['Class'].count()
print(estimated_negative_sample_ratio)
X, Y = preprocess_data(csv_data)
print(X.shape)
ae = Conv1DAutoEncoder()
training_history_file_path = model_dir_path + '/' + Conv1DAutoEncoder.model_name + '-history.npy'
# fit the data and save model into model_dir_path
epochs = 10
history = None
if DO_TRAINING:
history = ae.fit(X, model_dir_path=model_dir_path,
estimated_negative_sample_ratio=estimated_negative_sample_ratio,
epochs=epochs)
np.save(training_history_file_path, history)
elif os.path.exists(training_history_file_path):
history = np.load(training_history_file_path).item()
# load back the model saved in model_dir_path
ae.load_model(model_dir_path)
# detect anomaly for the test data
Ypred = []
_, Xtest, _, Ytest = train_test_split(X, Y, test_size=0.2, random_state=seed)
reconstruction_error = []
adjusted_threshold = 10
anomaly_information = ae.anomaly(Xtest, adjusted_threshold)
for idx, (is_anomaly, dist) in enumerate(anomaly_information):
predicted_label = 1 if is_anomaly else 0
Ypred.append(predicted_label)
reconstruction_error.append(dist)
report_evaluation_metrics(Ytest, Ypred)
plot_training_history(history)
visualize_anomaly(Ytest, reconstruction_error, adjusted_threshold)
plot_confusion_matrix(Ytest, Ypred)
if __name__ == '__main__':
main()
Note
There is also an autoencoder from H2O for timeseries anomaly detection in demo/h2o_ecg_pulse_detection.py
Configure to run on GPU on Windows
- Step 1: Change tensorflow to tensorflow-gpu in requirements.txt and install tensorflow-gpu
- Step 2: Download and install the CUDA® Toolkit 9.0 (Please note that currently CUDA® Toolkit 9.1 is not yet supported by tensorflow, therefore you should download CUDA® Toolkit 9.0)
- Step 3: Download and unzip the cuDNN 7.0.4 for CUDA@ Toolkit 9.0 and add the bin folder of the unzipped directory to the $PATH of your Windows environment