# Fraud Detection in Python

import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, cohen_kappa_score
from sklearn.metrics import f1_score, recall_score
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot
import seaborn as sns
from scipy.stats import norm

# Load the dataset and show how many rows fall into each class.
df = pd.read_csv('data.csv')
class_names = {0: 'Not Fraud', 1: 'Fraud'}
# Relabel the 0/1 values of the Class column so the counts are readable.
print(df.Class.value_counts().rename(index=class_names))
# Notebook-style preview; when run as a plain script the result is discarded.
df.head()

# Draw the pairwise column correlations of df as a heatmap.
corrmat = df.corr()
f, ax = pyplot.subplots(figsize=(9, 8))
sns.heatmap(corrmat, ax=ax, cmap="YlGnBu", linewidths=0.1)

# Split the raw values into features and target by column position.
# NOTE(review): assumes column 0 is not a feature and column 30 holds the
# class label -- confirm against the layout of data.csv.
array = df.values
X = array[:, 1:30]
y = array[:, 30]

# fit the model on the full dataset (no train/test split at this point)
model = XGBClassifier()
model.fit(X, y)

# plot feature importance
plot_importance(model)
pyplot.show()

def PrintStats(cmat, y_test, pred):
    """Print the confusion matrix and summary metrics for a binary classifier.

    Args:
        cmat: 2x2 confusion matrix in scikit-learn's layout, i.e.
            ``[[TN, FP], [FN, TP]]`` (rows are true labels, columns are
            predicted labels).
        y_test: True labels of the evaluation set.
        pred: Predicted labels for the same samples.

    Prints, in order: the matrix, accuracy (percent, 2 decimals), Cohen's
    kappa (3 decimals), recall and F1 (2 decimals each). Returns None.
    """
    # separate out the confusion matrix components
    # (row index = true class, column index = predicted class)
    tneg = cmat[0][0]
    fpos = cmat[0][1]
    fneg = cmat[1][0]
    tpos = cmat[1][1]

    # calculate F1, Recall scores
    f1Score = round(f1_score(y_test, pred), 2)
    recallScore = round(recall_score(y_test, pred), 2)

    # calculate and display metrics
    # accuracy = correctly classified (diagonal) / all samples
    accuracy = 100 * float(tpos + tneg) / float(tpos + tneg + fpos + fneg)
    print(cmat)
    print('Accuracy: ' + str(np.round(accuracy, 2)) + '%')
    print('Cohen Kappa: ' + str(np.round(cohen_kappa_score(y_test, pred), 3)))
    print("Sensitivity/Recall for Model : {recall_score}".format(recall_score=recallScore))
    print("F1 Score for Model : {f1_score}".format(f1_score=f1Score))

def RunModel(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and evaluate it on the test split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training labels; a single-column DataFrame, a Series, or any
            array-like. It is flattened to 1-D before fitting.
        X_test: Test features.
        y_test: True labels for the test features.

    Returns:
        A ``(matrix, pred)`` tuple: the confusion matrix of ``y_test`` versus
        the predictions, and the predictions themselves.
    """
    # Flatten (n, 1) label frames to 1-D. Going through np.asarray also lets
    # plain arrays and lists through, which have no ``.values`` attribute.
    model.fit(X_train, np.asarray(y_train).ravel())
    pred = model.predict(X_test)
    matrix = confusion_matrix(y_test, pred)
    return matrix, pred

# Select columns by position: columns 1..29 are used as features and
# everything from column 30 onward as the target.
feature_names = df.columns[1:30]
target = df.columns[30:]
data_features = df.loc[:, feature_names]
data_target = df.loc[:, target]

from sklearn.model_selection import train_test_split

# 70/30 train/test split; random_state pins the shuffle.
np.random.seed(123)
X_train, X_test, y_train, y_test = train_test_split(
    data_features,
    data_target,
    train_size=0.70,
    test_size=0.30,
    random_state=1,
)

from warnings import simplefilter

# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

from sklearn.linear_model import LogisticRegression

# Train a logistic regression model on the training split and print its
# metrics on the test split.
lr = LogisticRegression()
cmat, pred = RunModel(lr, X_train, y_train, X_test, y_test)
PrintStats(cmat, y_test, pred)