# 13 - Feature Engineering for Financial Fraud Detection

##### 5. Imports
``````python
# numpy, pandas, matplotlib
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# algorithms, data preprocessing and model evaluation
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# handling class imbalance
# silence warnings
import warnings
warnings.filterwarnings('ignore')
# over-sampling toolkit for the class-imbalance problem
from imblearn.over_sampling import SMOTE
import itertools
``````
##### 6. Load the data
``````python
# display floats with 3 decimal places when pandas shows data
pd.set_option('display.float_format', lambda x: '%.3f' % x)
``````
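The loading step itself is not shown above; a minimal sketch, assuming the Kaggle credit-card fraud data set (columns `Time`, `V1`–`V28`, `Amount`, `Class`) sits next to the notebook as `creditcard.csv` (the file name and path are assumptions):

``````python
# assumed loading step: adjust the path/file name to wherever the data actually lives
data_cr = pd.read_csv("./creditcard.csv")
data_cr.head()
``````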
##### 7. Check for missing values
``````python
data_cr.info()
``````
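`info()` reports the non-null count of each column; a slightly more direct check, as an optional one-liner:

``````python
# number of missing values per column (the Kaggle data set normally has none)
data_cr.isnull().sum()
``````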
##### 8. Feature engineering
``````python
# class-imbalance check: visualize the target variable
fig, ax = plt.subplots(1, 2, figsize=(12, 8))
from pylab import mpl  # font configuration for CJK labels
plt.style.use('ggplot')
mpl.rcParams['font.sans-serif'] = ['SimHei']   # default font that can render Chinese
mpl.rcParams['axes.unicode_minus'] = False     # keep the minus sign from rendering as a box when saving figures
data_cr["Class"].value_counts().plot(kind="bar", ax=ax[0], fontsize=23)
ax[0].set_title("Class counts (bar chart)")
data_cr["Class"].value_counts().plot(kind="pie", ax=ax[1], fontsize=23, autopct='%1.2f%%')  # show two decimals before the percent sign
ax[1].set_title("Class frequencies (pie chart)")
``````

``````python
# feature transformation: convert Time from seconds to hours
data_cr["Time"] = data_cr["Time"].apply(lambda x: divmod(x, 3600)[0])
``````
``````python
# feature selection: plot each feature's distribution for the two classes
v_feature = data_cr.iloc[:, 1:29].columns  # names of V1-V28
plt.figure(figsize=(16, 28 * 5))
gs = gridspec.GridSpec(28, 1)
# rendering all 28 panels takes roughly 10 minutes
for i, cn in enumerate(v_feature):
    ax = plt.subplot(gs[i])
    ax.hist(data_cr[cn][data_cr["Class"] == 1], bins=50, density=True)   # density=True replaces the deprecated normed=True
    ax.hist(data_cr[cn][data_cr["Class"] == 0], bins=100, density=True)
    data_cr[cn][data_cr["Class"] == 1].plot(kind='kde', ax=ax)
    data_cr[cn][data_cr["Class"] == 0].plot(kind='kde', ax=ax)
    ax.set_title("Histogram of " + str(cn))
``````
``````python
# Keep only the variables whose distributions differ clearly between the two classes,
# so drop V8, V13, V15, V20, V21, V22, V23, V24, V25, V26, V27 and V28.
droplist = ["V8", "V13", "V15", "V20", "V21", "V22", "V23", "V24", "V25", "V26", "V27", "V28"]  # remove the 12 weakly discriminating columns, leaving 19
data_new = data_cr.drop(droplist, axis=1)
``````
``````python
# feature scaling
# Amount and Time have much larger ranges than the other variables, so standardize them
col = ["Amount", "Time"]
from sklearn.preprocessing import StandardScaler
data_new[col] = StandardScaler().fit_transform(data_new[col])
``````
``````python
# rank feature importances to reduce the number of variables further
# build X and y
x_val = data_new.iloc[:, :-1]
y_val = data_new.iloc[:, -1]
# rank feature importances with a gradient-boosted decision tree (GBDT)
from sklearn.ensemble import GradientBoostingClassifier as GBDT
clf = GBDT()
clf.fit(x_val, y_val)
# visualize the ranking
# plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (12, 6)
importance = clf.feature_importances_
feature_name = data_new.columns[:-1]
indices = np.argsort(importance)[::-1]
fig = plt.figure(figsize=(20, 6))
plt.title("Feature importances by GradientBoostingClassifier")
plt.bar(range(len(importance)), importance[indices], color="blue", align="center")
plt.xticks(range(len(importance)), feature_name[indices], rotation='vertical', fontsize=14)
plt.xlim([-1, len(indices)])
``````

``````python
# drop the less important features
droplist1 = ['V16', 'Time', 'V7', 'V5', 'V4', 'V19', 'V11', 'V1', 'Amount']
data_new1 = data_new.drop(droplist1, axis=1)
``````
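`droplist1` above is hand-picked from the importance chart. A small optional sketch for deriving the keep-list from the fitted model instead; the cutoff `k = 10` is an arbitrary assumption:

``````python
# derive a keep-list from the fitted GBDT importances instead of hand-picking columns (k = 10 is an assumption)
k = 10
top_features = list(feature_name[indices[:k]])
print("top {} features: {}".format(k, top_features))
# the reduced frame could then be built as data_new[top_features + ["Class"]]
``````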
##### 9. Model training

SMOTE (Synthetic Minority Oversampling Technique) works as follows: using a nearest-neighbour search, it finds the K nearest neighbours of each minority-class sample, randomly picks N of them, and creates new minority samples by random linear interpolation between the sample and each chosen neighbour; the synthetic samples are then merged with the original data to form a new training set.
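The interpolation step can be written out directly. A minimal sketch of the idea (this is an illustration, not imblearn's implementation; `x`, `neighbour` and the random seed are made up):

``````python
import numpy as np

rng = np.random.default_rng(0)
x = np.array([1.0, 2.0])           # a minority-class sample
neighbour = np.array([3.0, 1.0])   # one of its K nearest minority-class neighbours
lam = rng.uniform(0, 1)            # random interpolation factor in [0, 1)
x_new = x + lam * (neighbour - x)  # synthetic sample on the segment between the two points
print(x_new)
``````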

``````python
# SMOTE over-sampling
# rebuild X and y from the reduced feature set
x_all = data_new1.iloc[:, :-1]
y_all = data_new1.iloc[:, -1]

X_train, X_test, Y_train, Y_test = train_test_split(x_all, y_all, test_size=0.3)
n_samples = len(Y_train)
pos_samples = Y_train[Y_train == 1].shape[0]
print("Fraud ratio before over-sampling: {:.2%}".format(pos_samples / n_samples))
# over-sample only the training set, so the test set keeps the true class distribution
X_train_new, Y_train_new = SMOTE(random_state=12).fit_resample(X_train, Y_train)  # fit_resample replaces the old fit_sample API
n_samples_new = len(Y_train_new)
pos_samples_new = Y_train_new[Y_train_new == 1].shape[0]
print("Fraud ratio after over-sampling: {:.2%}".format(pos_samples_new / n_samples_new))
``````
``````python
# percentage pie charts before and after SMOTE
fig, ax = plt.subplots(1, 2, figsize=(12, 8))
plt.style.use('seaborn-darkgrid')
from pylab import mpl  # font configuration for CJK labels
mpl.rcParams['font.sans-serif'] = ['SimHei']   # default font that can render Chinese
mpl.rcParams['axes.unicode_minus'] = False     # keep the minus sign from rendering as a box when saving figures
data_cr["Class"].value_counts().plot(kind="pie", ax=ax[0], fontsize=23, autopct='%1.2f%%')
ax[0].set_title("Class frequencies before SMOTE")
pd.Series(Y_train_new).value_counts().plot(kind="pie", ax=ax[1], fontsize=23, autopct='%1.2f%%')  # two decimals before the percent sign
ax[1].set_title("Class frequencies after SMOTE")
ax[1].set_ylabel("Class")
plt.savefig("./smote.jpg")
``````

``````python
# helper for plotting a confusion matrix
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    threshold = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > threshold else "black")  # white text on dark (above-threshold) cells so the counts stay readable

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
``````
``````python
# recall of a plain logistic regression
from sklearn.linear_model import LogisticRegression
lg_clf = LogisticRegression()
lg_clf.fit(X_train_new, Y_train_new)
lg_pred = lg_clf.predict(X_test)
cnf_matrix_lg = confusion_matrix(Y_test, lg_pred)  # confusion matrix
np.set_printoptions(precision=2)  # print floats with two decimals
# Plot non-normalized confusion matrix
class_names = [0, 1]
plt.figure(figsize=(6, 4))
plt.subplot(111)
plot_confusion_matrix(cnf_matrix_lg,
                      classes=class_names,
                      title='logit_Confusion matrix, recall is {:.4f}'.format(cnf_matrix_lg[1, 1] / (cnf_matrix_lg[1, 0] + cnf_matrix_lg[1, 1])))
print("Recall of logistic regression on the test set: ", cnf_matrix_lg[1, 1] / (cnf_matrix_lg[1, 0] + cnf_matrix_lg[1, 1]))
plt.savefig(r"./逻辑回归.jpg", dpi=600)
plt.show()
``````
``````python
# cross-validation and hyper-parameter tuning with GridSearchCV
# logistic regression as the classifier
from sklearn.linear_model import LogisticRegression
para_logit = {'C': [100, 1, 10]}  # candidate values of the regularization parameter
# 10-fold cross-validation; dual=True requires the liblinear solver
clf = GridSearchCV(LogisticRegression(solver='liblinear', dual=True),
                   param_grid=para_logit, cv=10, n_jobs=-1)
clf.fit(X_train_new, Y_train_new)  # fit on the over-sampled training set
print("Best parameter combination: {}".format(clf.best_params_))
print("Best cross-validation score: {:.5f}".format(clf.best_score_))
``````
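By default GridSearchCV selects parameters by accuracy; since this task cares about recall on the fraud class, a `scoring` argument could be passed instead. An optional variant (only the `scoring='recall'` part differs):

``````python
# same grid search, but choosing the parameters that maximize recall rather than accuracy
clf_recall = GridSearchCV(LogisticRegression(solver='liblinear', dual=True),
                          param_grid=para_logit, cv=10, n_jobs=-1, scoring='recall')
clf_recall.fit(X_train_new, Y_train_new)
print(clf_recall.best_params_, clf_recall.best_score_)
``````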
``````python
# predict on the test set
y_pred = clf.predict(X_test)
print("Accuracy on the test set: {:.5f}".format(accuracy_score(Y_test, y_pred)))
``````
``````python
# visualize the results
# Compute confusion matrices
cnf_matrix_lg = confusion_matrix(Y_test, lg_pred)  # plain logistic regression
cnf_matrix_gd = confusion_matrix(Y_test, y_pred)   # grid-searched logistic regression
# Plot non-normalized confusion matrices
class_names = [0, 1]
plt.figure(figsize=(16, 8))
plt.subplot(121)
plot_confusion_matrix(cnf_matrix_lg,
                      classes=class_names,
                      title='lg_Confusion matrix, recall is {:.4f}'.format(cnf_matrix_lg[1, 1] / (cnf_matrix_lg[1, 0] + cnf_matrix_lg[1, 1])))
plt.subplot(122)
plot_confusion_matrix(cnf_matrix_gd,
                      classes=class_names,
                      title='GridSearchCV_Confusion matrix, recall is {:.4f}'.format(cnf_matrix_gd[1, 1] / (cnf_matrix_gd[1, 0] + cnf_matrix_gd[1, 1])))
print("Recall of logistic regression on the test set: ", cnf_matrix_lg[1, 1] / (cnf_matrix_lg[1, 0] + cnf_matrix_lg[1, 1]))
print("Recall of the grid-searched model on the test set: ", cnf_matrix_gd[1, 1] / (cnf_matrix_gd[1, 0] + cnf_matrix_gd[1, 1]))
plt.show()
``````

##### 10. Model evaluation
``````python
# vary the decision threshold on the predicted fraud probability to trade off recall against precision
from itertools import cycle
y_pred_proba = clf.predict_proba(X_test)
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]  # candidate thresholds; the default is 0.5
plt.figure(figsize=(12, 8))
j = 1
re = []
pr = []
a = []
for i in thresholds:
    # predict fraud when the predicted fraud probability exceeds the threshold, normal otherwise
    y_test_predictions_high_recall = y_pred_proba[:, 1] > i
    plt.subplot(3, 3, j)
    j += 1
    # Compute confusion matrix
    cnf_matrix1 = confusion_matrix(Y_test, y_test_predictions_high_recall)
    fpr, tpr, _ = roc_curve(Y_test, y_test_predictions_high_recall)
    area = auc(fpr, tpr)
    recall_rate = cnf_matrix1[1, 1] / (cnf_matrix1[1, 0] + cnf_matrix1[1, 1])     # recall = TP / (TP + FN)
    precision_rate = cnf_matrix1[1, 1] / (cnf_matrix1[0, 1] + cnf_matrix1[1, 1])  # precision = TP / (TP + FP)
    print("When threshold is {0},  Recall rate is {1:0.5f},  AUC is {2:.5f}".format(i, recall_rate, area))
    # Plot non-normalized confusion matrix
    class_names = [0, 1]
    plot_confusion_matrix(cnf_matrix1, classes=class_names, title="threshold>{}".format(i))
    re.append(recall_rate)
    pr.append(precision_rate)
    a.append(area)
plt.savefig('./模型评估.jpg')
``````

``````python
# trend of recall, precision and AUC as the threshold changes
plt.figure(figsize=(8, 6))
plt.plot(thresholds, re, label="recall_rate")
plt.plot(thresholds, pr, label="precision_rate")
plt.plot(thresholds, a, label="Value of AUC")
plt.legend(fontsize=23)
plt.xlabel("threshold", fontsize=19)
plt.ylabel("value", fontsize=19)
plt.title("Recall, Precision rate and thresholds")
plt.show()
``````

Precision and recall pull in opposite directions. The confusion matrices and the precision/recall trend plot above show that the smaller the threshold, the higher the recall: the model catches more fraudulent transactions, but at the cost of more false alarms. As the threshold rises, recall falls while precision rises and the number of misclassified normal transactions drops. Adjusting the threshold therefore controls how aggressively the model flags fraud: choose a small threshold to catch more fraudulent transactions, and a larger one otherwise.
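If a concrete recall target is given (say, at least 95% of fraudulent transactions must be caught), the threshold can also be read off the precision-recall curve rather than chosen by eye. A small sketch; the 0.95 target is an assumed business requirement:

``````python
from sklearn.metrics import precision_recall_curve

precisions, recalls, ths = precision_recall_curve(Y_test, y_pred_proba[:, 1])
target_recall = 0.95                       # assumed requirement
ok = recalls[:-1] >= target_recall         # recalls has one more entry than ths
best_threshold = ths[ok].max() if ok.any() else ths.min()  # largest threshold still meeting the target
print("threshold meeting recall >= {:.2f}: {:.3f}".format(target_recall, best_threshold))
``````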