The previous post covered the meaning of the fields in the application_{train|test}.csv tables. This post performs basic data analysis on them, including anomalous-data handling and feature transformations, and finally shows how to train models on this file alone with two approaches: logistic regression and random forest.
Load and preview the data
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import numpy as np
# Note: Imputer exists in scikit-learn < 0.22; later versions renamed it to sklearn.impute.SimpleImputer.
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, Imputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Font settings so that non-ASCII labels and minus signs render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

dir_path = 'XXX/dataset'
print(os.listdir(dir_path))
The output:

['application_test.csv', 'application_train.csv', 'bureau.csv', 'bureau_balance.csv', 'credit_card_balance.csv', 'HomeCredit_columns_description.csv', 'installments_payments.csv', 'POS_CASH_balance.csv', 'previous_application.csv', 'sample_submission.csv']
application_train_file = dir_path + '/application_train.csv'
application_test_file = dir_path + '/application_test.csv'
app_train = pd.read_csv(application_train_file)
app_test = pd.read_csv(application_test_file)
print('Training data shape: ', app_train.shape)
app_train.head()
The output:

Training data shape:  (307511, 122)

   SK_ID_CURR  TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  ...
0      100002       1         Cash loans           M            N               Y             0          202500.0    406597.5      24700.5  ...
1      100003       0         Cash loans           F            N               N             0          270000.0   1293502.5      35698.5  ...
2      100004       0    Revolving loans           M            Y               Y             0           67500.0    135000.0       6750.0  ...
3      100006       0         Cash loans           F            N               Y             0          135000.0    312682.5      29686.5  ...
4      100007       0         Cash loans           M            N               Y             0          121500.0    513000.0      21865.5  ...

   FLAG_DOCUMENT_18  FLAG_DOCUMENT_19  FLAG_DOCUMENT_20  FLAG_DOCUMENT_21  AMT_REQ_CREDIT_BUREAU_HOUR  AMT_REQ_CREDIT_BUREAU_DAY  AMT_REQ_CREDIT_BUREAU_WEEK  AMT_REQ_CREDIT_BUREAU_MON  AMT_REQ_CREDIT_BUREAU_QRT  AMT_REQ_CREDIT_BUREAU_YEAR
0                 0                 0                 0                 0                         0.0                         0.0                         0.0                         0.0                         0.0                         1.0
1                 0                 0                 0                 0                         0.0                         0.0                         0.0                         0.0                         0.0                         0.0
2                 0                 0                 0                 0                         0.0                         0.0                         0.0                         0.0                         0.0                         0.0
3                 0                 0                 0                 0                         NaN                         NaN                         NaN                         NaN                         NaN                         NaN
4                 0                 0                 0                 0                         0.0                         0.0                         0.0                         0.0                         0.0                         0.0

5 rows × 122 columns
1. Plot an overall risk histogram of the number of applications per TARGET class
app_train.TARGET.value_counts().plot(kind='bar')
plt.title('Repayment risk (1 = at risk, 0 = no risk)')
plt.ylabel('Number of applicants')
plt.show()

The plot shows a strong class imbalance: only a small fraction of applications are labeled as risky, which is worth keeping in mind when evaluating models.
2. Examine the overall extent of missing data (missing values can be dropped, or filled with the median, the mean, RF predictions, XGBoost predictions, etc.; a fill sketch follows the table below)
def examine_missing_data(df):
    missing = df.isnull().sum().sort_values(ascending=False)
    missing_percent = (100 * df.isnull().sum() / len(df)).sort_values(ascending=False)
    missing_table = pd.concat([missing, missing_percent], axis=1)
    missing_table = missing_table.rename(columns={0: 'Missing Count', 1: 'Missing Percent (%)'})
    missing_table = missing_table[missing_table.iloc[:, 1] != 0]
    print('The data has %d features in total, of which %d contain missing values' % (df.shape[1], missing_table.shape[0]))
    return missing_table

missing_values = examine_missing_data(app_train)
missing_values.head(10)
The output:

The data has 122 features in total, of which 67 contain missing values

                          Missing Count  Missing Percent (%)
COMMONAREA_MEDI                  214865            69.872297
COMMONAREA_AVG                   214865            69.872297
COMMONAREA_MODE                  214865            69.872297
NONLIVINGAPARTMENTS_MODE         213514            69.432963
NONLIVINGAPARTMENTS_MEDI         213514            69.432963
NONLIVINGAPARTMENTS_AVG          213514            69.432963
FONDKAPREMONT_MODE               210295            68.386172
LIVINGAPARTMENTS_MEDI            210199            68.354953
LIVINGAPARTMENTS_MODE            210199            68.354953
LIVINGAPARTMENTS_AVG             210199            68.354953
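As a minimal sketch of the median-fill strategy mentioned in the heading above (assuming scikit-learn >= 0.22, where Imputer was replaced by sklearn.impute.SimpleImputer; the toy frame is hypothetical):

from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

# Hypothetical toy frame with missing entries.
toy = pd.DataFrame({'AMT_ANNUITY': [24700.5, np.nan, 6750.0],
                    'CNT_CHILDREN': [0.0, 1.0, np.nan]})

# Learn each column's median on the data, then fill the gaps with it.
imputer = SimpleImputer(strategy='median')
filled = pd.DataFrame(imputer.fit_transform(toy), columns=toy.columns)
print(filled)  # NaN in AMT_ANNUITY becomes 15725.25, NaN in CNT_CHILDREN becomes 0.5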
3. Inspect feature types (non-numeric features must be converted to numbers with a LabelEncoder or one-hot encoding)

Discrete non-numeric features, such as occupation (worker, teacher, merchant, doctor), must be encoded as numbers a model can consume. There are two common approaches. Label encoding simply maps the categories to consecutive integers, e.g. worker → 0, teacher → 1, merchant → 2, doctor → 3. One-hot encoding uses as many bits as there are categories, with exactly one bit set to 1 and the rest set to 0, e.g. worker → 1000, teacher → 0100, merchant → 0010, doctor → 0001. The length of a one-hot code therefore equals the number of distinct states the feature can take, so features with many states can cause a dimensionality explosion; this can be mitigated by combining with PCA. For a feature with only two states, the 0/1 values of a LabelEncoder are sufficient, and one-hot encoding would add nothing. A toy illustration is shown below.
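A minimal sketch of the two encodings on a hypothetical occupation column (note that LabelEncoder assigns integers in alphabetical order, so the mapping differs from the illustrative one above):

from sklearn.preprocessing import LabelEncoder
import pandas as pd

jobs = pd.Series(['worker', 'teacher', 'merchant', 'doctor'])

# Label encoding: one integer per category (alphabetical: doctor=0, merchant=1, teacher=2, worker=3).
print(LabelEncoder().fit_transform(jobs))  # [3 2 1 0]

# One-hot encoding: one 0/1 column per category.
print(pd.get_dummies(jobs))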
print(app_train.dtypes.value_counts())

The output:

float64    65
int64      41
object     16
dtype: int64

There are 16 non-numeric (object) features.
print(app_train.select_dtypes(include='object').apply(func=pd.Series.nunique, axis=0))
NAME_CONTRACT_TYPE 2
CODE_GENDER 3
FLAG_OWN_CAR 2
FLAG_OWN_REALTY 2
NAME_TYPE_SUITE 7
NAME_INCOME_TYPE 8
NAME_EDUCATION_TYPE 5
NAME_FAMILY_STATUS 6
NAME_HOUSING_TYPE 6
OCCUPATION_TYPE 18
WEEKDAY_APPR_PROCESS_START 7
ORGANIZATION_TYPE 58
FONDKAPREMONT_MODE 4
HOUSETYPE_MODE 3
WALLSMATERIAL_MODE 7
EMERGENCYSTATE_MODE 2
dtype: int64
print(app_train['ORGANIZATION_TYPE'].unique())
['Business Entity Type 3' 'School' 'Government' 'Religion' 'Other' 'XNA'
'Electricity' 'Medicine' 'Business Entity Type 2' 'Self-employed'
'Transport: type 2' 'Construction' 'Housing' 'Kindergarten'
'Trade: type 7' 'Industry: type 11' 'Military' 'Services'
'Security Ministries' 'Transport: type 4' 'Industry: type 1' 'Emergency'
'Security' 'Trade: type 2' 'University' 'Transport: type 3' 'Police'
'Business Entity Type 1' 'Postal' 'Industry: type 4' 'Agriculture'
'Restaurant' 'Culture' 'Hotel' 'Industry: type 7' 'Trade: type 3'
'Industry: type 3' 'Bank' 'Industry: type 9' 'Insurance' 'Trade: type 6'
'Industry: type 2' 'Transport: type 1' 'Industry: type 12' 'Mobile'
'Trade: type 1' 'Industry: type 5' 'Industry: type 10' 'Legal Services'
'Advertising' 'Trade: type 5' 'Cleaning' 'Industry: type 13'
'Trade: type 4' 'Telecom' 'Industry: type 8' 'Realtor' 'Industry: type 6']
ORGANIZATION_TYPE has 58 distinct values, many of which are numbered subtypes of the same organization class. Merging the subtypes (Business Entity, Transport, Industry, Trade) shrinks the 58 categories to 35 and keeps the later one-hot encoding compact:

def organization_type_convert(df):
    df['ORGANIZATION_TYPE'].replace(
        regex=[r'Business Entity[\s\S]*', r'Transport[\s\S]*', r'Industry[\s\S]*', r'Trade[\s\S]*'],
        value=['Business Entity', 'Transport', 'Industry', 'Trade'],
        inplace=True)

organization_type_convert(app_train)
organization_type_convert(app_test)
print(app_train['ORGANIZATION_TYPE'].unique())
['Business Entity' 'School' 'Government' 'Religion' 'Other' 'XNA'
'Electricity' 'Medicine' 'Self-employed' 'Transport' 'Construction'
'Housing' 'Kindergarten' 'Trade' 'Industry' 'Military' 'Services'
'Security Ministries' 'Emergency' 'Security' 'University' 'Police'
'Postal' 'Agriculture' 'Restaurant' 'Culture' 'Hotel' 'Bank' 'Insurance'
'Mobile' 'Legal Services' 'Advertising' 'Cleaning' 'Telecom' 'Realtor']
def label_encode_unique_under_two(train, test):
    label_encoder = LabelEncoder()
    for col in train:
        # Binary object features get simple 0/1 label encoding.
        if train[col].dtype == 'object' and len(train[col].unique()) <= 2:
            label_encoder.fit(train[col])
            train[col] = label_encoder.transform(train[col])
            test[col] = label_encoder.transform(test[col])
    return train, test

def onehot_encode_unique_over_two(train, test):
    # get_dummies one-hot encodes every remaining object column.
    train = pd.get_dummies(train)
    test = pd.get_dummies(test)
    return train, test

app_train, app_test = label_encode_unique_under_two(app_train, app_test)
app_train, app_test = onehot_encode_unique_over_two(app_train, app_test)
print(app_train.shape)
print(app_test.shape)
(307511, 220)
(48744, 216)
One-hot encoding created columns in the training set for categories that never occur in the test set (220 columns vs. 216). An inner-join align keeps only the columns the two frames share; TARGET is saved first and restored afterwards (a toy illustration follows the output):

train_target = app_train['TARGET']
app_train, app_test = app_train.align(other=app_test, join='inner', axis=1)
app_train['TARGET'] = train_target
print(app_train.shape)
print(app_test.shape)

(307511, 217)
(48744, 216)
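To see what the inner-join align does, here is a toy example with hypothetical frames:

import pandas as pd

a = pd.DataFrame({'x': [1], 'y': [2], 'z': [3]})  # e.g. the one-hot encoded training set
b = pd.DataFrame({'x': [4], 'z': [5]})            # e.g. the one-hot encoded test set

# join='inner' on axis=1 keeps only the columns present in both frames.
a2, b2 = a.align(b, join='inner', axis=1)
print(list(a2.columns))  # ['x', 'z'] -- the column 'y' unique to a is dropped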
def plot_day_employ(train):
    print(train['DAYS_EMPLOYED'].describe())
    train['DAYS_EMPLOYED'].plot.hist(title='Distribution of days employed')
    plt.xlabel('Days employed')
    plt.ylabel('Number of applicants')
    plt.show()

plot_day_employ(app_train)
count 307511.000000
mean 63815.045904
std 141275.766519
min -17912.000000
25% -2760.000000
50% -1213.000000
75% -289.000000
max 365243.000000
Name: DAYS_EMPLOYED, dtype: float64
The describe output shows a suspicious maximum of 365243 days (roughly 1000 years of employment). Checking whether all extreme values share that single placeholder:

anoms = app_train[app_train['DAYS_EMPLOYED'] > 300000]
anoms_one = app_train[app_train['DAYS_EMPLOYED'] == 365243]
non_anoms = app_train[app_train['DAYS_EMPLOYED'] != 365243]
print(anoms.shape)
print(anoms_one.shape)

(55374, 217)
(55374, 217)

All 55374 extreme values are exactly 365243, so they form a single anomalous placeholder.
print('Mean repayment risk of the anomalous rows: %0.2f%%' % (100 * anoms['TARGET'].mean()))
print('Mean repayment risk of the normal rows: %0.2f%%' % (100 * non_anoms['TARGET'].mean()))

Mean repayment risk of the anomalous rows: 5.40%
Mean repayment risk of the normal rows: 8.66%
The anomalous rows actually default less often (5.40% vs. 8.66%), so instead of dropping them we record a boolean flag and replace the placeholder with NaN, to be imputed later:

app_train['DAYS_EMPLOYED_ANOM'] = app_train['DAYS_EMPLOYED'] == 365243
app_train['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace=True)
app_test['DAYS_EMPLOYED_ANOM'] = app_test['DAYS_EMPLOYED'] == 365243
app_test['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace=True)
plot_day_employ(app_train)
count 252137.000000
mean -2384.169325
std 2338.360162
min -17912.000000
25% -3175.000000
50% -1648.000000
75% -767.000000
max 0.000000
Name: DAYS_EMPLOYED, dtype: float64
Convert DAYS_BIRTH and DAYS_EMPLOYED from negative day counts into positive years and rename the columns accordingly:

app_train['DAYS_BIRTH'] = app_train['DAYS_BIRTH'].abs() / 365
app_train['DAYS_EMPLOYED'] = app_train['DAYS_EMPLOYED'].abs() / 365
app_test['DAYS_BIRTH'] = app_test['DAYS_BIRTH'].abs() / 365
app_test['DAYS_EMPLOYED'] = app_test['DAYS_EMPLOYED'].abs() / 365
app_train.rename(columns={'DAYS_BIRTH': 'YEARS_BIRTH', 'DAYS_EMPLOYED': 'YEARS_EMPLOYED'}, inplace=True)
app_test.rename(columns={'DAYS_BIRTH': 'YEARS_BIRTH', 'DAYS_EMPLOYED': 'YEARS_EMPLOYED'}, inplace=True)
print(app_train['YEARS_BIRTH'].describe())
count 307511.000000
mean 43.936973
std 11.956133
min 20.517808
25% 34.008219
50% 43.150685
75% 53.923288
max 69.120548
Name: YEARS_BIRTH, dtype: float64
age_data = app_train.loc[:, ['TARGET', 'YEARS_BIRTH']]
age_data.loc[:, 'YEARS_BINNED'] = pd.cut(age_data.loc[:, 'YEARS_BIRTH'], bins=np.linspace(20, 70, num=11))
age_group = age_data.groupby('YEARS_BINNED').mean()
print(age_group.index)
CategoricalIndex([(20.0, 25.0], (25.0, 30.0], (30.0, 35.0], (35.0, 40.0],
(40.0, 45.0], (45.0, 50.0], (50.0, 55.0], (55.0, 60.0],
(60.0, 65.0], (65.0, 70.0]],
categories=[(20.0, 25.0], (25.0, 30.0], (30.0, 35.0], (35.0, 40.0], (40.0, 45.0], (45.0, 50.0], (50.0, 55.0], (55.0, 60.0], ...], ordered=True, name='YEARS_BINNED', dtype='category')
plt.bar(age_group.index.astype(str), age_group['TARGET'])
plt.title('Repayment risk by age group')
plt.xticks(rotation=75)
plt.xlabel('Age group')
plt.ylabel('Repayment risk')
plt.show()
correlations = app_train.corr()['TARGET'].sort_values(ascending=False)
print('Ten most positively correlated features:\n', correlations.head(10))
print('Ten most negatively correlated features:\n', correlations.tail(10))

Ten most positively correlated features:
TARGET 1.000000
REGION_RATING_CLIENT_W_CITY 0.060893
REGION_RATING_CLIENT 0.058899
NAME_INCOME_TYPE_Working 0.057481
DAYS_LAST_PHONE_CHANGE 0.055218
CODE_GENDER_M 0.054713
DAYS_ID_PUBLISH 0.051457
REG_CITY_NOT_WORK_CITY 0.050994
NAME_EDUCATION_TYPE_Secondary / secondary special 0.049824
FLAG_EMP_PHONE 0.045982
Name: TARGET, dtype: float64
Ten most negatively correlated features:
DAYS_EMPLOYED_ANOM -0.045987
ORGANIZATION_TYPE_XNA -0.045987
NAME_INCOME_TYPE_Pensioner -0.046209
CODE_GENDER_F -0.054704
NAME_EDUCATION_TYPE_Higher education -0.056593
YEARS_EMPLOYED -0.074958
YEARS_BIRTH -0.078239
EXT_SOURCE_1 -0.155317
EXT_SOURCE_2 -0.160472
EXT_SOURCE_3 -0.178919
Name: TARGET, dtype: float64
def plot_kde_curve(var_name, df):
    """
    Plot kernel density curves relating a variable to the target.
    :param var_name: variable name
    :param df: DataFrame
    :return:
    """
    corr = df.loc[:, 'TARGET'].corr(df.loc[:, var_name], method='pearson')
    avg_paid = df.loc[df['TARGET'] == 0, var_name].median()
    avg_not_paid = df.loc[df['TARGET'] == 1, var_name].median()
    print('Correlation between %s and TARGET: %0.4f' % (var_name, corr))
    print('Median of the variable when there is no repayment risk: %0.4f' % avg_paid)
    print('Median of the variable when there is repayment risk: %0.4f' % avg_not_paid)
    plt.figure(figsize=(12, 6))
    # distplot is deprecated in newer seaborn; kdeplot/histplot are its replacements.
    sns.distplot(df.loc[df['TARGET'] == 0, var_name], label='TARGET==0 (no risk)')
    sns.distplot(df.loc[df['TARGET'] == 1, var_name], label='TARGET==1 (at risk)')
    plt.xlabel(var_name)
    plt.ylabel('Kernel density estimate')
    plt.title('Distribution of %s' % var_name)
    plt.legend()
    plt.show()

plot_kde_curve('YEARS_BIRTH', app_train)
Correlation between YEARS_BIRTH and TARGET: -0.0782
Median of the variable when there is no repayment risk: 43.4986
Median of the variable when there is repayment risk: 39.1288
def base_lr_train(train_data, test_data):
    target = train_data['TARGET']
    train = train_data.drop(columns=['TARGET'])
    test = test_data.copy()
    # Fill missing values with per-column medians learned on the training set.
    imputer = Imputer(strategy='median')
    imputer.fit(train)
    train = imputer.transform(train)
    test = imputer.transform(test)
    # Scale every feature to [0, 1].
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(train)
    train = scaler.transform(train)
    test = scaler.transform(test)
    print('Training data shape', train.shape)
    print('Test data shape', test.shape)
    log_reg = LogisticRegression(C=0.001)
    log_reg.fit(train, target)
    # Probability of the positive class (TARGET == 1, i.e. at risk).
    log_reg_predict_not_paid = log_reg.predict_proba(test)[:, 1]
    submit = test_data[['SK_ID_CURR']].copy()  # copy to avoid a SettingWithCopyWarning
    submit.loc[:, 'TARGET'] = log_reg_predict_not_paid
    print(submit.head(5))
    submit.to_csv(dir_path + '/result/log_reg_baseline.csv', index=False)

base_lr_train(app_train, app_test)
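The baseline above only writes out a submission file. For a quick local sanity check, one can estimate ROC AUC (the competition's evaluation metric) by cross-validation. A sketch, assuming scikit-learn >= 0.22 for SimpleImputer; local_cv_auc is a hypothetical helper, with imputation and scaling wrapped in a pipeline so each fold fits them only on its own training split:

from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

def local_cv_auc(train_data):
    target = train_data['TARGET']
    features = train_data.drop(columns=['TARGET'])
    model = make_pipeline(SimpleImputer(strategy='median'),
                          MinMaxScaler(feature_range=(0, 1)),
                          LogisticRegression(C=0.001))
    # 3-fold cross-validated area under the ROC curve.
    scores = cross_val_score(model, features, target, cv=3, scoring='roc_auc')
    print('CV ROC AUC: %0.4f +/- %0.4f' % (scores.mean(), scores.std()))

# local_cv_auc(app_train)  # hypothetical helper, not part of the original pipeline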
def base_rf_train(train_data, test_data):
    target = train_data['TARGET']
    train = train_data.drop(columns=['TARGET'])
    test = test_data.copy()
    imputer = Imputer(strategy='median')
    imputer.fit(train)
    train = imputer.transform(train)
    test = imputer.transform(test)
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(train)
    train = scaler.transform(train)
    test = scaler.transform(test)
    print('Training data shape', train.shape)
    print('Test data shape', test.shape)
    random_forest = RandomForestClassifier(n_estimators=100, random_state=42, verbose=1, n_jobs=-1)
    random_forest.fit(train, target)
    random_forest_predict_not_paid = random_forest.predict_proba(test)[:, 1]
    submit = test_data[['SK_ID_CURR']].copy()  # copy to avoid a SettingWithCopyWarning
    submit.loc[:, 'TARGET'] = random_forest_predict_not_paid
    print(submit.head(5))
    submit.to_csv(dir_path + '/result/random_forest_baseline.csv', index=False)
    print(random_forest.max_features)  # inspect the max_features setting used by the forest

base_rf_train(app_train, app_test)
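Beyond inspecting random_forest.max_features, the fitted forest's feature_importances_ usually say more about the model. A minimal hedged sketch; show_feature_importances is a hypothetical helper, and the column names must be captured before the Imputer converts the DataFrame into a bare numpy array:

import pandas as pd

def show_feature_importances(model, feature_names, top_n=10):
    # Pair each importance with its column name and show the strongest ones.
    importances = pd.Series(model.feature_importances_, index=feature_names)
    print(importances.sort_values(ascending=False).head(top_n))

# Usage sketch: inside base_rf_train, capture names before imputation, e.g.
#   feature_names = train.columns
#   ... fit random_forest ...
#   show_feature_importances(random_forest, feature_names)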