当前位置:   article > 正文

信用卡客户画像(年龄,性别,收入,支出 和信用卡种的关系)_持卡人人群画像

持卡人人群画像

import os
import pandas as pd

os.chdir(r'H:\2019-2-3新华书店笔记以及资料\资料\HW5')

card=pd.read_csv(r"card.csv",encoding="gbk") #读取数据
disp=pd.read_csv(r"disp.csv",encoding="gbk")
clients=pd.read_csv(r"clients.csv",encoding="gbk")

card.head() #查看数据分布

 card_iddisp_idissuedtype
0100592851993-11-07普通卡
11045881994-01-19普通卡
274749151994-02-05普通卡
3704391994-02-08普通卡
457736871994-02-15普通卡

disp.head()

 disp_idclient_idaccount_idtype
0111所有者
1222所有者
2332用户
3443所有者
4553用户

clients.head()

 client_idsexbirth_datedistrict_id
011970-12-1318
121945-02-041
231940-10-091
341956-12-015
451960-07-035

import sqlite3 # sqlite3相当于轻量版,更多功能可使用SQLAlchemy

con = sqlite3.connect(':memory:') # 数据库连接,使用sql连接查询表

card.to_sql('card', con)

disp.to_sql('disp', con)

clients.to_sql('clients', con)

car_sql='''select a.*,c.sex,c.birth_date,c.district_id from card a left join disp b on a.disp_id=b.disp_id left join clients c on b.client_id=c.client_id where b.type="所有者"'''

card_t=pd.read_sql(car_sql, con)

card_t.head()

 indexcard_iddisp_idissuedtypesexbirth_datedistrict_id
00100592851993-11-07普通卡1968-01-2874
111045881994-01-19普通卡1960-10-2061
2274749151994-02-05普通卡1963-07-1940
33704391994-02-08普通卡1968-09-1251
4457736871994-02-15普通卡1972-02-0649

from pylab import mpl

mpl.rcParams['font.sans-serif'] = ['SimHei'] # 指定默认字体

mpl.rcParams['axes.unicode_minus'] = False # 解决保存图像是负号'-'显示为方块的问题

card_t['type'].value_counts().plot(kind='pie',autopct='%.1f%%') #查看数据分布 饼图

<matplotlib.axes._subplots.AxesSubplot at 0x167ec390>

 

#1、发卡趋势
#总体发卡趋势
from  datetime  import  *  
card_t['issued_date']=pd.to_datetime(card_t['issued'])
card_t['issued_year']=card_t['issued_date'].map(lambda x:x.year)
card_t.card_id.groupby(card_t['issued_year']).count().plot(kind="bar") #年龄和发卡的关系
<matplotlib.axes._subplots.AxesSubplot at 0x9354358>

#2、不同卡的分布

card_t.type.value_counts().plot(kind="pie",autopct='%.1f%%') #卡种分布

<matplotlib.axes._subplots.AxesSubplot at 0x93aa8d0>

 

pd.crosstab(card_t.issued_year,card_t.type).plot(kind = 'bar') #卡种和年龄的关系分布
<matplotlib.axes._subplots.AxesSubplot at 0x940cb00>

 

t1=pd.crosstab(card_t.issued_year,card_t.type) #卡种和年龄占比对比

t1["sum1"]=t1.sum(1)

t2=t1.div(t1.sum1,axis = 0)

t2.drop("sum1",1).plot(kind = 'bar',stacked= True)

<matplotlib.axes._subplots.AxesSubplot at 0xa8b5d30>

 

import matplotlib.pyplot as plt
labels=["青年卡","普通卡","金卡"]
y1=t1.loc[:,"青年卡"]#.astype('int')
y2=t1.loc[:,"普通卡"]#.astype('int')
y3=t1.loc[:,"金卡"]#.astype('int')
x=t1.index#.astype('int')
plt.stackplot(x,y1,y2,y3,labels = labels) # 图表类型
plt.title('发卡趋势')
plt.ylabel('发卡量')
plt.legend(loc = 'upper left')# 标签位置
plt.show()

 

?plt.stackplot(x,y1,y2,y3,labels = labels) 
pd.crosstab(card_t.type,card_t.sex).plot(kind='bar')#性别和卡种数量对比
<matplotlib.axes._subplots.AxesSubplot at 0xa9960b8>

 

xb=pd.crosstab(card_t.type,card_t.sex) #性别和卡种百分比对比
xb.div(xb.sum(1),axis=0).plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0xaa24860>

 

from stack2dim import *
stack2dim(card_t,'type','sex') #调用之前写的库,查看卡种和性别关系(柱子粗细代表数量,纵坐标代表百分比)

 

card_t['age']=(pd.to_datetime(card_t['issued'])-pd.to_datetime(card_t['birth_date']))
card_t['age1']=card_t['age'].map(lambda x:x.days/365)
import seaborn as sns
sns.boxplot(x='type',y='age1',data=card_t) #查看何须图,年龄和卡种分布
<matplotlib.axes._subplots.AxesSubplot at 0xc104940>

 

 
trans=pd.read_csv(r"trans.csv",encoding="gbk")
trans.to_sql('trans', con)
card_t.to_sql('card_t', con)
D:\anaconda\lib\site-packages\IPython\core\interactiveshell.py:2785: DtypeWarning: Columns (8) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
D:\anaconda\lib\site-packages\pandas\io\sql.py:450: UserWarning: the 'timedelta' type is not supported, and will be written as integer values (ns frequency) to the database.
  chunksize=chunksize, dtype=dtype)

 

trans.head()
 trans_idaccount_iddatetypeoperationamountbalancek_symbolbankaccount
069524723781993-01-01信贷资金$700$700NaNNaNNaN
11718125761993-01-01信贷资金$900$900NaNNaNNaN
22072647041993-01-01信贷资金$1,000$1,000NaNNaNNaN
3111724738181993-01-01信贷资金$600$600NaNNaNNaN
457937319721993-01-02信贷资金$400$400NaNNaNNaN

card_t.head()

 indexcard_iddisp_idissuedtypesexbirth_datedistrict_idissued_dateissued_yearageage1
00100592851993-11-07普通卡1968-01-28741993-11-0719939415 days25.794521
111045881994-01-19普通卡1960-10-20611994-01-19199412144 days33.271233
2274749151994-02-05普通卡1963-07-19401994-02-05199411159 days30.572603
33704391994-02-08普通卡1968-09-12511994-02-0819949280 days25.424658
4457736871994-02-15普通卡1972-02-06491994-02-1519948045 days22.041096

car_sql='''

select a.card_id,a.issued,a.type,c.type as t_type,c.amount,c.balance,c.date as t_date
  from card as a
  left join disp as b on a.disp_id=b.disp_id
  left join trans as c on b.account_id=c.account_id
  where b.type="所有者"
  order by a.card_id,c.date
'''  #使用sql连接表
card_t2=pd.read_sql(car_sql, con)
card_t2.head()
 card_idissuedtypet_typeamountbalancet_date
011998-10-16金卡$900$9001996-11-24
111998-10-16金卡$33975$348751996-12-07
211998-10-16金卡$6,600$282751996-12-24
311998-10-16金卡$30$283051996-12-31
411998-10-16金卡$22650$509551997-01-07

 

card_t2['issued']=pd.to_datetime(card_t2['issued'])
card_t2['t_date']=pd.to_datetime(card_t2['t_date'])
card_t2.head()
 card_idissuedtypet_typeamountbalancet_date
011998-10-16金卡$900$9001996-11-24
111998-10-16金卡$33975$348751996-12-07
211998-10-16金卡$6,600$282751996-12-24
311998-10-16金卡$30$283051996-12-31
411998-10-16金卡$22650$509551997-01-07

 

card_t2['balance2'] = card_t2['balance'].map(lambda x: int(''.join(x[1:].split(','))))
card_t2['amount2'] = card_t2['amount'].map(lambda x: int(''.join(x[1:].split(','))))
import datetime #办卡前一年的平均余额
card_t3 = card_t2[card_t2.issued>card_t2.t_date][card_t2.issued<card_t2.t_date+datetime.timedelta(days=365)]
card_t3.head()
D:\anaconda\lib\site-packages\ipykernel_launcher.py:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  
 card_idissuedtypet_typeamountbalancet_datebalance2amount2
4711998-10-16金卡$13600$579501997-10-205795013600
4811998-10-16金卡$15$581971997-10-315819715
4911998-10-16金卡$262$582121997-10-3158212262
5011998-10-16金卡$4,880$533171997-11-06533174880
5111998-10-16金卡$22650$759671997-11-077596722650

 

#card_t3["avg_balance"] = card_t3.groupby('card_id')['balance2'].mean() #生成的series  下面agg生成的dataframe
card_t4=card_t3.groupby(['type','card_id'])['balance2'].agg([('avg_balance','mean')])
card_t4.head()
#card_t4.to_sql('card_t4', con)
  avg_balance
typecard_id 
普通卡238898.610169
454222.060976
746888.702381
955854.223881
1051387.500000

 

card_t3.groupby(['type','card_id'])['balance2'].agg([('avg_balance','mean'),'count']).head()
  avg_balancecount
typecard_id  
普通卡238898.61016959
454222.06097682
746888.70238184
955854.22388167
1051387.50000066

 

card_t5=card_t4.reset_index()  #为透视表添加行的名,type类型不变
#card_t5=pd.read_sql('select * from card_t4', con)
sns.boxplot(x = 'type', y = 'avg_balance', data = card_t5)
<matplotlib.axes._subplots.AxesSubplot at 0x21afb358>

 

#%%6、不同类型卡的持卡人在办卡前一年内的平均收入对比
type_dict = {'借':'out','贷':'income'}
card_t3['type1'] = card_t3.t_type.map(type_dict)
card_t6= card_t3.groupby(['type','card_id','type1'])[['amount2']].sum()
card_t6.head()
card_t6.to_sql('card_t6', con)
card_t7=card_t6.reset_index()
#card_t7=pd.read_sql('select * from card_t6', con)
card_t7.to_sql('card_t7', con)
card_t8=pd.read_sql('select * from card_t7 where type1="income"', con)
card_t6.head()
   amount2
typecard_idtype1 
普通卡2income193911
out196384
4income474142
out357224
7income299797

 

sns.boxplot(x = 'type', y = 'amount2', data = card_t8)  #收入和卡种的关系
<matplotlib.axes._subplots.AxesSubplot at 0x15fd6588>

 

card_t8.head()
 indextypecard_idtype1amount2
00普通卡2income193911
12普通卡4income474142
24普通卡7income299797
36普通卡9income322548
48普通卡10income644560

 

card_t9=pd.read_sql('select * from card_t7 where type1="out"', con) #支出和卡种的关系
sns.boxplot(x = 'type', y = 'amount2', data = card_t9)
<matplotlib.axes._subplots.AxesSubplot at 0xa993a58>

声明:本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:【wpsshop博客】
推荐阅读
相关标签
  

闽ICP备14008679号