赞
踩
# Show the current working directory, switch to the project folder, and
# confirm the change.
# NOTE(review): the folder set here and the absolute path passed to
# read.csv() below point at different user directories -- confirm which one
# is correct for the machine this runs on.
getwd()
setwd('C:/Users/blabla/Desktop/数据分析/R语言')
getwd()
list.dirs()  # list the directories under the working directory
list.files() # list the files in the working directory

# Load the Prosper loan data set.
# stringsAsFactors is spelled out because its default changed in R 4.0.0;
# pinning it to TRUE reads text columns as factors on every R version, which
# is the behaviour this analysis was written against.
pf <- read.csv('C:/Users/孔啊吱/Desktop/数据分析/R语言/prosperLoanData.csv',
               stringsAsFactors = TRUE)
names(pf)   # column names
str(pf)     # variable names and their types
dim(pf)     # number of rows and columns
head(pf, 2) # first two rows
# 变量解释:从原始数据中选取以下分析所需的字段
# Columns of the raw table that the rest of the analysis works with.
keep_cols <- c(
  "ListingKey", "LoanOriginationDate",
  "BorrowerRate", "Occupation", "Term",
  "EmploymentStatus", "BorrowerState",
  # note the "..numeric." suffix in the next column name
  "ListingCategory..numeric.", "IncomeRange",
  "CreditGrade", "ProsperRating..Alpha.", "LenderYield",
  "LoanStatus", "EmploymentStatusDuration",
  "IsBorrowerHomeowner", "CreditScoreRangeLower",
  "CreditScoreRangeUpper", "InquiriesLast6Months",
  "DelinquenciesLast7Years", "BankcardUtilization",
  "DebtToIncomeRatio", "StatedMonthlyIncome",
  "LoanOriginalAmount"
)
# Keep only those columns, in the order listed above.
data <- pf[keep_cols]
# 1. 借款人信用评级(CreditGrade / ProsperRating(Alpha)):
#    CreditGrade 为 2009 年 7 月 1 日之前的数据,ProsperRating(Alpha) 为 2009 年 7 月 1 日之后的数据,
#    这里将两者合并为一个字段。
# Merge the two rating columns into a single rating per loan: take
# CreditGrade when it is filled in and fall back to ProsperRating..Alpha.
# when it is blank or missing.
#
# The comparison is done on character values rather than on factor integer
# codes, so the result is the same whether read.csv() delivered these columns
# as factors or as plain strings, and the whole column is handled in one
# vectorised step instead of a row-by-row loop.
temp <- pf[c("CreditGrade", "ProsperRating..Alpha.")]
grade <- as.character(temp[[1]])
rating <- as.character(temp[[2]])
use_rating <- is.na(grade) | grade == ""
temp$new <- ifelse(use_rating, rating, grade)

# Only these seven ratings are kept as levels; any other value (blank in both
# columns, or a grade outside this list) becomes NA.
temp$new <- factor(temp$new,
                   levels = c("A", "AA", "B", "C", "D", "E", "HR"))
data$creditlevel <- temp$new
# Label each loan by whether it originated before the 2009-07-01 cutoff or
# on/after it. The origination value is parsed as a Date (only the leading
# yyyy-mm-dd part is read; any trailing time is ignored) so the comparison is
# chronological rather than a collation-dependent string comparison, and a
# loan dated exactly on the cutoff is labelled instead of being left NA.
# Values that cannot be parsed stay NA.
phase_cutoff <- as.Date("2009-07-01")
origination_date <- as.Date(as.character(data$LoanOriginationDate),
                            format = "%Y-%m-%d")
data$Phase <- ifelse(origination_date >= phase_cutoff,
                     "After 2009", "Before 2009")
table(data$Phase)
# 输出结果:
#  After 2009 Before 2009
#       84997       28940
# 2. 信用评分(CreditScoreRangeLower / CreditScoreRangeUpper):
#    使用消费者信用评级机构提供的借款人信用评分范围下限值与上限值的均值,作为最终评估的信用评分。
# Collapse the reported score range into one number: the midpoint of its
# lower and upper bounds.
score_low <- data[["CreditScoreRangeLower"]]
score_high <- data[["CreditScoreRangeUpper"]]
data[["creditscore"]] <- (score_low + score_high) / 2
# Peek at the first two computed scores.
head(data[["creditscore"]], n = 2L)
# 3. 贷款状态处理:平台把贷款状态分为 7 大类:
#    Cancelled(取消)、Chargedoff(冲销,投资人有损失)、Completed(正常完成,投资人无损失)、
#    Current(贷款还款中)、Defaulted(坏账,投资人有损失)、
#    FinalPaymentInProgress(最后还款中,投资人无损失)、PastDue(逾期还款,投资人无损失)。
#    为了方便统计及后续的预测模型,我们将贷款状态分为两类:问题贷款和正常贷款。
#    正常贷款包括 Completed、Current、FinalPaymentInProgress 三种,其余都归为问题贷款。
#    (资料来自 https://zhuanlan.zhihu.com/p/39812067)
# levels(data$LoanStatus)
table(data$LoanStatus)

# Recode on a character copy of the status column. Writing a new label
# straight into a factor column turns every affected row into NA (the label
# is not an existing level), which would leave all past-due loans without a
# status and without a normal/problem classification.
status_was_factor <- is.factor(data$LoanStatus)
loan_status <- as.character(data$LoanStatus)

# Collapse every past-due bucket into the single label "PastDue".
PastDue <- c("Past Due (1-15 days)", "Past Due (16-30 days)",
             "Past Due (31-60 days)", "Past Due (61-90 days)",
             "Past Due (91-120 days)", "Past Due (>120 days)")
loan_status[loan_status %in% PastDue] <- "PastDue"

# Write the recoded status back, keeping the column's original type.
data$LoanStatus <- if (status_was_factor) factor(loan_status) else loan_status

# Two-way classification: Completed, Current and FinalPaymentInProgress are
# normal loans; PastDue, Cancelled, Chargedoff and Defaulted are problem
# loans. Any other status (including NA) is left as NA.
normal_status <- c("Current", "FinalPaymentInProgress", "Completed")
problemloan <- c("PastDue", "Cancelled", "Chargedoff", "Defaulted")
data$newLoanStatus <- NA_character_
data$newLoanStatus[loan_status %in% normal_status] <- "normalloan"
data$newLoanStatus[loan_status %in% problemloan] <- "problemloan"
# 4. 缺失值处理
# 4.1 缺失值查看
# Count the missing values in every column of `data`. vapply() fixes the
# result to one integer per column, so the return type cannot change with the
# shape of the input the way sapply()'s can.
vapply(data, function(column) sum(is.na(column)), integer(1))
# 输出结果(各字段的缺失值个数):
#                ListingKey       LoanOriginationDate              BorrowerRate
#                         0                         0                         0
#                Occupation                      Term          EmploymentStatus
#                         0                         0                         0
#             BorrowerState ListingCategory..numeric.               IncomeRange
#                         0                         0                         0
#               CreditGrade     ProsperRating..Alpha.               LenderYield
#                         0                         0                         0
#                LoanStatus  EmploymentStatusDuration       IsBorrowerHomeowner
#                      2067                      7625                         0
#     CreditScoreRangeLower     CreditScoreRangeUpper      InquiriesLast6Months
#                       591                       591                       697
#   DelinquenciesLast7Years       BankcardUtilization         DebtToIncomeRatio
#                       990                      7604                      8554
#       StatedMonthlyIncome        LoanOriginalAmount               creditlevel
#                         0                         0                       272
#                     Phase               creditscore             newLoanStatus
#                         0                       591                      2067
# 4.2 缺失值处理(使用均值替代)
# Mean imputation: in each listed column, replace the missing entries with
# the mean of that column's observed (non-missing) values.
for (impute_col in c("DebtToIncomeRatio", "EmploymentStatusDuration")) {
  column_values <- data[[impute_col]]
  column_values[is.na(column_values)] <- mean(column_values, na.rm = TRUE)
  data[[impute_col]] <- column_values
}
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。