2.3 概率抽样
一、简单随机抽样
R
复制代码
# 数据预处理
LoanStats3c = read.csv("D:/OneDrive - stu.fynu.edu.cn/大四上学期/ysq-大数据探索性分析/data/2数据集二:Loan Data--Lending Club/LoanStats3c/LoanStats3c.csv", header = TRUE, fill = TRUE, comment.char = "")
# str(LoanStats3c)
# 将有缺失值的列进行中位数填补(针对数值列)
# 检测列类型为数值的列,输出列序号为列表
num_cols <- sapply(LoanStats3c, is.numeric)
num_col_indices <- which(num_cols)
# 对每个数值列,如果存在缺失值则用中位数填补
for(i in num_col_indices) {
col_name <- names(LoanStats3c)[i]
na_count <- sum(is.na(LoanStats3c[[i]]))
if(na_count > 0) {
med <- median(LoanStats3c[[i]], na.rm = TRUE)
if(is.na(med)) {
# 全为 NA 的列,输出提示并跳过
message(sprintf("跳过列 %s (索引 %d):全为 NA,无法用中位数填补", col_name, i))
} else {
LoanStats3c[[i]][is.na(LoanStats3c[[i]])] <- med
message(sprintf("列 %s (索引 %d):用中位数 %s 填补 %d 个缺失值", col_name, i, format(med), na_count))
}
}
}
# 再次统计数值列的缺失值数(用于检查)
na_summary <- sapply(LoanStats3c[num_col_indices], function(x) sum(is.na(x)))
print(na_summary)
# 写入数值列索引到csv文件,供后续分析使用
write.csv(num_col_indices, file = "D:/OneDrive - stu.fynu.edu.cn/大四上学期/ysq-大数据探索性分析/demo2/num_col_indices.csv", row.names = FALSE)
# 保存清洗后的数据集
write.csv(LoanStats3c, file = "D:/OneDrive - stu.fynu.edu.cn/大四上学期/ysq-大数据探索性分析/demo2/LoanStats3c_imputed.csv", row.names = FALSE)
# str(LoanStats3c) // 可取消注释查看数据结构
复制代码
列 member_id (索引 2):用中位数 22953173 填补 4 个缺失值
列 loan_amnt (索引 3):用中位数 13000 填补 4 个缺失值
列 funded_amnt (索引 4):用中位数 13000 填补 4 个缺失值
列 loan_amnt (索引 3):用中位数 13000 填补 4 个缺失值
列 funded_amnt (索引 4):用中位数 13000 填补 4 个缺失值
列 funded_amnt_inv (索引 5):用中位数 13000 填补 4 个缺失值
列 installment (索引 8):用中位数 384.14 填补 4 个缺失值
列 funded_amnt_inv (索引 5):用中位数 13000 填补 4 个缺失值
列 installment (索引 8):用中位数 384.14 填补 4 个缺失值
列 annual_inc (索引 14):用中位数 65000 填补 4 个缺失值
列 dti (索引 25):用中位数 17.63 填补 4 个缺失值
列 delinq_2yrs (索引 26):用中位数 0 填补 4 个缺失值
列 annual_inc (索引 14):用中位数 65000 填补 4 个缺失值
列 dti (索引 25):用中位数 17.63 填补 4 个缺失值
列 delinq_2yrs (索引 26):用中位数 0 填补 4 个缺失值
列 inq_last_6mths (索引 28):用中位数 0 填补 4 个缺失值
列 mths_since_last_delinq (索引 29):用中位数 30 填补 115885 个缺失值
列 mths_since_last_record (索引 30):用中位数 69 填补 194109 个缺失值
列 inq_last_6mths (索引 28):用中位数 0 填补 4 个缺失值
列 mths_since_last_delinq (索引 29):用中位数 30 填补 115885 个缺失值
列 mths_since_last_record (索引 30):用中位数 69 填补 194109 个缺失值
列 open_acc (索引 31):用中位数 11 填补 4 个缺失值
列 open_acc (索引 31):用中位数 11 填补 4 个缺失值
列 pub_rec (索引 32):用中位数 0 填补 4 个缺失值
列 revol_bal (索引 33):用中位数 11686 填补 4 个缺失值
列 pub_rec (索引 32):用中位数 0 填补 4 个缺失值
列 revol_bal (索引 33):用中位数 11686 填补 4 个缺失值
列 total_acc (索引 35):用中位数 24 填补 4 个缺失值
列 out_prncp (索引 37):用中位数 9823.83 填补 4 个缺失值
列 total_acc (索引 35):用中位数 24 填补 4 个缺失值
列 out_prncp (索引 37):用中位数 9823.83 填补 4 个缺失值
列 out_prncp_inv (索引 38):用中位数 9817.7 填补 4 个缺失值
列 total_pymnt (索引 39):用中位数 3478.72 填补 4 个缺失值
列 out_prncp_inv (索引 38):用中位数 9817.7 填补 4 个缺失值
列 total_pymnt (索引 39):用中位数 3478.72 填补 4 个缺失值
列 total_pymnt_inv (索引 40):用中位数 3478.08 填补 4 个缺失值
列 total_rec_prncp (索引 41):用中位数 2152.3 填补 4 个缺失值
列 total_rec_int (索引 42):用中位数 995.42 填补 4 个缺失值
列 total_pymnt_inv (索引 40):用中位数 3478.08 填补 4 个缺失值
列 total_rec_prncp (索引 41):用中位数 2152.3 填补 4 个缺失值
列 total_rec_int (索引 42):用中位数 995.42 填补 4 个缺失值
列 total_rec_late_fee (索引 43):用中位数 0 填补 4 个缺失值
列 total_rec_late_fee (索引 43):用中位数 0 填补 4 个缺失值
列 recoveries (索引 44):用中位数 0 填补 4 个缺失值
列 collection_recovery_fee (索引 45):用中位数 0 填补 4 个缺失值
列 last_pymnt_amnt (索引 47):用中位数 420.64 填补 4 个缺失值
列 recoveries (索引 44):用中位数 0 填补 4 个缺失值
列 collection_recovery_fee (索引 45):用中位数 0 填补 4 个缺失值
列 last_pymnt_amnt (索引 47):用中位数 420.64 填补 4 个缺失值
列 collections_12_mths_ex_med (索引 50):用中位数 0 填补 4 个缺失值
列 mths_since_last_major_derog (索引 51):用中位数 43 填补 169155 个缺失值
列 policy_code (索引 52):用中位数 1 填补 4 个缺失值
列 collections_12_mths_ex_med (索引 50):用中位数 0 填补 4 个缺失值
列 mths_since_last_major_derog (索引 51):用中位数 43 填补 169155 个缺失值
列 policy_code (索引 52):用中位数 1 填补 4 个缺失值
member_id loan_amnt
0 0
funded_amnt funded_amnt_inv
0 0
installment annual_inc
0 0
dti delinq_2yrs
0 0
inq_last_6mths mths_since_last_delinq
0 0
mths_since_last_record open_acc
0 0
pub_rec revol_bal
0 0
total_acc out_prncp
0 0
out_prncp_inv total_pymnt
0 0
total_pymnt_inv total_rec_prncp
0 0
total_rec_int total_rec_late_fee
0 0
recoveries collection_recovery_fee
0 0
last_pymnt_amnt collections_12_mths_ex_med
0 0
mths_since_last_major_derog policy_code
0 0
R
复制代码
# 推荐使用read.csv,并加上fill=TRUE和comment.char=""参数,避免列数不一致报错
LoanStats3c = read.csv("D:/OneDrive - stu.fynu.edu.cn/大四上学期/ysq-大数据探索性分析/demo2/LoanStats3c_imputed.csv", header = TRUE, fill = TRUE, comment.char = "")
names(LoanStats3c) #数据中所有变量名
library(sampling) #抽样函数所在的包
N=dim(LoanStats3c)[1] #总体数量
n=500 #样本数量
srsp=srswor(n,N) #简单随机抽样
srs=getdata(LoanStats3c,srsp) #得到样本
# str(srs)
# 检测列类型为数值的列,输出列序号为列表
new_num_cols <- sapply(srs, is.numeric)
new_num_col_indices <- which(new_num_cols)
# 输出数值列的序号
# print(new_num_col_indices)
- 'id'
- 'member_id'
- 'loan_amnt'
- 'funded_amnt'
- 'funded_amnt_inv'
- 'term'
- 'int_rate'
- 'installment'
- 'grade'
- 'sub_grade'
- 'emp_title'
- 'emp_length'
- 'home_ownership'
- 'annual_inc'
- 'verification_status'
- 'issue_d'
- 'loan_status'
- 'pymnt_plan'
- 'url'
- 'desc'
- 'purpose'
- 'title'
- 'zip_code'
- 'addr_state'
- 'dti'
- 'delinq_2yrs'
- 'earliest_cr_line'
- 'inq_last_6mths'
- 'mths_since_last_delinq'
- 'mths_since_last_record'
- 'open_acc'
- 'pub_rec'
- 'revol_bal'
- 'revol_util'
- 'total_acc'
- 'initial_list_status'
- 'out_prncp'
- 'out_prncp_inv'
- 'total_pymnt'
- 'total_pymnt_inv'
- 'total_rec_prncp'
- 'total_rec_int'
- 'total_rec_late_fee'
- 'recoveries'
- 'collection_recovery_fee'
- 'last_pymnt_d'
- 'last_pymnt_amnt'
- 'next_pymnt_d'
- 'last_credit_pull_d'
- 'collections_12_mths_ex_med'
- 'mths_since_last_major_derog'
- 'policy_code'
R
复制代码
# 简单估计比较样本均值和总体均值
meanY = colMeans(LoanStats3c[, num_col_indices], na.rm = TRUE) # 总体均值
meany = colMeans(srs[, new_num_col_indices], na.rm = TRUE) # 样本均值
# 输出对象长度并检查匹配
cat('length(meanY) =', length(meanY), '\n')
cat('length(meany) =', length(meany), '\n')
if(length(meanY) != length(meany)) {
warning('总体均值和样本均值长度不一致:尝试按共有变量对齐')
common_names <- intersect(names(meanY), names(meany))
meanY <- meanY[common_names]
meany <- meany[common_names]
cat('对齐后长度 =', length(meanY), '\n')
}
# 计算差值并显示前几项
md = meanY - meany
print(head(md))
复制代码
length(meanY) = 28
length(meany) = 29
length(meany) = 29
Warning message:
"总体均值和样本均值长度不一致:尝试按共有变量对齐"
对齐后长度 = 28
member_id loan_amnt funded_amnt funded_amnt_inv installment
-411336.24337 -518.87495 -518.87495 -519.04750 -10.08388
annual_inc
-1301.80738
member_id loan_amnt funded_amnt funded_amnt_inv installment
-411336.24337 -518.87495 -518.87495 -519.04750 -10.08388
annual_inc
-1301.80738
二、分层随机抽样
R
复制代码
# 确认存在 grade 列
if(!"grade" %in% names(LoanStats3c)) {
stop("数据集中不存在名为 'grade' 的列,请检查变量名(区分大小写)")
}
# 清洗 grade 字段:去除首尾空白并统一为大写,避免 ' A' 或 'a' 等问题
LoanStats3c$grade <- trimws(as.character(LoanStats3c$grade))
LoanStats3c$grade <- toupper(LoanStats3c$grade)
# 将 grade 转为因子并指定 A-F 顺序(如只存在部分等级,factor 会自动处理)
LoanStats3c$grade <- factor(LoanStats3c$grade, levels = c("A","B","C","D","E","F"))
# 去掉 grade 为 NA 的行(否则会导致 strata 的 size 对应出现 NA)
df_nomiss_grade <- LoanStats3c[!is.na(LoanStats3c$grade), ]
if(nrow(df_nomiss_grade) == 0) stop('去除 NA 后没有可用于分层的数据')
# 计算各等级总量(基于去除 NA 的数据)
grade_counts <- table(df_nomiss_grade$grade)
# 仅保留数据中实际存在的等级(count > 0)
present_levels <- names(grade_counts[grade_counts > 0])
if(length(present_levels) == 0) stop('数据中没有可用的 grade 等级')
counts_present <- as.integer(grade_counts[present_levels])
names(counts_present) <- present_levels
# 若请求样本量 n 大于总体容量,调整 n
total_capacity <- sum(counts_present)
if(n > total_capacity) {
warning(sprintf('请求样本量 n=%d 大于总体容量 %d,已将 n 调整为 %d', n, total_capacity, total_capacity))
n <- total_capacity
}
# 按比例初始分配(仅对存在的等级)
prop_present <- counts_present / sum(counts_present)
wh_present <- as.integer(round(n * prop_present))
# 确保每层不超过该层容量,若超过则先截断
over_idx <- which(wh_present > counts_present)
if(length(over_idx) > 0) {
wh_present[over_idx] <- counts_present[over_idx]
}
# 调整使总和等于 n:若不足则在有剩余容量的层中循环分配
alloc_sum <- sum(wh_present)
diff <- n - alloc_sum
if(diff > 0) {
# 逐步分配剩余名额到还有容量的层,按容量从大到小循环分配
avail <- counts_present - wh_present
order_idx <- order(avail, decreasing = TRUE)
i <- 1
while(diff > 0 && sum(avail) > 0) {
idx <- order_idx[((i - 1) %% length(order_idx)) + 1]
if(avail[idx] > 0) {
wh_present[idx] <- wh_present[idx] + 1
avail[idx] <- avail[idx] - 1
diff <- diff - 1
}
i <- i + 1
}
}
# 若分配过多(diff < 0),从有多余的层中减少
if(diff < 0) {
# 从当前分配量最多的层开始减少,直到匹配
order_idx <- order(wh_present, decreasing = TRUE)
i <- 1
while(diff < 0) {
idx <- order_idx[((i - 1) %% length(order_idx)) + 1]
if(wh_present[idx] > 0) {
wh_present[idx] <- wh_present[idx] - 1
diff <- diff + 1
}
i <- i + 1
}
}
names(wh_present) <- present_levels
message('各层目标样本量(仅列出存在的等级):')
print(wh_present)
# 为 strata 准备数据:按 grade 排序(strata 内部以出现顺序匹配 size)
o <- order(df_nomiss_grade$grade)
data_o <- df_nomiss_grade[o, ]
# 确保 size 的顺序与 data 中实际出现的等级顺序一致
data_levels_in_order <- unique(as.character(data_o$grade))
message('data 中实际出现的等级顺序:')
print(data_levels_in_order)
message('wh_present 的名字:')
print(names(wh_present))
# 从 wh_present 中按 data_levels_in_order 提取样本量(使用安全映射),并确保没有 NA,转换为整数
size_for_strata <- sapply(data_levels_in_order, function(l) {
if(!is.na(l) && l %in% names(wh_present)) as.integer(wh_present[[l]]) else 0
})
names(size_for_strata) <- data_levels_in_order
# 计算每层实际容量(按 data_o 中的计数),使用 sapply 避免 NA
counts_in_order <- sapply(data_levels_in_order, function(l) sum(data_o$grade == l, na.rm = TRUE))
# 再次确保 size_for_strata 不超过每层实际容量
size_for_strata <- pmin(as.integer(size_for_strata), as.integer(counts_in_order))
message('传递给 strata 的 size 向量(按 data 中等级顺序):')
print(size_for_strata)
message('对应每层的容量(counts_in_order):')
print(counts_in_order)
# 若全部为 0,则停止
if(all(size_for_strata == 0)) stop('分配到各层的样本数均为 0,无法抽样,请检查 n 的值或 grade 分布')
# 调用 strata 进行分层无放回抽样
srp <- strata(data = data_o, stratanames = "grade", size = size_for_strata, method = "srswor")
# 提取样本并恢复原始顺序(可选)
# sr 使用原始数据框和 srp 索引来获取对应行
sr <- getdata(LoanStats3c, srp)
srs <- sr
# 更新数值列索引(供后续使用)
new_num_cols <- sapply(srs, is.numeric)
new_num_col_indices <- which(new_num_cols)
message(sprintf("分层抽样完成,样本行数 = %d", nrow(srs)))
# 查看各层实际入选数量
print(table(srs$grade))
# 输出入选样本前几行
print(head(srs))
# 可选:保存分层样本
write.csv(srs, file = "D:/OneDrive - stu.fynu.edu.cn/大四上学期/ysq-大数据探索性分析/demo2/srs_by_grade.csv", row.names = FALSE)
复制代码
各层目标样本量(仅列出存在的等级):
A B C D E F
77 132 143 92 43 13
data 中实际出现的等级顺序:
[1] "A" "B" "C" "D" "E" "F"
wh_present 的名字:
[1] "A" "B" "C" "D" "E" "F"
传递给 strata 的 size 向量(按 data 中等级顺序):
[1] 77 132 143 92 43 13
对应每层的容量(counts_in_order):
A B C D E F
36108 61935 66565 42992 20121 6223
分层抽样完成,样本行数 = 500
A B C D E F
77 132 143 92 43 13
id member_id loan_amnt funded_amnt funded_amnt_inv term
544 36019516 38721136 10000 10000 10000 36 months
560 37690957 40463819 32000 32000 32000 36 months
676 37791309 40554270 8400 8400 8400 36 months
747 37840891 40603766 28000 28000 28000 60 months
965 36733440 39476198 8000 8000 8000 36 months
1514 37840801 40603650 4500 4500 4500 36 months
int_rate installment sub_grade emp_title emp_length
544 7.49% 311.02 A4 owner 8 years
560 12.39% 1068.83 C1 RN CASE MANAGER 3 years
676 14.99% 291.15 C5 Merchandising Manager 8 years
747 9.49% 587.92 B2 System Administrator 10+ years
965 7.49% 248.82 A4 Controller 3 years
1514 12.39% 150.31 C1 OPERATIONS MANAGER 8 years
home_ownership annual_inc verification_status issue_d loan_status
544 MORTGAGE 225000 VERIFIED - income source Dec-14 Current
560 OWN 70000 VERIFIED - income Dec-14 Current
676 OWN 34750 VERIFIED - income source Dec-14 Current
747 MORTGAGE 125000 VERIFIED - income source Dec-14 Current
965 RENT 140000 VERIFIED - income source Dec-14 Current
1514 RENT 40000 not verified Dec-14 Current
pymnt_plan
544 n
560 n
676 n
747 n
965 n
1514 n
url desc
544 https://www.lendingclub.com/browse/loanDetail.action?loan_id=36019516
560 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37690957
676 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37791309
747 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37840891
965 https://www.lendingclub.com/browse/loanDetail.action?loan_id=36733440
1514 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37840801
purpose title zip_code addr_state dti
544 home_improvement Home improvement 370xx TN 7.79
560 debt_consolidation Debt consolidation 769xx TX 17.75
676 debt_consolidation Debt consolidation 840xx UT 25.15
747 credit_card Credit card refinancing 103xx NY 21.00
965 debt_consolidation Debt consolidation 864xx AZ 4.78
1514 credit_card Credit card refinancing 072xx NJ 18.67
delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq
544 0 Sep-95 0 76
560 0 Dec-99 0 30
676 0 Jan-90 1 30
747 0 Dec-85 0 30
965 0 Sep-99 1 26
1514 0 Mar-05 0 30
mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc
544 75 11 1 25176 26.20% 23
560 69 10 0 25295 45.70% 20
676 69 9 0 11666 81.60% 10
747 69 23 0 93879 56.40% 44
965 69 8 0 13018 64.80% 28
1514 69 7 0 5733 56.80% 11
initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv
544 f 8996.24 8996.24 1237.84 1237.84
560 f 29000.22 29000.22 4242.28 4242.28
676 f 7834.33 7834.33 862.96 862.96
747 f 26516.57 26516.57 2329.54 2329.54
965 w 7196.97 7196.97 991.95 991.95
1514 f 4078.13 4078.13 598.14 598.14
total_rec_prncp total_rec_int total_rec_late_fee recoveries
544 1003.76 234.08 0 0
560 2999.78 1242.50 0 0
676 565.67 297.29 0 0
747 1483.43 846.11 0 0
965 803.03 188.92 0 0
1514 421.87 176.27 0 0
collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d
544 0 Apr-15 311.02 May-15
560 0 Apr-15 1068.83 May-15
676 0 Apr-15 291.15 May-15
747 0 Apr-15 587.92 May-15
965 0 Apr-15 248.82 May-15
1514 0 Apr-15 150.31 May-15
last_credit_pull_d collections_12_mths_ex_med mths_since_last_major_derog
544 Apr-15 0 76
560 Apr-15 0 43
676 Apr-15 0 43
747 Apr-15 0 43
965 Apr-15 0 28
1514 Apr-15 0 43
policy_code grade ID_unit Prob Stratum
544 1 A 544 0.002132491 1
560 1 A 560 0.002132491 1
676 1 A 676 0.002132491 1
747 1 A 747 0.002132491 1
965 1 A 965 0.002132491 1
1514 1 A 1514 0.002132491 1
三、整群抽样
R
复制代码
#------------------ 整群抽样(稳健实现)------------------#
cluster_col <- "home_ownership"
if(!cluster_col %in% names(LoanStats3c)) {
stop(sprintf("找不到列 %s,请检查变量名", cluster_col))
}
# 计算可用簇(去除 NA)
clusters <- unique(na.omit(LoanStats3c[[cluster_col]]))
n_clusters <- length(clusters)
requested_clusters <- 10 # 希望抽取的簇数,可修改
if(n_clusters == 0) stop(sprintf("列 %s 没有可用的簇(全部为 NA)", cluster_col))
# 将请求的簇数限制为实际可用簇数
size_clusters <- min(requested_clusters, n_clusters)
message(sprintf("簇总数 = %d, 请求 = %d, 实际将抽取 = %d", n_clusters, requested_clusters, size_clusters))
# 如果 size_clusters 为 0 则无法抽样
if(size_clusters <= 0) stop("没有可用的簇可抽取")
# 调用 cluster 函数进行抽样(无放回)
scp <- cluster(data = LoanStats3c, clustername = cluster_col, size = size_clusters, method = "srswor", description = FALSE)
sc <- getdata(LoanStats3c, scp)
# 查看抽取到的簇和样本数量
message(sprintf("抽到的簇数量 = %d, 抽样得到的行数 = %d", length(unique(scp$ID_1)), nrow(sc)))
print(head(sc))
复制代码
簇总数 = 5, 请求 = 10, 实际将抽取 = 5
抽到的簇数量 = 0, 抽样得到的行数 = 235633
抽到的簇数量 = 0, 抽样得到的行数 = 235633
id member_id loan_amnt
235630 22953173 13000
235631 22953173 13000
235632 Total amount funded in policy code 1: 3503840175 22953173 13000
235633 Total amount funded in policy code 2: 873663239 22953173 13000
80686 26170263 28642950 5000
16527 35226318 37907692 19200
funded_amnt funded_amnt_inv term int_rate installment grade
235630 13000 13000 384.14 <NA>
235631 13000 13000 384.14 <NA>
235632 13000 13000 384.14 <NA>
235633 13000 13000 384.14 <NA>
80686 5000 5000 36 months 11.67% 165.29 B
16527 19200 19200 36 months 9.49% 614.95 B
sub_grade emp_title emp_length annual_inc
235630 65000
235631 65000
235632 65000
235633 65000
80686 B4 Office Administrative Assistant 5 years 35680
16527 B2 Controller 10+ years 38400
verification_status issue_d loan_status pymnt_plan
235630
235631
235632
235633
80686 VERIFIED - income source Sep-14 Fully Paid n
16527 not verified Nov-14 Current n
url
235630
235631
235632
235633
80686 https://www.lendingclub.com/browse/loanDetail.action?loan_id=26170263
16527 https://www.lendingclub.com/browse/loanDetail.action?loan_id=35226318
desc purpose title zip_code addr_state
235630
235631
235632
235633
80686 debt_consolidation Debt consolidation 757xx TX
16527 credit_card Credit card refinancing 476xx IN
dti delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq
235630 17.63 0 0 30
235631 17.63 0 0 30
235632 17.63 0 0 30
235633 17.63 0 0 30
80686 28.12 0 Dec-03 0 55
16527 28.94 0 Oct-96 0 27
mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc
235630 69 11 0 11686 24
235631 69 11 0 11686 24
235632 69 11 0 11686 24
235633 69 11 0 11686 24
80686 69 7 0 3319 43.10% 12
16527 69 9 0 19353 64.50% 30
initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv
235630 9823.83 9817.70 3478.720 3478.08
235631 9823.83 9817.70 3478.720 3478.08
235632 9823.83 9817.70 3478.720 3478.08
235633 9823.83 9817.70 3478.720 3478.08
80686 f 0.00 0.00 5048.625 5048.62
16527 w 16847.53 16847.53 3064.630 3064.63
total_rec_prncp total_rec_int total_rec_late_fee recoveries
235630 2152.30 995.42 0 0
235631 2152.30 995.42 0 0
235632 2152.30 995.42 0 0
235633 2152.30 995.42 0 0
80686 5000.00 48.62 0 0
16527 2352.47 712.16 0 0
collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d
235630 0 420.64
235631 0 420.64
235632 0 420.64
235633 0 420.64
80686 0 Oct-14 5048.63
16527 0 Apr-15 614.95 May-15
last_credit_pull_d collections_12_mths_ex_med
235630 0
235631 0
235632 0
235633 0
80686 Apr-15 0
16527 Apr-15 0
mths_since_last_major_derog policy_code home_ownership ID_unit Prob
235630 43 1 235630 1
235631 43 1 235631 1
235632 43 1 235632 1
235633 43 1 235633 1
80686 55 1 ANY 80686 1
16527 31 1 MORTGAGE 16527 1
四、系统抽样
R
复制代码
i=rep(1,N)
pik1=inclusionprobabilities(i,n) #采用等概率的系统抽样,共抽取n个样本
ssp=UPsystematic(pik1,eps=1e-6) #系统抽样的函数
ss=getdata(LoanStats3c,ssp) #入选样本
print(head(ss))
复制代码
ID_unit id member_id loan_amnt funded_amnt funded_amnt_inv
283 283 37751794 40514790 12000 12000 12000
755 755 37741264 40504192 7000 7000 7000
1226 1226 37740968 40503825 3600 3600 3600
1697 1697 37700609 40473393 16000 16000 16000
2168 2168 37760342 40523083 22525 22525 22525
2640 2640 37257945 40030762 25000 25000 25000
term int_rate installment grade sub_grade emp_title
283 36 months 11.99% 398.52 B B5 Office Administrator
755 36 months 6.49% 214.52 A A2 Associate Engineer
1226 36 months 13.66% 122.45 C C3 Software Engineer
1697 36 months 9.49% 512.46 B B2 Manager
2168 60 months 19.24% 587.29 E E2 Senior Typist
2640 60 months 16.49% 614.48 D D3 Parole Officer
emp_length home_ownership annual_inc verification_status issue_d
283 7 years RENT 68200 VERIFIED - income source Dec-14
755 < 1 year RENT 65000 not verified Dec-14
1226 < 1 year RENT 67000 not verified Dec-14
1697 3 years MORTGAGE 90000 not verified Dec-14
2168 10+ years RENT 49000 VERIFIED - income Dec-14
2640 2 years RENT 60000 VERIFIED - income Dec-14
loan_status pymnt_plan
283 Current n
755 Current n
1226 Current n
1697 Current n
2168 Current n
2640 Current n
url desc
283 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37751794
755 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37741264
1226 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37740968
1697 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37700609
2168 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37760342
2640 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37257945
purpose title zip_code addr_state dti
283 other Other 944xx CA 17.75
755 debt_consolidation Debt consolidation 100xx NY 7.94
1226 credit_card Credit card refinancing 276xx NC 21.73
1697 debt_consolidation Debt consolidation 982xx WA 25.92
2168 credit_card Credit card refinancing 120xx NY 33.60
2640 debt_consolidation Debt consolidation 995xx AK 38.66
delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq
283 0 Jun-93 1 30
755 0 Feb-01 0 30
1226 0 Feb-89 0 29
1697 0 May-05 2 30
2168 0 May-94 0 30
2640 0 Aug-01 1 30
mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc
283 69 8 0 17695 78.60% 14
755 69 12 0 10294 38.30% 39
1226 69 12 0 7191 53.30% 14
1697 69 16 0 13537 42.40% 29
2168 69 12 0 28745 96.50% 18
2640 69 17 0 16058 49.90% 54
initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv
283 f 10868.71 10868.71 1578.09 1578.09
755 f 6287.60 6287.60 854.29 854.29
1226 f 3268.51 3268.51 484.34 484.34
1697 f 14437.88 14437.88 2032.97 2032.97
2168 w 21598.46 21598.46 2325.08 2325.08
2640 w 23893.70 23893.70 2435.02 2435.02
total_rec_prncp total_rec_int total_rec_late_fee recoveries
283 1131.29 446.80 0 0
755 712.40 141.89 0 0
1226 331.49 152.85 0 0
1697 1562.12 470.85 0 0
2168 926.54 1398.54 0 0
2640 1106.30 1328.72 0 0
collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d
283 0 Apr-15 398.52 May-15
755 0 Apr-15 214.52 May-15
1226 0 Apr-15 122.45 May-15
1697 0 Apr-15 512.46 May-15
2168 0 Apr-15 587.29 May-15
2640 0 Apr-15 614.48 May-15
last_credit_pull_d collections_12_mths_ex_med mths_since_last_major_derog
283 Apr-15 0 43
755 Apr-15 0 43
1226 Apr-15 0 43
1697 Apr-15 0 43
2168 Apr-15 0 43
2640 Apr-15 0 43
policy_code
283 1
755 1
1226 1
1697 1
2168 1
2640 1
五、多阶段两阶段抽样
R
复制代码
#------------------ 两阶段抽样(手工实现,替代 mstage)------------------#
# 说明:先抽取若干主簇(cluster),然后在每个被抽中的簇内抽取二级样本。
# 参数(可调整)
primary_cluster_col <- "home_ownership" # 主簇变量名(请根据数据替换)
primary_k <- 3 # 希望抽取的主簇数
# 二级样本总目标为 n(之前定义),按主簇均分
secondary_total <- n
# 清洗并验证主簇列
if(!primary_cluster_col %in% names(LoanStats3c)) stop(sprintf("找不到列 %s,请检查字段名", primary_cluster_col))
LoanStats3c[[primary_cluster_col]] <- trimws(as.character(LoanStats3c[[primary_cluster_col]]))
LoanStats3c[[primary_cluster_col]] <- ifelse(LoanStats3c[[primary_cluster_col]]=="", NA, LoanStats3c[[primary_cluster_col]])
clusters <- unique(na.omit(LoanStats3c[[primary_cluster_col]]))
n_clusters <- length(clusters)
if(n_clusters == 0) stop(sprintf("列 %s 没有可用的簇(全部为 NA 或空字符串)", primary_cluster_col))
# 调整主簇数量不超过可用簇数
primary_k <- min(primary_k, n_clusters)
message(sprintf("可用簇数量 = %d,计划抽取主簇 = %d", n_clusters, primary_k))
# 随机抽取主簇(无放回)
set.seed(123) # 可选固定随机种子,便于重现
primary_selected <- sample(clusters, primary_k)
message('抽中的主簇:')
print(primary_selected)
# 为每个主簇计算二级样本目标(均分 + 余数分配)
base_sec <- floor(secondary_total / primary_k)
sec_sizes <- rep(base_sec, primary_k)
rem <- secondary_total - sum(sec_sizes)
if(rem > 0) {
for(i in seq_len(rem)) sec_sizes[i] <- sec_sizes[i] + 1
}
names(sec_sizes) <- primary_selected
message('每个被抽中簇的目标二级样本数(可能被截断至簇容量):')
print(sec_sizes)
# 在每个被抽中簇内抽样
sampled_rows <- integer(0)
per_cluster_actual <- integer(length(primary_selected))
for(i in seq_along(primary_selected)){
cl <- primary_selected[i]
rows_in_cl <- which(LoanStats3c[[primary_cluster_col]] == cl)
cap <- length(rows_in_cl)
target <- sec_sizes[i]
if(cap == 0) {
per_cluster_actual[i] <- 0
next
}
take <- min(target, cap)
# 若需要全部抽取可直接取全部
chosen <- sample(rows_in_cl, take, replace = FALSE)
sampled_rows <- c(sampled_rows, chosen)
per_cluster_actual[i] <- length(chosen)
}
# 生成结果对象
mss <- LoanStats3c[sampled_rows, ]
ms <- list(primary_selected = primary_selected, per_cluster_target = sec_sizes, per_cluster_actual = per_cluster_actual)
message(sprintf('两阶段抽样完成:共抽取 %d 行样本', nrow(mss)))
message('各簇实际抽取数:')
print(setNames(per_cluster_actual, primary_selected))
# 保存或查看部分样本
print(head(mss))
# 可选保存
# write.csv(mss, file = "D:/OneDrive - stu.fynu.edu.cn/大四上学期/ysq-大数据探索性分析/demo2/multistage_sample.csv", row.names = FALSE)
复制代码
可用簇数量 = 4,计划抽取主簇 = 3
抽中的主簇:
抽中的主簇:
[1] "OWN" "ANY" "RENT"
每个被抽中簇的目标二级样本数(可能被截断至簇容量):
OWN ANY RENT
167 167 166
两阶段抽样完成:共抽取 334 行样本
各簇实际抽取数:
各簇实际抽取数:
OWN ANY RENT
167 1 166
id member_id loan_amnt funded_amnt funded_amnt_inv term
28691 34442221 37105507 5000 5000 5000 36 months
17467 34874813 37558157 8200 8200 8200 36 months
32452 33230956 35874247 13050 13050 13050 60 months
114764 21370073 23682984 9300 9300 9300 36 months
45358 31367396 33940619 28000 28000 28000 60 months
65397 27650347 30153412 10000 10000 10000 36 months
int_rate installment grade sub_grade emp_title emp_length
28691 11.99% 166.05 B B5 truck driver 7 years
17467 14.99% 284.22 C C5 flight attendant 1 year
32452 19.52% 342.27 E E2 Asst. Vice President 9 years
114764 16.99% 331.53 D D3 lab tech II 5 years
45358 13.35% 642.12 C C2 Staff Civil Engineer < 1 year
65397 14.99% 346.61 C C5 Computer Specialist 10+ years
home_ownership annual_inc verification_status issue_d loan_status
28691 OWN 70000 not verified Nov-14 Current
17467 OWN 23000 VERIFIED - income source Nov-14 Current
32452 OWN 45000 VERIFIED - income source Nov-14 Current
114764 OWN 47544 VERIFIED - income source Jul-14 Current
45358 OWN 73000 VERIFIED - income source Oct-14 Current
65397 OWN 95000 VERIFIED - income source Oct-14 Current
pymnt_plan
28691 n
17467 n
32452 n
114764 n
45358 n
65397 n
url
28691 https://www.lendingclub.com/browse/loanDetail.action?loan_id=34442221
17467 https://www.lendingclub.com/browse/loanDetail.action?loan_id=34874813
32452 https://www.lendingclub.com/browse/loanDetail.action?loan_id=33230956
114764 https://www.lendingclub.com/browse/loanDetail.action?loan_id=21370073
45358 https://www.lendingclub.com/browse/loanDetail.action?loan_id=31367396
65397 https://www.lendingclub.com/browse/loanDetail.action?loan_id=27650347
desc purpose title zip_code addr_state dti
28691 home_improvement Home improvement 604xx IL 20.69
17467 debt_consolidation Debt consolidation 410xx KY 34.13
32452 debt_consolidation Debt consolidation 330xx FL 32.75
114764 debt_consolidation Debt consolidation 631xx MO 18.60
45358 debt_consolidation Debt consolidation 193xx PA 11.61
65397 debt_consolidation Debt consolidation 114xx NY 4.90
delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq
28691 0 Jun-01 0 56
17467 1 Feb-99 3 17
32452 0 Oct-04 0 30
114764 0 Jun-96 1 30
45358 0 Feb-02 0 30
65397 4 Apr-93 0 1
mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc
28691 69 6 0 5057 84.30% 18
17467 69 9 0 10573 41.80% 16
32452 69 28 0 12713 88.90% 49
114764 60 18 1 7255 34.70% 51
45358 69 15 0 8689 25.70% 33
65397 69 10 0 4999 27.20% 45
initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv
28691 f 4407.83 4407.83 826.92 826.92
17467 w 7268.06 7268.06 1414.27 1414.27
32452 w 12378.56 12378.56 1697.20 1697.20
114764 w 7395.98 7395.98 2983.77 2983.77
45358 w 25960.28 25960.28 3831.95 3831.95
65397 w 8627.60 8627.60 2079.66 2079.66
total_rec_prncp total_rec_int total_rec_late_fee recoveries
28691 592.17 234.75 0 0
17467 931.94 482.33 0 0
32452 671.44 1025.76 0 0
114764 1904.02 1079.75 0 0
45358 2039.72 1792.23 0 0
65397 1372.40 707.26 0 0
collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d
28691 0 Apr-15 166.05 May-15
17467 0 Apr-15 284.22 May-15
32452 0 Apr-15 342.27 May-15
114764 0 Apr-15 331.53 May-15
45358 0 Apr-15 642.12 May-15
65397 0 Apr-15 346.61 May-15
last_credit_pull_d collections_12_mths_ex_med
28691 Apr-15 0
17467 Apr-15 0
32452 Apr-15 0
114764 Apr-15 0
45358 Apr-15 0
65397 Apr-15 0
mths_since_last_major_derog policy_code
28691 43 1
17467 17 1
32452 43 1
114764 43 1
45358 43 1
65397 1 1
id member_id loan_amnt funded_amnt funded_amnt_inv term
28691 34442221 37105507 5000 5000 5000 36 months
17467 34874813 37558157 8200 8200 8200 36 months
32452 33230956 35874247 13050 13050 13050 60 months
114764 21370073 23682984 9300 9300 9300 36 months
45358 31367396 33940619 28000 28000 28000 60 months
65397 27650347 30153412 10000 10000 10000 36 months
int_rate installment grade sub_grade emp_title emp_length
28691 11.99% 166.05 B B5 truck driver 7 years
17467 14.99% 284.22 C C5 flight attendant 1 year
32452 19.52% 342.27 E E2 Asst. Vice President 9 years
114764 16.99% 331.53 D D3 lab tech II 5 years
45358 13.35% 642.12 C C2 Staff Civil Engineer < 1 year
65397 14.99% 346.61 C C5 Computer Specialist 10+ years
home_ownership annual_inc verification_status issue_d loan_status
28691 OWN 70000 not verified Nov-14 Current
17467 OWN 23000 VERIFIED - income source Nov-14 Current
32452 OWN 45000 VERIFIED - income source Nov-14 Current
114764 OWN 47544 VERIFIED - income source Jul-14 Current
45358 OWN 73000 VERIFIED - income source Oct-14 Current
65397 OWN 95000 VERIFIED - income source Oct-14 Current
pymnt_plan
28691 n
17467 n
32452 n
114764 n
45358 n
65397 n
url
28691 https://www.lendingclub.com/browse/loanDetail.action?loan_id=34442221
17467 https://www.lendingclub.com/browse/loanDetail.action?loan_id=34874813
32452 https://www.lendingclub.com/browse/loanDetail.action?loan_id=33230956
114764 https://www.lendingclub.com/browse/loanDetail.action?loan_id=21370073
45358 https://www.lendingclub.com/browse/loanDetail.action?loan_id=31367396
65397 https://www.lendingclub.com/browse/loanDetail.action?loan_id=27650347
desc purpose title zip_code addr_state dti
28691 home_improvement Home improvement 604xx IL 20.69
17467 debt_consolidation Debt consolidation 410xx KY 34.13
32452 debt_consolidation Debt consolidation 330xx FL 32.75
114764 debt_consolidation Debt consolidation 631xx MO 18.60
45358 debt_consolidation Debt consolidation 193xx PA 11.61
65397 debt_consolidation Debt consolidation 114xx NY 4.90
delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq
28691 0 Jun-01 0 56
17467 1 Feb-99 3 17
32452 0 Oct-04 0 30
114764 0 Jun-96 1 30
45358 0 Feb-02 0 30
65397 4 Apr-93 0 1
mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc
28691 69 6 0 5057 84.30% 18
17467 69 9 0 10573 41.80% 16
32452 69 28 0 12713 88.90% 49
114764 60 18 1 7255 34.70% 51
45358 69 15 0 8689 25.70% 33
65397 69 10 0 4999 27.20% 45
initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv
28691 f 4407.83 4407.83 826.92 826.92
17467 w 7268.06 7268.06 1414.27 1414.27
32452 w 12378.56 12378.56 1697.20 1697.20
114764 w 7395.98 7395.98 2983.77 2983.77
45358 w 25960.28 25960.28 3831.95 3831.95
65397 w 8627.60 8627.60 2079.66 2079.66
total_rec_prncp total_rec_int total_rec_late_fee recoveries
28691 592.17 234.75 0 0
17467 931.94 482.33 0 0
32452 671.44 1025.76 0 0
114764 1904.02 1079.75 0 0
45358 2039.72 1792.23 0 0
65397 1372.40 707.26 0 0
collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d
28691 0 Apr-15 166.05 May-15
17467 0 Apr-15 284.22 May-15
32452 0 Apr-15 342.27 May-15
114764 0 Apr-15 331.53 May-15
45358 0 Apr-15 642.12 May-15
65397 0 Apr-15 346.61 May-15
last_credit_pull_d collections_12_mths_ex_med
28691 Apr-15 0
17467 Apr-15 0
32452 Apr-15 0
114764 Apr-15 0
45358 Apr-15 0
65397 Apr-15 0
mths_since_last_major_derog policy_code
28691 43 1
17467 17 1
32452 43 1
114764 43 1
45358 43 1
65397 1 1
六、不等概抽样
R
复制代码
vol=LoanStats3c$total_acc
pik=inclusionprobabilities(vol,n) #以成交量为样本规模求出包含概率
usp=UPmidzuno(pik) #用Midzuno方法进行不等概率抽样
us=getdata(LoanStats3c,usp) #入选样本
print(head(us))
复制代码
ID_unit id member_id loan_amnt funded_amnt funded_amnt_inv
317 317 37800583 40563367 11875 11875 11875
880 880 37811222 40574134 3000 3000 3000
1187 1187 37661052 40423957 12300 12300 12300
1435 1435 37690890 40463735 25000 25000 25000
2732 2732 37098036 39860860 2000 2000 2000
3000 3000 37077808 39840608 14700 14700 14700
term int_rate installment grade sub_grade
317 36 months 14.31% 407.65 C C4
880 36 months 14.99% 103.99 C C5
1187 36 months 6.49% 376.93 A A2
1435 36 months 6.03% 760.89 A A1
2732 36 months 12.99% 67.38 C C2
3000 36 months 14.99% 509.51 C C5
emp_title emp_length home_ownership annual_inc
317 admission < 1 year OWN 55000.00
880 Contractor Installation Manager 3 years OWN 63000.00
1187 office manager < 1 year OWN 54000.00
1435 Director, Techincal Services 10+ years MORTGAGE 160000.00
2732 Client Response Communication 9 years MORTGAGE 26583.07
3000 Writer < 1 year RENT 100000.00
verification_status issue_d loan_status pymnt_plan
317 not verified Dec-14 Current n
880 not verified Dec-14 Current n
1187 VERIFIED - income source Dec-14 Current n
1435 VERIFIED - income Dec-14 Current n
2732 not verified Dec-14 Current n
3000 not verified Dec-14 Current n
url desc
317 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37800583
880 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37811222
1187 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37661052
1435 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37690890
2732 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37098036
3000 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37077808
purpose title zip_code addr_state dti
317 credit_card Credit card refinancing 921xx CA 26.23
880 moving Moving and relocation 997xx AK 17.05
1187 debt_consolidation Debt consolidation 217xx MD 18.80
1435 credit_card Credit card refinancing 603xx IL 15.54
2732 other Other 844xx UT 11.87
3000 credit_card Credit card refinancing 100xx NY 8.07
delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq
317 0 Feb-02 1 30
880 0 Jul-99 1 41
1187 0 Jan-01 0 30
1435 0 Dec-94 0 30
2732 0 Dec-00 0 30
3000 0 Jan-66 0 40
mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc
317 86 28 1 19422 49.40% 46
880 69 20 0 3390 51.40% 52
1187 69 11 0 9839 32.50% 27
1435 69 9 0 36538 62.90% 27
2732 99 6 1 8675 70.50% 14
3000 69 7 0 32014 87.20% 34
initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv
317 w 10791.65 10791.65 1607.00 1607.00
880 f 2728.91 2728.91 413.46 413.46
1187 w 11048.26 11048.26 1503.29 1503.29
1435 f 22439.72 22439.72 3035.18 3035.18
2732 f 1814.09 1814.09 268.08 268.08
3000 f 13371.84 13371.84 2025.80 2025.80
total_rec_prncp total_rec_int total_rec_late_fee recoveries
317 1083.35 523.65 0 0
880 271.09 142.37 0 0
1187 1251.74 251.55 0 0
1435 2560.28 474.90 0 0
2732 185.91 82.17 0 0
3000 1328.16 697.64 0 0
collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d
317 0 Apr-15 407.65 May-15
880 0 Apr-15 103.99 May-15
1187 0 Apr-15 376.93 May-15
1435 0 Apr-15 760.89 May-15
2732 0 Apr-15 67.38 May-15
3000 0 Apr-15 509.51 May-15
last_credit_pull_d collections_12_mths_ex_med mths_since_last_major_derog
317 Apr-15 0 43
880 Apr-15 0 51
1187 Apr-15 0 43
1435 Apr-15 0 43
2732 Apr-15 0 43
3000 Apr-15 0 43
policy_code
317 1
880 1
1187 1
1435 1
2732 1
3000 1
七、二重抽样
R
复制代码
#第一重抽样采用简单随机抽样确定层权,第二重抽样采用分层抽样
srsp1=srswor(3000,N)
srs1=getdata(LoanStats3c,srsp1) #第一重抽样
srsp2=srswor(n,3000)
srs2=getdata(srs1,srsp2) #第二重抽样
print(head(srs2))
复制代码
ID_unit id member_id loan_amnt funded_amnt funded_amnt_inv
175 175 37641820 40404842 24000 24000 24000
407 407 37771625 40534643 7150 7150 7150
444 444 37661489 40424497 20000 20000 20000
529 529 37821449 40584421 6000 6000 6000
1180 1180 37670980 40433849 10000 10000 10000
1688 1688 37820639 40583405 12000 12000 12000
term int_rate installment grade sub_grade
175 36 months 6.03% 730.46 A A1
407 36 months 17.14% 255.42 D D4
444 36 months 11.99% 664.20 B B5
529 36 months 10.49% 194.99 B B3
1180 60 months 14.99% 237.85 C C5
1688 36 months 11.99% 398.52 B B5
emp_title emp_length home_ownership annual_inc
175 Captain/Paramedic 10+ years MORTGAGE 120000
407 Customer Care Specialust < 1 year RENT 30000
444 Operations Management < 1 year MORTGAGE 85000
529 Sales Manager 10+ years RENT 100000
1180 Policies & Procedures Administrator 10+ years MORTGAGE 82000
1688 Safety and Security Offider < 1 year MORTGAGE 85000
verification_status issue_d loan_status pymnt_plan
175 VERIFIED - income source Dec-14 Current n
407 VERIFIED - income source Dec-14 Current n
444 VERIFIED - income source Dec-14 Fully Paid n
529 not verified Dec-14 Current n
1180 VERIFIED - income Dec-14 Current n
1688 VERIFIED - income source Dec-14 Current n
url desc
175 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37641820
407 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37771625
444 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37661489
529 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37821449
1180 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37670980
1688 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37820639
purpose title zip_code addr_state dti
175 credit_card Credit card refinancing 957xx CA 19.94
407 debt_consolidation Debt consolidation 284xx NC 9.52
444 credit_card Credit card refinancing 597xx MT 11.00
529 debt_consolidation Debt consolidation 917xx CA 9.99
1180 credit_card Credit card refinancing 801xx CO 10.08
1688 debt_consolidation Debt consolidation 063xx CT 7.71
delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq
175 1 Feb-98 0 8
407 0 Oct-11 2 30
444 0 Jul-04 0 30
529 0 Nov-97 1 59
1180 2 Oct-01 2 2
1688 0 Jul-99 0 45
mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc
175 69 9 0 32544 46.70% 26
407 69 12 0 7239 55.30% 14
444 69 5 0 19542 97.70% 7
529 69 12 0 5914 32.10% 18
1180 69 21 0 7952 27% 54
1688 69 9 0 5143 57.10% 25
initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv
175 w 22161.21 22161.21 2179.32 2179.32
407 f 6523.56 6523.56 1011.47 1011.47
444 f 0.00 0.00 20470.31 20470.31
529 f 5422.32 5422.32 774.72 774.72
1180 w 9539.73 9539.73 943.07 943.07
1688 f 10868.71 10868.71 1578.09 1578.09
total_rec_prncp total_rec_int total_rec_late_fee recoveries
175 1838.79 340.53 0 0
407 626.44 385.03 0 0
444 20000.00 470.31 0 0
529 577.68 197.04 0 0
1180 460.27 482.80 0 0
1688 1131.29 446.80 0 0
collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d
175 0 Apr-15 730.46 May-15
407 0 Apr-15 255.42 May-15
444 0 Feb-15 19826.09
529 0 Apr-15 194.99 May-15
1180 0 Apr-15 237.85 May-15
1688 0 Apr-15 398.52 May-15
last_credit_pull_d collections_12_mths_ex_med mths_since_last_major_derog
175 Apr-15 0 43
407 Apr-15 0 43
444 Mar-15 0 43
529 Apr-15 0 59
1180 Apr-15 0 43
1688 Apr-15 0 45
policy_code
175 1
407 1
444 1
529 1
1180 1
1688 1