大数据探索性分析——抽样技术应用

2.3 概率抽样
一、简单随机抽样
R 复制代码
# 数据预处理
LoanStats3c = read.csv("D:/OneDrive - stu.fynu.edu.cn/大四上学期/ysq-大数据探索性分析/data/2数据集二:Loan Data--Lending Club/LoanStats3c/LoanStats3c.csv", header = TRUE, fill = TRUE, comment.char = "")
# str(LoanStats3c)
# 将有缺失值的列进行中位数填补(针对数值列)

# 检测列类型为数值的列,输出列序号为列表
num_cols <- sapply(LoanStats3c, is.numeric)
num_col_indices <- which(num_cols)

# 对每个数值列,如果存在缺失值则用中位数填补
for(i in num_col_indices) {
  col_name <- names(LoanStats3c)[i]
  na_count <- sum(is.na(LoanStats3c[[i]]))
  if(na_count > 0) {
    med <- median(LoanStats3c[[i]], na.rm = TRUE)
    if(is.na(med)) {
      # 全为 NA 的列,输出提示并跳过
      message(sprintf("跳过列 %s (索引 %d):全为 NA,无法用中位数填补", col_name, i))
    } else {
      LoanStats3c[[i]][is.na(LoanStats3c[[i]])] <- med
      message(sprintf("列 %s (索引 %d):用中位数 %s 填补 %d 个缺失值", col_name, i, format(med), na_count))
    }
  }
}

# 再次统计数值列的缺失值数(用于检查)
na_summary <- sapply(LoanStats3c[num_col_indices], function(x) sum(is.na(x)))
print(na_summary)

# 写入数值列索引到csv文件,供后续分析使用
write.csv(num_col_indices, file = "D:/OneDrive - stu.fynu.edu.cn/大四上学期/ysq-大数据探索性分析/demo2/num_col_indices.csv", row.names = FALSE)
# 保存清洗后的数据集
write.csv(LoanStats3c, file = "D:/OneDrive - stu.fynu.edu.cn/大四上学期/ysq-大数据探索性分析/demo2/LoanStats3c_imputed.csv", row.names = FALSE)

# str(LoanStats3c) // 可取消注释查看数据结构
复制代码
列 member_id (索引 2):用中位数 22953173 填补 4 个缺失值

列 loan_amnt (索引 3):用中位数 13000 填补 4 个缺失值

列 funded_amnt (索引 4):用中位数 13000 填补 4 个缺失值

列 loan_amnt (索引 3):用中位数 13000 填补 4 个缺失值

列 funded_amnt (索引 4):用中位数 13000 填补 4 个缺失值

列 funded_amnt_inv (索引 5):用中位数 13000 填补 4 个缺失值

列 installment (索引 8):用中位数 384.14 填补 4 个缺失值

列 funded_amnt_inv (索引 5):用中位数 13000 填补 4 个缺失值

列 installment (索引 8):用中位数 384.14 填补 4 个缺失值

列 annual_inc (索引 14):用中位数 65000 填补 4 个缺失值

列 dti (索引 25):用中位数 17.63 填补 4 个缺失值

列 delinq_2yrs (索引 26):用中位数 0 填补 4 个缺失值

列 annual_inc (索引 14):用中位数 65000 填补 4 个缺失值

列 dti (索引 25):用中位数 17.63 填补 4 个缺失值

列 delinq_2yrs (索引 26):用中位数 0 填补 4 个缺失值

列 inq_last_6mths (索引 28):用中位数 0 填补 4 个缺失值

列 mths_since_last_delinq (索引 29):用中位数 30 填补 115885 个缺失值

列 mths_since_last_record (索引 30):用中位数 69 填补 194109 个缺失值

列 inq_last_6mths (索引 28):用中位数 0 填补 4 个缺失值

列 mths_since_last_delinq (索引 29):用中位数 30 填补 115885 个缺失值

列 mths_since_last_record (索引 30):用中位数 69 填补 194109 个缺失值

列 open_acc (索引 31):用中位数 11 填补 4 个缺失值

列 open_acc (索引 31):用中位数 11 填补 4 个缺失值

列 pub_rec (索引 32):用中位数 0 填补 4 个缺失值

列 revol_bal (索引 33):用中位数 11686 填补 4 个缺失值

列 pub_rec (索引 32):用中位数 0 填补 4 个缺失值

列 revol_bal (索引 33):用中位数 11686 填补 4 个缺失值

列 total_acc (索引 35):用中位数 24 填补 4 个缺失值

列 out_prncp (索引 37):用中位数 9823.83 填补 4 个缺失值

列 total_acc (索引 35):用中位数 24 填补 4 个缺失值

列 out_prncp (索引 37):用中位数 9823.83 填补 4 个缺失值

列 out_prncp_inv (索引 38):用中位数 9817.7 填补 4 个缺失值

列 total_pymnt (索引 39):用中位数 3478.72 填补 4 个缺失值

列 out_prncp_inv (索引 38):用中位数 9817.7 填补 4 个缺失值

列 total_pymnt (索引 39):用中位数 3478.72 填补 4 个缺失值

列 total_pymnt_inv (索引 40):用中位数 3478.08 填补 4 个缺失值

列 total_rec_prncp (索引 41):用中位数 2152.3 填补 4 个缺失值

列 total_rec_int (索引 42):用中位数 995.42 填补 4 个缺失值

列 total_pymnt_inv (索引 40):用中位数 3478.08 填补 4 个缺失值

列 total_rec_prncp (索引 41):用中位数 2152.3 填补 4 个缺失值

列 total_rec_int (索引 42):用中位数 995.42 填补 4 个缺失值

列 total_rec_late_fee (索引 43):用中位数 0 填补 4 个缺失值

列 total_rec_late_fee (索引 43):用中位数 0 填补 4 个缺失值

列 recoveries (索引 44):用中位数 0 填补 4 个缺失值

列 collection_recovery_fee (索引 45):用中位数 0 填补 4 个缺失值

列 last_pymnt_amnt (索引 47):用中位数 420.64 填补 4 个缺失值

列 recoveries (索引 44):用中位数 0 填补 4 个缺失值

列 collection_recovery_fee (索引 45):用中位数 0 填补 4 个缺失值

列 last_pymnt_amnt (索引 47):用中位数 420.64 填补 4 个缺失值

列 collections_12_mths_ex_med (索引 50):用中位数 0 填补 4 个缺失值

列 mths_since_last_major_derog (索引 51):用中位数 43 填补 169155 个缺失值

列 policy_code (索引 52):用中位数 1 填补 4 个缺失值

列 collections_12_mths_ex_med (索引 50):用中位数 0 填补 4 个缺失值

列 mths_since_last_major_derog (索引 51):用中位数 43 填补 169155 个缺失值

列 policy_code (索引 52):用中位数 1 填补 4 个缺失值



                  member_id                   loan_amnt 
                          0                           0 
                funded_amnt             funded_amnt_inv 
                          0                           0 
                installment                  annual_inc 
                          0                           0 
                        dti                 delinq_2yrs 
                          0                           0 
             inq_last_6mths      mths_since_last_delinq 
                          0                           0 
     mths_since_last_record                    open_acc 
                          0                           0 
                    pub_rec                   revol_bal 
                          0                           0 
                  total_acc                   out_prncp 
                          0                           0 
              out_prncp_inv                 total_pymnt 
                          0                           0 
            total_pymnt_inv             total_rec_prncp 
                          0                           0 
              total_rec_int          total_rec_late_fee 
                          0                           0 
                 recoveries     collection_recovery_fee 
                          0                           0 
            last_pymnt_amnt  collections_12_mths_ex_med 
                          0                           0 
mths_since_last_major_derog                 policy_code 
                          0                           0 
R 复制代码
# 推荐使用read.csv,并加上fill=TRUE和comment.char=""参数,避免列数不一致报错
LoanStats3c = read.csv("D:/OneDrive - stu.fynu.edu.cn/大四上学期/ysq-大数据探索性分析/demo2/LoanStats3c_imputed.csv", header = TRUE, fill = TRUE, comment.char = "")
names(LoanStats3c) #数据中所有变量名
library(sampling) #抽样函数所在的包
N=dim(LoanStats3c)[1] #总体数量
n=500 #样本数量
srsp=srswor(n,N) #简单随机抽样
srs=getdata(LoanStats3c,srsp) #得到样本

# str(srs)
# 检测列类型为数值的列,输出列序号为列表
new_num_cols <- sapply(srs, is.numeric)
new_num_col_indices <- which(new_num_cols)
# 输出数值列的序号
# print(new_num_col_indices)
  1. 'id'
  2. 'member_id'
  3. 'loan_amnt'
  4. 'funded_amnt'
  5. 'funded_amnt_inv'
  6. 'term'
  7. 'int_rate'
  8. 'installment'
  9. 'grade'
  10. 'sub_grade'
  11. 'emp_title'
  12. 'emp_length'
  13. 'home_ownership'
  14. 'annual_inc'
  15. 'verification_status'
  16. 'issue_d'
  17. 'loan_status'
  18. 'pymnt_plan'
  19. 'url'
  20. 'desc'
  21. 'purpose'
  22. 'title'
  23. 'zip_code'
  24. 'addr_state'
  25. 'dti'
  26. 'delinq_2yrs'
  27. 'earliest_cr_line'
  28. 'inq_last_6mths'
  29. 'mths_since_last_delinq'
  30. 'mths_since_last_record'
  31. 'open_acc'
  32. 'pub_rec'
  33. 'revol_bal'
  34. 'revol_util'
  35. 'total_acc'
  36. 'initial_list_status'
  37. 'out_prncp'
  38. 'out_prncp_inv'
  39. 'total_pymnt'
  40. 'total_pymnt_inv'
  41. 'total_rec_prncp'
  42. 'total_rec_int'
  43. 'total_rec_late_fee'
  44. 'recoveries'
  45. 'collection_recovery_fee'
  46. 'last_pymnt_d'
  47. 'last_pymnt_amnt'
  48. 'next_pymnt_d'
  49. 'last_credit_pull_d'
  50. 'collections_12_mths_ex_med'
  51. 'mths_since_last_major_derog'
  52. 'policy_code'
R 复制代码
# 简单估计比较样本均值和总体均值
meanY = colMeans(LoanStats3c[, num_col_indices], na.rm = TRUE) # 总体均值
meany = colMeans(srs[, new_num_col_indices], na.rm = TRUE) # 样本均值
# 输出对象长度并检查匹配
cat('length(meanY) =', length(meanY), '\n')
cat('length(meany) =', length(meany), '\n')
if(length(meanY) != length(meany)) {
  warning('总体均值和样本均值长度不一致:尝试按共有变量对齐')
  common_names <- intersect(names(meanY), names(meany))
  meanY <- meanY[common_names]
  meany <- meany[common_names]
  cat('对齐后长度 =', length(meanY), '\n')
}
# 计算差值并显示前几项
md = meanY - meany
print(head(md))
复制代码
length(meanY) = 28 
length(meany) = 29 
length(meany) = 29 


Warning message:
"总体均值和样本均值长度不一致:尝试按共有变量对齐"


对齐后长度 = 28 
      member_id       loan_amnt     funded_amnt funded_amnt_inv     installment 
  -411336.24337      -518.87495      -518.87495      -519.04750       -10.08388 
     annual_inc 
    -1301.80738 
      member_id       loan_amnt     funded_amnt funded_amnt_inv     installment 
  -411336.24337      -518.87495      -518.87495      -519.04750       -10.08388 
     annual_inc 
    -1301.80738 
二、分层随机抽样
R 复制代码
# 确认存在 grade 列
if(!"grade" %in% names(LoanStats3c)) {
  stop("数据集中不存在名为 'grade' 的列,请检查变量名(区分大小写)")
}
# 清洗 grade 字段:去除首尾空白并统一为大写,避免 ' A' 或 'a' 等问题
LoanStats3c$grade <- trimws(as.character(LoanStats3c$grade))
LoanStats3c$grade <- toupper(LoanStats3c$grade)
# 将 grade 转为因子并指定 A-F 顺序(如只存在部分等级,factor 会自动处理)
LoanStats3c$grade <- factor(LoanStats3c$grade, levels = c("A","B","C","D","E","F"))
# 去掉 grade 为 NA 的行(否则会导致 strata 的 size 对应出现 NA)
df_nomiss_grade <- LoanStats3c[!is.na(LoanStats3c$grade), ]
if(nrow(df_nomiss_grade) == 0) stop('去除 NA 后没有可用于分层的数据')
# 计算各等级总量(基于去除 NA 的数据)
grade_counts <- table(df_nomiss_grade$grade)
# 仅保留数据中实际存在的等级(count > 0)
present_levels <- names(grade_counts[grade_counts > 0])
if(length(present_levels) == 0) stop('数据中没有可用的 grade 等级')
counts_present <- as.integer(grade_counts[present_levels])
names(counts_present) <- present_levels
# 若请求样本量 n 大于总体容量,调整 n
total_capacity <- sum(counts_present)
if(n > total_capacity) {
  warning(sprintf('请求样本量 n=%d 大于总体容量 %d,已将 n 调整为 %d', n, total_capacity, total_capacity))
  n <- total_capacity
}
# 按比例初始分配(仅对存在的等级)
prop_present <- counts_present / sum(counts_present)
wh_present <- as.integer(round(n * prop_present))
# 确保每层不超过该层容量,若超过则先截断
over_idx <- which(wh_present > counts_present)
if(length(over_idx) > 0) {
  wh_present[over_idx] <- counts_present[over_idx]
}
# 调整使总和等于 n:若不足则在有剩余容量的层中循环分配
alloc_sum <- sum(wh_present)
diff <- n - alloc_sum
if(diff > 0) {
  # 逐步分配剩余名额到还有容量的层,按容量从大到小循环分配
  avail <- counts_present - wh_present
  order_idx <- order(avail, decreasing = TRUE)
  i <- 1
  while(diff > 0 && sum(avail) > 0) {
    idx <- order_idx[((i - 1) %% length(order_idx)) + 1]
    if(avail[idx] > 0) {
      wh_present[idx] <- wh_present[idx] + 1
      avail[idx] <- avail[idx] - 1
      diff <- diff - 1
    }
    i <- i + 1
  }
}
# 若分配过多(diff < 0),从有多余的层中减少
if(diff < 0) {
  # 从当前分配量最多的层开始减少,直到匹配
  order_idx <- order(wh_present, decreasing = TRUE)
  i <- 1
  while(diff < 0) {
    idx <- order_idx[((i - 1) %% length(order_idx)) + 1]
    if(wh_present[idx] > 0) {
      wh_present[idx] <- wh_present[idx] - 1
      diff <- diff + 1
    }
    i <- i + 1
  }
}
names(wh_present) <- present_levels
message('各层目标样本量(仅列出存在的等级):')
print(wh_present)
# 为 strata 准备数据:按 grade 排序(strata 内部以出现顺序匹配 size)
o <- order(df_nomiss_grade$grade)
data_o <- df_nomiss_grade[o, ]
# 确保 size 的顺序与 data 中实际出现的等级顺序一致
data_levels_in_order <- unique(as.character(data_o$grade))
message('data 中实际出现的等级顺序:')
print(data_levels_in_order)
message('wh_present 的名字:')
print(names(wh_present))
# 从 wh_present 中按 data_levels_in_order 提取样本量(使用安全映射),并确保没有 NA,转换为整数
size_for_strata <- sapply(data_levels_in_order, function(l) {
  if(!is.na(l) && l %in% names(wh_present)) as.integer(wh_present[[l]]) else 0
})
names(size_for_strata) <- data_levels_in_order
# 计算每层实际容量(按 data_o 中的计数),使用 sapply 避免 NA
counts_in_order <- sapply(data_levels_in_order, function(l) sum(data_o$grade == l, na.rm = TRUE))
# 再次确保 size_for_strata 不超过每层实际容量
size_for_strata <- pmin(as.integer(size_for_strata), as.integer(counts_in_order))
message('传递给 strata 的 size 向量(按 data 中等级顺序):')
print(size_for_strata)
message('对应每层的容量(counts_in_order):')
print(counts_in_order)
# 若全部为 0,则停止
if(all(size_for_strata == 0)) stop('分配到各层的样本数均为 0,无法抽样,请检查 n 的值或 grade 分布')
# 调用 strata 进行分层无放回抽样
srp <- strata(data = data_o, stratanames = "grade", size = size_for_strata, method = "srswor")
# 提取样本并恢复原始顺序(可选)
# sr 使用原始数据框和 srp 索引来获取对应行
sr <- getdata(LoanStats3c, srp)
srs <- sr
# 更新数值列索引(供后续使用)
new_num_cols <- sapply(srs, is.numeric)
new_num_col_indices <- which(new_num_cols)
message(sprintf("分层抽样完成,样本行数 = %d", nrow(srs)))
# 查看各层实际入选数量
print(table(srs$grade))
# 输出入选样本前几行
print(head(srs))
# 可选:保存分层样本
write.csv(srs, file = "D:/OneDrive - stu.fynu.edu.cn/大四上学期/ysq-大数据探索性分析/demo2/srs_by_grade.csv", row.names = FALSE)
复制代码
各层目标样本量(仅列出存在的等级):



  A   B   C   D   E   F 
 77 132 143  92  43  13 


data 中实际出现的等级顺序:



[1] "A" "B" "C" "D" "E" "F"


wh_present 的名字:



[1] "A" "B" "C" "D" "E" "F"


传递给 strata 的 size 向量(按 data 中等级顺序):



[1]  77 132 143  92  43  13


对应每层的容量(counts_in_order):



    A     B     C     D     E     F 
36108 61935 66565 42992 20121  6223 


分层抽样完成,样本行数 = 500




  A   B   C   D   E   F 
 77 132 143  92  43  13 
           id member_id loan_amnt funded_amnt funded_amnt_inv       term
544  36019516  38721136     10000       10000           10000  36 months
560  37690957  40463819     32000       32000           32000  36 months
676  37791309  40554270      8400        8400            8400  36 months
747  37840891  40603766     28000       28000           28000  60 months
965  36733440  39476198      8000        8000            8000  36 months
1514 37840801  40603650      4500        4500            4500  36 months
     int_rate installment sub_grade             emp_title emp_length
544     7.49%      311.02        A4                 owner    8 years
560    12.39%     1068.83        C1       RN CASE MANAGER    3 years
676    14.99%      291.15        C5 Merchandising Manager    8 years
747     9.49%      587.92        B2  System Administrator  10+ years
965     7.49%      248.82        A4            Controller    3 years
1514   12.39%      150.31        C1    OPERATIONS MANAGER    8 years
     home_ownership annual_inc      verification_status issue_d loan_status
544        MORTGAGE     225000 VERIFIED - income source  Dec-14     Current
560             OWN      70000        VERIFIED - income  Dec-14     Current
676             OWN      34750 VERIFIED - income source  Dec-14     Current
747        MORTGAGE     125000 VERIFIED - income source  Dec-14     Current
965            RENT     140000 VERIFIED - income source  Dec-14     Current
1514           RENT      40000             not verified  Dec-14     Current
     pymnt_plan
544           n
560           n
676           n
747           n
965           n
1514          n
                                                                       url desc
544  https://www.lendingclub.com/browse/loanDetail.action?loan_id=36019516     
560  https://www.lendingclub.com/browse/loanDetail.action?loan_id=37690957     
676  https://www.lendingclub.com/browse/loanDetail.action?loan_id=37791309     
747  https://www.lendingclub.com/browse/loanDetail.action?loan_id=37840891     
965  https://www.lendingclub.com/browse/loanDetail.action?loan_id=36733440     
1514 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37840801     
                purpose                   title zip_code addr_state   dti
544    home_improvement        Home improvement    370xx         TN  7.79
560  debt_consolidation      Debt consolidation    769xx         TX 17.75
676  debt_consolidation      Debt consolidation    840xx         UT 25.15
747         credit_card Credit card refinancing    103xx         NY 21.00
965  debt_consolidation      Debt consolidation    864xx         AZ  4.78
1514        credit_card Credit card refinancing    072xx         NJ 18.67
     delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq
544            0           Sep-95              0                     76
560            0           Dec-99              0                     30
676            0           Jan-90              1                     30
747            0           Dec-85              0                     30
965            0           Sep-99              1                     26
1514           0           Mar-05              0                     30
     mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc
544                      75       11       1     25176     26.20%        23
560                      69       10       0     25295     45.70%        20
676                      69        9       0     11666     81.60%        10
747                      69       23       0     93879     56.40%        44
965                      69        8       0     13018     64.80%        28
1514                     69        7       0      5733     56.80%        11
     initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv
544                    f   8996.24       8996.24     1237.84         1237.84
560                    f  29000.22      29000.22     4242.28         4242.28
676                    f   7834.33       7834.33      862.96          862.96
747                    f  26516.57      26516.57     2329.54         2329.54
965                    w   7196.97       7196.97      991.95          991.95
1514                   f   4078.13       4078.13      598.14          598.14
     total_rec_prncp total_rec_int total_rec_late_fee recoveries
544          1003.76        234.08                  0          0
560          2999.78       1242.50                  0          0
676           565.67        297.29                  0          0
747          1483.43        846.11                  0          0
965           803.03        188.92                  0          0
1514          421.87        176.27                  0          0
     collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d
544                        0       Apr-15          311.02       May-15
560                        0       Apr-15         1068.83       May-15
676                        0       Apr-15          291.15       May-15
747                        0       Apr-15          587.92       May-15
965                        0       Apr-15          248.82       May-15
1514                       0       Apr-15          150.31       May-15
     last_credit_pull_d collections_12_mths_ex_med mths_since_last_major_derog
544              Apr-15                          0                          76
560              Apr-15                          0                          43
676              Apr-15                          0                          43
747              Apr-15                          0                          43
965              Apr-15                          0                          28
1514             Apr-15                          0                          43
     policy_code grade ID_unit        Prob Stratum
544            1     A     544 0.002132491       1
560            1     A     560 0.002132491       1
676            1     A     676 0.002132491       1
747            1     A     747 0.002132491       1
965            1     A     965 0.002132491       1
1514           1     A    1514 0.002132491       1
三、整群抽样
R 复制代码
#------------------ 整群抽样(稳健实现)------------------#
cluster_col <- "home_ownership"
if(!cluster_col %in% names(LoanStats3c)) {
  stop(sprintf("找不到列 %s,请检查变量名", cluster_col))
}
# 计算可用簇(去除 NA)
clusters <- unique(na.omit(LoanStats3c[[cluster_col]]))
n_clusters <- length(clusters)
requested_clusters <- 10 # 希望抽取的簇数,可修改
if(n_clusters == 0) stop(sprintf("列 %s 没有可用的簇(全部为 NA)", cluster_col))
# 将请求的簇数限制为实际可用簇数
size_clusters <- min(requested_clusters, n_clusters)
message(sprintf("簇总数 = %d, 请求 = %d, 实际将抽取 = %d", n_clusters, requested_clusters, size_clusters))
# 如果 size_clusters 为 0 则无法抽样
if(size_clusters <= 0) stop("没有可用的簇可抽取")
# 调用 cluster 函数进行抽样(无放回)
scp <- cluster(data = LoanStats3c, clustername = cluster_col, size = size_clusters, method = "srswor", description = FALSE)
sc <- getdata(LoanStats3c, scp)
# 查看抽取到的簇和样本数量
message(sprintf("抽到的簇数量 = %d, 抽样得到的行数 = %d", length(unique(scp$ID_1)), nrow(sc)))
print(head(sc))
复制代码
簇总数 = 5, 请求 = 10, 实际将抽取 = 5

抽到的簇数量 = 0, 抽样得到的行数 = 235633

抽到的簇数量 = 0, 抽样得到的行数 = 235633



                                                     id member_id loan_amnt
235630                                                   22953173     13000
235631                                                   22953173     13000
235632 Total amount funded in policy code 1: 3503840175  22953173     13000
235633  Total amount funded in policy code 2: 873663239  22953173     13000
80686                                          26170263  28642950      5000
16527                                          35226318  37907692     19200
       funded_amnt funded_amnt_inv       term int_rate installment grade
235630       13000           13000                          384.14  <NA>
235631       13000           13000                          384.14  <NA>
235632       13000           13000                          384.14  <NA>
235633       13000           13000                          384.14  <NA>
80686         5000            5000  36 months   11.67%      165.29     B
16527        19200           19200  36 months    9.49%      614.95     B
       sub_grade                       emp_title emp_length annual_inc
235630                                                           65000
235631                                                           65000
235632                                                           65000
235633                                                           65000
80686         B4 Office Administrative Assistant    5 years      35680
16527         B2                      Controller  10+ years      38400
            verification_status issue_d loan_status pymnt_plan
235630                                                        
235631                                                        
235632                                                        
235633                                                        
80686  VERIFIED - income source  Sep-14  Fully Paid          n
16527              not verified  Nov-14     Current          n
                                                                         url
235630                                                                      
235631                                                                      
235632                                                                      
235633                                                                      
80686  https://www.lendingclub.com/browse/loanDetail.action?loan_id=26170263
16527  https://www.lendingclub.com/browse/loanDetail.action?loan_id=35226318
       desc            purpose                   title zip_code addr_state
235630                                                                    
235631                                                                    
235632                                                                    
235633                                                                    
80686       debt_consolidation      Debt consolidation    757xx         TX
16527              credit_card Credit card refinancing    476xx         IN
         dti delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq
235630 17.63           0                               0                     30
235631 17.63           0                               0                     30
235632 17.63           0                               0                     30
235633 17.63           0                               0                     30
80686  28.12           0           Dec-03              0                     55
16527  28.94           0           Oct-96              0                     27
       mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc
235630                     69       11       0     11686                   24
235631                     69       11       0     11686                   24
235632                     69       11       0     11686                   24
235633                     69       11       0     11686                   24
80686                      69        7       0      3319     43.10%        12
16527                      69        9       0     19353     64.50%        30
       initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv
235630                       9823.83       9817.70    3478.720         3478.08
235631                       9823.83       9817.70    3478.720         3478.08
235632                       9823.83       9817.70    3478.720         3478.08
235633                       9823.83       9817.70    3478.720         3478.08
80686                    f      0.00          0.00    5048.625         5048.62
16527                    w  16847.53      16847.53    3064.630         3064.63
       total_rec_prncp total_rec_int total_rec_late_fee recoveries
235630         2152.30        995.42                  0          0
235631         2152.30        995.42                  0          0
235632         2152.30        995.42                  0          0
235633         2152.30        995.42                  0          0
80686          5000.00         48.62                  0          0
16527          2352.47        712.16                  0          0
       collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d
235630                       0                       420.64             
235631                       0                       420.64             
235632                       0                       420.64             
235633                       0                       420.64             
80686                        0       Oct-14         5048.63             
16527                        0       Apr-15          614.95       May-15
       last_credit_pull_d collections_12_mths_ex_med
235630                                             0
235631                                             0
235632                                             0
235633                                             0
80686              Apr-15                          0
16527              Apr-15                          0
       mths_since_last_major_derog policy_code home_ownership ID_unit Prob
235630                          43           1                 235630    1
235631                          43           1                 235631    1
235632                          43           1                 235632    1
235633                          43           1                 235633    1
80686                           55           1            ANY   80686    1
16527                           31           1       MORTGAGE   16527    1
四、系统抽样
R 复制代码
i=rep(1,N)
pik1=inclusionprobabilities(i,n) #采用等概率的系统抽样,共抽取n个样本
ssp=UPsystematic(pik1,eps=1e-6) #系统抽样的函数
ss=getdata(LoanStats3c,ssp) #入选样本
print(head(ss))
复制代码
     ID_unit       id member_id loan_amnt funded_amnt funded_amnt_inv
283      283 37751794  40514790     12000       12000           12000
755      755 37741264  40504192      7000        7000            7000
1226    1226 37740968  40503825      3600        3600            3600
1697    1697 37700609  40473393     16000       16000           16000
2168    2168 37760342  40523083     22525       22525           22525
2640    2640 37257945  40030762     25000       25000           25000
           term int_rate installment grade sub_grade            emp_title
283   36 months   11.99%      398.52     B        B5 Office Administrator
755   36 months    6.49%      214.52     A        A2   Associate Engineer
1226  36 months   13.66%      122.45     C        C3    Software Engineer
1697  36 months    9.49%      512.46     B        B2              Manager
2168  60 months   19.24%      587.29     E        E2        Senior Typist
2640  60 months   16.49%      614.48     D        D3       Parole Officer
     emp_length home_ownership annual_inc      verification_status issue_d
283     7 years           RENT      68200 VERIFIED - income source  Dec-14
755    < 1 year           RENT      65000             not verified  Dec-14
1226   < 1 year           RENT      67000             not verified  Dec-14
1697    3 years       MORTGAGE      90000             not verified  Dec-14
2168  10+ years           RENT      49000        VERIFIED - income  Dec-14
2640    2 years           RENT      60000        VERIFIED - income  Dec-14
     loan_status pymnt_plan
283      Current          n
755      Current          n
1226     Current          n
1697     Current          n
2168     Current          n
2640     Current          n
                                                                       url desc
283  https://www.lendingclub.com/browse/loanDetail.action?loan_id=37751794     
755  https://www.lendingclub.com/browse/loanDetail.action?loan_id=37741264     
1226 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37740968     
1697 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37700609     
2168 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37760342     
2640 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37257945     
                purpose                   title zip_code addr_state   dti
283               other                   Other    944xx         CA 17.75
755  debt_consolidation      Debt consolidation    100xx         NY  7.94
1226        credit_card Credit card refinancing    276xx         NC 21.73
1697 debt_consolidation      Debt consolidation    982xx         WA 25.92
2168        credit_card Credit card refinancing    120xx         NY 33.60
2640 debt_consolidation      Debt consolidation    995xx         AK 38.66
     delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq
283            0           Jun-93              1                     30
755            0           Feb-01              0                     30
1226           0           Feb-89              0                     29
1697           0           May-05              2                     30
2168           0           May-94              0                     30
2640           0           Aug-01              1                     30
     mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc
283                      69        8       0     17695     78.60%        14
755                      69       12       0     10294     38.30%        39
1226                     69       12       0      7191     53.30%        14
1697                     69       16       0     13537     42.40%        29
2168                     69       12       0     28745     96.50%        18
2640                     69       17       0     16058     49.90%        54
     initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv
283                    f  10868.71      10868.71     1578.09         1578.09
755                    f   6287.60       6287.60      854.29          854.29
1226                   f   3268.51       3268.51      484.34          484.34
1697                   f  14437.88      14437.88     2032.97         2032.97
2168                   w  21598.46      21598.46     2325.08         2325.08
2640                   w  23893.70      23893.70     2435.02         2435.02
     total_rec_prncp total_rec_int total_rec_late_fee recoveries
283          1131.29        446.80                  0          0
755           712.40        141.89                  0          0
1226          331.49        152.85                  0          0
1697         1562.12        470.85                  0          0
2168          926.54       1398.54                  0          0
2640         1106.30       1328.72                  0          0
     collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d
283                        0       Apr-15          398.52       May-15
755                        0       Apr-15          214.52       May-15
1226                       0       Apr-15          122.45       May-15
1697                       0       Apr-15          512.46       May-15
2168                       0       Apr-15          587.29       May-15
2640                       0       Apr-15          614.48       May-15
     last_credit_pull_d collections_12_mths_ex_med mths_since_last_major_derog
283              Apr-15                          0                          43
755              Apr-15                          0                          43
1226             Apr-15                          0                          43
1697             Apr-15                          0                          43
2168             Apr-15                          0                          43
2640             Apr-15                          0                          43
     policy_code
283            1
755            1
1226           1
1697           1
2168           1
2640           1
五、多阶段两阶段抽样
R 复制代码
#------------------ 两阶段抽样(手工实现,替代 mstage)------------------#
# 说明:先抽取若干主簇(cluster),然后在每个被抽中的簇内抽取二级样本。
# 参数(可调整)
primary_cluster_col <- "home_ownership"  # 主簇变量名(请根据数据替换)
primary_k <- 3                             # 希望抽取的主簇数
# 二级样本总目标为 n(之前定义),按主簇均分
secondary_total <- n
# 清洗并验证主簇列
if(!primary_cluster_col %in% names(LoanStats3c)) stop(sprintf("找不到列 %s,请检查字段名", primary_cluster_col))
LoanStats3c[[primary_cluster_col]] <- trimws(as.character(LoanStats3c[[primary_cluster_col]]))
LoanStats3c[[primary_cluster_col]] <- ifelse(LoanStats3c[[primary_cluster_col]]=="", NA, LoanStats3c[[primary_cluster_col]])
clusters <- unique(na.omit(LoanStats3c[[primary_cluster_col]]))
n_clusters <- length(clusters)
if(n_clusters == 0) stop(sprintf("列 %s 没有可用的簇(全部为 NA 或空字符串)", primary_cluster_col))
# 调整主簇数量不超过可用簇数
primary_k <- min(primary_k, n_clusters)
message(sprintf("可用簇数量 = %d,计划抽取主簇 = %d", n_clusters, primary_k))
# 随机抽取主簇(无放回)
set.seed(123) # 可选固定随机种子,便于重现
primary_selected <- sample(clusters, primary_k)
message('抽中的主簇:')
print(primary_selected)
# 为每个主簇计算二级样本目标(均分 + 余数分配)
base_sec <- floor(secondary_total / primary_k)
sec_sizes <- rep(base_sec, primary_k)
rem <- secondary_total - sum(sec_sizes)
if(rem > 0) {
  for(i in seq_len(rem)) sec_sizes[i] <- sec_sizes[i] + 1
}
names(sec_sizes) <- primary_selected
message('每个被抽中簇的目标二级样本数(可能被截断至簇容量):')
print(sec_sizes)
# 在每个被抽中簇内抽样
sampled_rows <- integer(0)
per_cluster_actual <- integer(length(primary_selected))
for(i in seq_along(primary_selected)){
  cl <- primary_selected[i]
  rows_in_cl <- which(LoanStats3c[[primary_cluster_col]] == cl)
  cap <- length(rows_in_cl)
  target <- sec_sizes[i]
  if(cap == 0) {
    per_cluster_actual[i] <- 0
    next
  }
  take <- min(target, cap)
  # 若需要全部抽取可直接取全部
  chosen <- sample(rows_in_cl, take, replace = FALSE)
  sampled_rows <- c(sampled_rows, chosen)
  per_cluster_actual[i] <- length(chosen)
}
# 生成结果对象
mss <- LoanStats3c[sampled_rows, ]
ms <- list(primary_selected = primary_selected, per_cluster_target = sec_sizes, per_cluster_actual = per_cluster_actual)
message(sprintf('两阶段抽样完成:共抽取 %d 行样本', nrow(mss)))
message('各簇实际抽取数:')
print(setNames(per_cluster_actual, primary_selected))
# 保存或查看部分样本
print(head(mss))
# 可选保存
# write.csv(mss, file = "D:/OneDrive - stu.fynu.edu.cn/大四上学期/ysq-大数据探索性分析/demo2/multistage_sample.csv", row.names = FALSE)
复制代码
可用簇数量 = 4,计划抽取主簇 = 3

抽中的主簇:

抽中的主簇:



[1] "OWN"  "ANY"  "RENT"


每个被抽中簇的目标二级样本数(可能被截断至簇容量):



 OWN  ANY RENT 
 167  167  166 


两阶段抽样完成:共抽取 334 行样本

各簇实际抽取数:

各簇实际抽取数:



 OWN  ANY RENT 
 167    1  166 
             id member_id loan_amnt funded_amnt funded_amnt_inv       term
28691  34442221  37105507      5000        5000            5000  36 months
17467  34874813  37558157      8200        8200            8200  36 months
32452  33230956  35874247     13050       13050           13050  60 months
114764 21370073  23682984      9300        9300            9300  36 months
45358  31367396  33940619     28000       28000           28000  60 months
65397  27650347  30153412     10000       10000           10000  36 months
       int_rate installment grade sub_grade            emp_title emp_length
28691    11.99%      166.05     B        B5         truck driver    7 years
17467    14.99%      284.22     C        C5     flight attendant     1 year
32452    19.52%      342.27     E        E2 Asst. Vice President    9 years
114764   16.99%      331.53     D        D3          lab tech II    5 years
45358    13.35%      642.12     C        C2 Staff Civil Engineer   < 1 year
65397    14.99%      346.61     C        C5  Computer Specialist  10+ years
       home_ownership annual_inc      verification_status issue_d loan_status
28691             OWN      70000             not verified  Nov-14     Current
17467             OWN      23000 VERIFIED - income source  Nov-14     Current
32452             OWN      45000 VERIFIED - income source  Nov-14     Current
114764            OWN      47544 VERIFIED - income source  Jul-14     Current
45358             OWN      73000 VERIFIED - income source  Oct-14     Current
65397             OWN      95000 VERIFIED - income source  Oct-14     Current
       pymnt_plan
28691           n
17467           n
32452           n
114764          n
45358           n
65397           n
                                                                         url
28691  https://www.lendingclub.com/browse/loanDetail.action?loan_id=34442221
17467  https://www.lendingclub.com/browse/loanDetail.action?loan_id=34874813
32452  https://www.lendingclub.com/browse/loanDetail.action?loan_id=33230956
114764 https://www.lendingclub.com/browse/loanDetail.action?loan_id=21370073
45358  https://www.lendingclub.com/browse/loanDetail.action?loan_id=31367396
65397  https://www.lendingclub.com/browse/loanDetail.action?loan_id=27650347
       desc            purpose              title zip_code addr_state   dti
28691         home_improvement   Home improvement    604xx         IL 20.69
17467       debt_consolidation Debt consolidation    410xx         KY 34.13
32452       debt_consolidation Debt consolidation    330xx         FL 32.75
114764      debt_consolidation Debt consolidation    631xx         MO 18.60
45358       debt_consolidation Debt consolidation    193xx         PA 11.61
65397       debt_consolidation Debt consolidation    114xx         NY  4.90
       delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq
28691            0           Jun-01              0                     56
17467            1           Feb-99              3                     17
32452            0           Oct-04              0                     30
114764           0           Jun-96              1                     30
45358            0           Feb-02              0                     30
65397            4           Apr-93              0                      1
       mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc
28691                      69        6       0      5057     84.30%        18
17467                      69        9       0     10573     41.80%        16
32452                      69       28       0     12713     88.90%        49
114764                     60       18       1      7255     34.70%        51
45358                      69       15       0      8689     25.70%        33
65397                      69       10       0      4999     27.20%        45
       initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv
28691                    f   4407.83       4407.83      826.92          826.92
17467                    w   7268.06       7268.06     1414.27         1414.27
32452                    w  12378.56      12378.56     1697.20         1697.20
114764                   w   7395.98       7395.98     2983.77         2983.77
45358                    w  25960.28      25960.28     3831.95         3831.95
65397                    w   8627.60       8627.60     2079.66         2079.66
       total_rec_prncp total_rec_int total_rec_late_fee recoveries
28691           592.17        234.75                  0          0
17467           931.94        482.33                  0          0
32452           671.44       1025.76                  0          0
114764         1904.02       1079.75                  0          0
45358          2039.72       1792.23                  0          0
65397          1372.40        707.26                  0          0
       collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d
28691                        0       Apr-15          166.05       May-15
17467                        0       Apr-15          284.22       May-15
32452                        0       Apr-15          342.27       May-15
114764                       0       Apr-15          331.53       May-15
45358                        0       Apr-15          642.12       May-15
65397                        0       Apr-15          346.61       May-15
       last_credit_pull_d collections_12_mths_ex_med
28691              Apr-15                          0
17467              Apr-15                          0
32452              Apr-15                          0
114764             Apr-15                          0
45358              Apr-15                          0
65397              Apr-15                          0
       mths_since_last_major_derog policy_code
28691                           43           1
17467                           17           1
32452                           43           1
114764                          43           1
45358                           43           1
65397                            1           1
             id member_id loan_amnt funded_amnt funded_amnt_inv       term
28691  34442221  37105507      5000        5000            5000  36 months
17467  34874813  37558157      8200        8200            8200  36 months
32452  33230956  35874247     13050       13050           13050  60 months
114764 21370073  23682984      9300        9300            9300  36 months
45358  31367396  33940619     28000       28000           28000  60 months
65397  27650347  30153412     10000       10000           10000  36 months
       int_rate installment grade sub_grade            emp_title emp_length
28691    11.99%      166.05     B        B5         truck driver    7 years
17467    14.99%      284.22     C        C5     flight attendant     1 year
32452    19.52%      342.27     E        E2 Asst. Vice President    9 years
114764   16.99%      331.53     D        D3          lab tech II    5 years
45358    13.35%      642.12     C        C2 Staff Civil Engineer   < 1 year
65397    14.99%      346.61     C        C5  Computer Specialist  10+ years
       home_ownership annual_inc      verification_status issue_d loan_status
28691             OWN      70000             not verified  Nov-14     Current
17467             OWN      23000 VERIFIED - income source  Nov-14     Current
32452             OWN      45000 VERIFIED - income source  Nov-14     Current
114764            OWN      47544 VERIFIED - income source  Jul-14     Current
45358             OWN      73000 VERIFIED - income source  Oct-14     Current
65397             OWN      95000 VERIFIED - income source  Oct-14     Current
       pymnt_plan
28691           n
17467           n
32452           n
114764          n
45358           n
65397           n
                                                                         url
28691  https://www.lendingclub.com/browse/loanDetail.action?loan_id=34442221
17467  https://www.lendingclub.com/browse/loanDetail.action?loan_id=34874813
32452  https://www.lendingclub.com/browse/loanDetail.action?loan_id=33230956
114764 https://www.lendingclub.com/browse/loanDetail.action?loan_id=21370073
45358  https://www.lendingclub.com/browse/loanDetail.action?loan_id=31367396
65397  https://www.lendingclub.com/browse/loanDetail.action?loan_id=27650347
       desc            purpose              title zip_code addr_state   dti
28691         home_improvement   Home improvement    604xx         IL 20.69
17467       debt_consolidation Debt consolidation    410xx         KY 34.13
32452       debt_consolidation Debt consolidation    330xx         FL 32.75
114764      debt_consolidation Debt consolidation    631xx         MO 18.60
45358       debt_consolidation Debt consolidation    193xx         PA 11.61
65397       debt_consolidation Debt consolidation    114xx         NY  4.90
       delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq
28691            0           Jun-01              0                     56
17467            1           Feb-99              3                     17
32452            0           Oct-04              0                     30
114764           0           Jun-96              1                     30
45358            0           Feb-02              0                     30
65397            4           Apr-93              0                      1
       mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc
28691                      69        6       0      5057     84.30%        18
17467                      69        9       0     10573     41.80%        16
32452                      69       28       0     12713     88.90%        49
114764                     60       18       1      7255     34.70%        51
45358                      69       15       0      8689     25.70%        33
65397                      69       10       0      4999     27.20%        45
       initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv
28691                    f   4407.83       4407.83      826.92          826.92
17467                    w   7268.06       7268.06     1414.27         1414.27
32452                    w  12378.56      12378.56     1697.20         1697.20
114764                   w   7395.98       7395.98     2983.77         2983.77
45358                    w  25960.28      25960.28     3831.95         3831.95
65397                    w   8627.60       8627.60     2079.66         2079.66
       total_rec_prncp total_rec_int total_rec_late_fee recoveries
28691           592.17        234.75                  0          0
17467           931.94        482.33                  0          0
32452           671.44       1025.76                  0          0
114764         1904.02       1079.75                  0          0
45358          2039.72       1792.23                  0          0
65397          1372.40        707.26                  0          0
       collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d
28691                        0       Apr-15          166.05       May-15
17467                        0       Apr-15          284.22       May-15
32452                        0       Apr-15          342.27       May-15
114764                       0       Apr-15          331.53       May-15
45358                        0       Apr-15          642.12       May-15
65397                        0       Apr-15          346.61       May-15
       last_credit_pull_d collections_12_mths_ex_med
28691              Apr-15                          0
17467              Apr-15                          0
32452              Apr-15                          0
114764             Apr-15                          0
45358              Apr-15                          0
65397              Apr-15                          0
       mths_since_last_major_derog policy_code
28691                           43           1
17467                           17           1
32452                           43           1
114764                          43           1
45358                           43           1
65397                            1           1
六、不等概抽样
R 复制代码
vol=LoanStats3c$total_acc
pik=inclusionprobabilities(vol,n) #以成交量为样本规模求出包含概率
usp=UPmidzuno(pik) #用Midzuno方法进行不等概率抽样
us=getdata(LoanStats3c,usp) #入选样本
print(head(us))
复制代码
     ID_unit       id member_id loan_amnt funded_amnt funded_amnt_inv
317      317 37800583  40563367     11875       11875           11875
880      880 37811222  40574134      3000        3000            3000
1187    1187 37661052  40423957     12300       12300           12300
1435    1435 37690890  40463735     25000       25000           25000
2732    2732 37098036  39860860      2000        2000            2000
3000    3000 37077808  39840608     14700       14700           14700
           term int_rate installment grade sub_grade
317   36 months   14.31%      407.65     C        C4
880   36 months   14.99%      103.99     C        C5
1187  36 months    6.49%      376.93     A        A2
1435  36 months    6.03%      760.89     A        A1
2732  36 months   12.99%       67.38     C        C2
3000  36 months   14.99%      509.51     C        C5
                           emp_title emp_length home_ownership annual_inc
317                        admission   < 1 year            OWN   55000.00
880  Contractor Installation Manager    3 years            OWN   63000.00
1187                  office manager   < 1 year            OWN   54000.00
1435    Director, Techincal Services  10+ years       MORTGAGE  160000.00
2732   Client Response Communication    9 years       MORTGAGE   26583.07
3000                          Writer   < 1 year           RENT  100000.00
          verification_status issue_d loan_status pymnt_plan
317              not verified  Dec-14     Current          n
880              not verified  Dec-14     Current          n
1187 VERIFIED - income source  Dec-14     Current          n
1435        VERIFIED - income  Dec-14     Current          n
2732             not verified  Dec-14     Current          n
3000             not verified  Dec-14     Current          n
                                                                       url desc
317  https://www.lendingclub.com/browse/loanDetail.action?loan_id=37800583     
880  https://www.lendingclub.com/browse/loanDetail.action?loan_id=37811222     
1187 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37661052     
1435 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37690890     
2732 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37098036     
3000 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37077808     
                purpose                   title zip_code addr_state   dti
317         credit_card Credit card refinancing    921xx         CA 26.23
880              moving   Moving and relocation    997xx         AK 17.05
1187 debt_consolidation      Debt consolidation    217xx         MD 18.80
1435        credit_card Credit card refinancing    603xx         IL 15.54
2732              other                   Other    844xx         UT 11.87
3000        credit_card Credit card refinancing    100xx         NY  8.07
     delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq
317            0           Feb-02              1                     30
880            0           Jul-99              1                     41
1187           0           Jan-01              0                     30
1435           0           Dec-94              0                     30
2732           0           Dec-00              0                     30
3000           0           Jan-66              0                     40
     mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc
317                      86       28       1     19422     49.40%        46
880                      69       20       0      3390     51.40%        52
1187                     69       11       0      9839     32.50%        27
1435                     69        9       0     36538     62.90%        27
2732                     99        6       1      8675     70.50%        14
3000                     69        7       0     32014     87.20%        34
     initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv
317                    w  10791.65      10791.65     1607.00         1607.00
880                    f   2728.91       2728.91      413.46          413.46
1187                   w  11048.26      11048.26     1503.29         1503.29
1435                   f  22439.72      22439.72     3035.18         3035.18
2732                   f   1814.09       1814.09      268.08          268.08
3000                   f  13371.84      13371.84     2025.80         2025.80
     total_rec_prncp total_rec_int total_rec_late_fee recoveries
317          1083.35        523.65                  0          0
880           271.09        142.37                  0          0
1187         1251.74        251.55                  0          0
1435         2560.28        474.90                  0          0
2732          185.91         82.17                  0          0
3000         1328.16        697.64                  0          0
     collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d
317                        0       Apr-15          407.65       May-15
880                        0       Apr-15          103.99       May-15
1187                       0       Apr-15          376.93       May-15
1435                       0       Apr-15          760.89       May-15
2732                       0       Apr-15           67.38       May-15
3000                       0       Apr-15          509.51       May-15
     last_credit_pull_d collections_12_mths_ex_med mths_since_last_major_derog
317              Apr-15                          0                          43
880              Apr-15                          0                          51
1187             Apr-15                          0                          43
1435             Apr-15                          0                          43
2732             Apr-15                          0                          43
3000             Apr-15                          0                          43
     policy_code
317            1
880            1
1187           1
1435           1
2732           1
3000           1
七、二重抽样
R 复制代码
#第一重抽样采用简单随机抽样确定层权,第二重抽样采用分层抽样
srsp1=srswor(3000,N)
srs1=getdata(LoanStats3c,srsp1) #第一重抽样
srsp2=srswor(n,3000)
srs2=getdata(srs1,srsp2) #第二重抽样
print(head(srs2))
复制代码
     ID_unit       id member_id loan_amnt funded_amnt funded_amnt_inv
175      175 37641820  40404842     24000       24000           24000
407      407 37771625  40534643      7150        7150            7150
444      444 37661489  40424497     20000       20000           20000
529      529 37821449  40584421      6000        6000            6000
1180    1180 37670980  40433849     10000       10000           10000
1688    1688 37820639  40583405     12000       12000           12000
           term int_rate installment grade sub_grade
175   36 months    6.03%      730.46     A        A1
407   36 months   17.14%      255.42     D        D4
444   36 months   11.99%      664.20     B        B5
529   36 months   10.49%      194.99     B        B3
1180  60 months   14.99%      237.85     C        C5
1688  36 months   11.99%      398.52     B        B5
                               emp_title emp_length home_ownership annual_inc
175                    Captain/Paramedic  10+ years       MORTGAGE     120000
407             Customer Care Specialust   < 1 year           RENT      30000
444                Operations Management   < 1 year       MORTGAGE      85000
529                        Sales Manager  10+ years           RENT     100000
1180 Policies & Procedures Administrator  10+ years       MORTGAGE      82000
1688         Safety and Security Offider   < 1 year       MORTGAGE      85000
          verification_status issue_d loan_status pymnt_plan
175  VERIFIED - income source  Dec-14     Current          n
407  VERIFIED - income source  Dec-14     Current          n
444  VERIFIED - income source  Dec-14  Fully Paid          n
529              not verified  Dec-14     Current          n
1180        VERIFIED - income  Dec-14     Current          n
1688 VERIFIED - income source  Dec-14     Current          n
                                                                       url desc
175  https://www.lendingclub.com/browse/loanDetail.action?loan_id=37641820     
407  https://www.lendingclub.com/browse/loanDetail.action?loan_id=37771625     
444  https://www.lendingclub.com/browse/loanDetail.action?loan_id=37661489     
529  https://www.lendingclub.com/browse/loanDetail.action?loan_id=37821449     
1180 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37670980     
1688 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37820639     
                purpose                   title zip_code addr_state   dti
175         credit_card Credit card refinancing    957xx         CA 19.94
407  debt_consolidation      Debt consolidation    284xx         NC  9.52
444         credit_card Credit card refinancing    597xx         MT 11.00
529  debt_consolidation      Debt consolidation    917xx         CA  9.99
1180        credit_card Credit card refinancing    801xx         CO 10.08
1688 debt_consolidation      Debt consolidation    063xx         CT  7.71
     delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq
175            1           Feb-98              0                      8
407            0           Oct-11              2                     30
444            0           Jul-04              0                     30
529            0           Nov-97              1                     59
1180           2           Oct-01              2                      2
1688           0           Jul-99              0                     45
     mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc
175                      69        9       0     32544     46.70%        26
407                      69       12       0      7239     55.30%        14
444                      69        5       0     19542     97.70%         7
529                      69       12       0      5914     32.10%        18
1180                     69       21       0      7952        27%        54
1688                     69        9       0      5143     57.10%        25
     initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv
175                    w  22161.21      22161.21     2179.32         2179.32
407                    f   6523.56       6523.56     1011.47         1011.47
444                    f      0.00          0.00    20470.31        20470.31
529                    f   5422.32       5422.32      774.72          774.72
1180                   w   9539.73       9539.73      943.07          943.07
1688                   f  10868.71      10868.71     1578.09         1578.09
     total_rec_prncp total_rec_int total_rec_late_fee recoveries
175          1838.79        340.53                  0          0
407           626.44        385.03                  0          0
444         20000.00        470.31                  0          0
529           577.68        197.04                  0          0
1180          460.27        482.80                  0          0
1688         1131.29        446.80                  0          0
     collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d
175                        0       Apr-15          730.46       May-15
407                        0       Apr-15          255.42       May-15
444                        0       Feb-15        19826.09             
529                        0       Apr-15          194.99       May-15
1180                       0       Apr-15          237.85       May-15
1688                       0       Apr-15          398.52       May-15
     last_credit_pull_d collections_12_mths_ex_med mths_since_last_major_derog
175              Apr-15                          0                          43
407              Apr-15                          0                          43
444              Mar-15                          0                          43
529              Apr-15                          0                          59
1180             Apr-15                          0                          43
1688             Apr-15                          0                          45
     policy_code
175            1
407            1
444            1
529            1
1180           1
1688           1
相关推荐
伍哥的传说2 小时前
Lodash-es 完整开发指南:ES模块化JavaScript工具库实战教程
大数据·javascript·elasticsearch·lodash-es·javascript工具库·es模块·按需导入
请提交用户昵称3 小时前
大数据各组件flume,datax,presto,DolphinScheduler,findBI在大数据数仓架构中的作用和功能。
大数据·flume·datax·dolphin·presto·findbi·大数据组件
IT果果日记3 小时前
详解DataX开发达梦数据库插件
大数据·数据库·后端
用户Taobaoapi20144 小时前
微店API秘籍!轻松获取商品详情数据
大数据·数据挖掘·数据分析
chimchim664 小时前
StarRocks导入数据-使用 Broker Load 进行异步导入
大数据·sql
iGarment5 小时前
服装采购跟单系统的高效管理实践
大数据·经验分享·云计算
闯闯桑5 小时前
Spark 中spark.implicits._ 中的 toDF和DataFrame 类本身的 toDF 方法
大数据·ajax·spark·scala
阿里云大数据AI技术6 小时前
【跨国数仓迁移实践9】dbt‑maxcompute 在 GoTerra 迁移过程中的落地与技术贡献
大数据
的小姐姐6 小时前
RMS设备检修管理系统_HawkEye智能运维平台_璞华大数据
大数据·运维