R语言高效数据处理-变量批量统计检验

A：以下的4个检验都会自动判断数据里面的数据类型，然后选择对应的变量作统计检验；

B：检验结果中有的是捕获了检验警告信息的，帮助选择检验结果数据；

C：所有检验均只返回统计检验结果数据，不判断使用哪种检验结果作为最终需要的数据，有需要这一层判断自行添加逻辑

1、单样本检验

R 复制代码

#date_base为需要检验变量的数据，虽然单样本检验为连续型变量，
#但这里不用去选择变量的数据类型，函数里面会去判断
one_independent_test <- function(data_base){
  data_base %>% select(where(is.double)) %>% 
    imap(~ {
      shapiro_p <- shapiro.test(.x)$p.value
      t_or_wil_test=if_else(shapiro.test(.x)$p.value > 0.05,
                            t.test(.x) %>% tidy(),
                            wilcox.test(.x) %>% tidy())
      bind_cols(tibble(shapiro_p,t_or_wil_test))
    }
    ) %>% 
    bind_rows( .id = "variable")
}

2、两独立样本检验

R 复制代码

#date_base为需要检验变量的数据，group_var为分组的变量，
#虽然两独立样本检验为连续型变量，但这里不用去选择变量的数据类型，函数里面会去判断
two_independent_test <- function(data_base,group_var){
  group_vec <- data_base %>% pull({{group_var}})
  test_var_list=data_base %>% select(where(is.double))
  all_test_res=test_var_list %>% 
    imap(~{
      shapiro_res=data_base %>%
        group_by({{group_var}}) %>%
    summarise(
      test_result = list(shapiro.test(!!sym(.y)))) %>%
    mutate(
      results = map(test_result, tidy)) %>%
    unnest(results)
  levene_res=leveneTest(.x~group_vec,data = data_base)%>% tidy() %>%
    mutate(method="Levene's Test for Homogeneity")
  t_res_vareaual=t.test(.x~group_vec,data = data_base,var.equal=TRUE) %>% tidy()
  t_res_varuneaual=t.test(.x~group_vec,data = data_base,var.equal=FALSE) %>% tidy()
  wilcox_res=wilcox.test(.x~group_vec,data = data_base)%>% tidy()
  per_test_res=bind_rows(shapiro_res,
                         levene_res,
                         t_res_vareaual,
                         t_res_varuneaual,
                         wilcox_res)
  }
  )%>%
    bind_rows(.id = "variable")
}

3、配对样本检验

R 复制代码

#date_base为需要检验变量的数据，
#配对样本检验为连续型变量，这里不用去选择变量的数据类型，函数里面会去判断，
#但需要配对检验的变量在判断完数据类型后是相邻的，否则得到结果不是预期的
two_paired_test <- function(data_base){
  test_var_list=data_base %>% select(where(is.double)) %>% names()
  var_pair_former=head(test_var_list,-1)
  var_pair_latter=tail(test_var_list,-1)
  names(var_pair_former) <- var_pair_former
  names(var_pair_latter) <- var_pair_latter
  all_test_res=map2(var_pair_former,var_pair_latter,~{
    x <- data_base[[.x]]
    y <- data_base[[.y]]
    z <- x-y
      shapiro_res=bind_rows(shapiro.test(x) %>% tidy() %>% mutate(variable_format=.x),
                            shapiro.test(y) %>% tidy() %>% mutate(variable_format=.y),
                            shapiro.test(z) %>% tidy() %>% mutate(variable_format=str_c('diff',.x,'-',.y)))
      t_paired_res=t.test(x,y,data = data_base,paired=TRUE) %>%
      tidy() %>%  
      mutate(variable_format=paste0(.x,'~',.y))
      wilcox_paired_res=wilcox.test(x, y,data = data_base,paired=TRUE)%>% 
      tidy() %>% 
      mutate(variable_format=paste0(.x,'~',.y))
      per_test_paired_res=bind_rows(
        shapiro_res,
        t_paired_res,
        wilcox_paired_res)
  }
    )%>%
    bind_rows( .id= "variable")
}

4、分类样本检验

R 复制代码

#date_base为需要检验变量的数据，group_var为分组变量
#分类样本检验为分类变量，这里不用去选择变量的数据类型，函数里面会去判断
multi_category_test <- function(data_base,group_var){
  test_var_list=data_base %>% select(where(is.character)) %>%
  select(-{{group_var}}) %>% names()
  names(test_var_list) <- test_var_list
  all_test_res=test_var_list %>% 
    imap(~{
      var_distinct=data_base %>% select(.x) %>% distinct()
      var_distinct_mum=data_base %>% select(.x) %>%n_distinct()
      data_base_nomiss=if(str_detect(var_distinct,"missing")&var_distinct_mum==2){
        data_base %>%
          filter(is.na(!!sym(.x))==FALSE)
      }else{
        data_base %>%
          filter(!!sym(.x)!='missing') %>%
          filter(is.na(!!sym(.x))==FALSE)
      }
      chisq_base_table=data_base_nomiss %>%
        tabyl(!!sym(.x),{{group_var}})
      var_distinct_mum_deal=data_base_nomiss %>% select(.x) %>%n_distinct()
      warn_msg <- NA_character_
      chisq_not_correct_res=if(var_distinct_mum_deal==1){
        return(tibble(
          test_tag = NA_character_,
          method = NA_character_,
          p.value = NA_real_,
          statistic = NA_real_,
          warning_message = "Only one unique value"))
      }else{
        withCallingHandlers(
        {
          test_res <- chisq.test(chisq_base_table, correct = F) %>% 
          tidy()%>% remove_rownames()
        },
        warning = function(w) {
          warn_msg <<- conditionMessage(w)
          invokeRestart("muffleWarning")  # 阻止警告打印
        }
      )}
      chisq_not_correct_res$warning_message <- warn_msg
      chisq_not_correct=chisq.test(chisq_base_table,correct=F)
      chisq_not_correct_observed=chisq_not_correct$observed %>%
        mutate(test_tag='observed')%>% select(-.x)
      print(chisq_not_correct_observed)
      chisq_not_correct_expected=chisq_not_correct$expected %>%
        mutate(test_tag='expected')%>% select(-.x)
      chisq_correct_res=chisq.test(chisq_base_table,correct=T) %>%
      tidy()%>% remove_rownames()
      fisher_test_res=fisher.test(chisq_base_table) %>% tidy()%>% remove_rownames()

      per_test_res=bind_rows(
                             chisq_not_correct_res,
                             chisq_not_correct_observed,
                             chisq_not_correct_expected,
                             chisq_correct_res,
                             fisher_test_res)
    }
    ) %>% 
    bind_rows(.id = "variable") %>% remove_rownames()
}