最近在做一个因子的驱动力分析。我参考了相关文献,写了个函数,希望能帮助到大家。
python
library(ggplot2)
library(tidyverse)
library(randomForest)
library(devtools)
library(rfUtilities)
library(rfPermute)
library(forcats)
rfCalulate2 <- function(data_excel,dep_var,formulaTXT,imagename){
# 定义变量名映射表 - 在这里自定义变量显示名称
var_name_mapping <- c(
"npp00_23" = "NPP",
"FVC00_23" = "植被覆盖度",
"GDP00_23" = "GDP",
"pop00_23" = "人口",
"tem00_23" = "温度",
"pre00_23" = "降雨",
#"wind00_23" = "风速",
"forest00_23" = "森林面积",
"nature00_23" = "自然生境面积",
"wetland00_23" = "湿地面积",
"csh00_23" = "城市化率",
"TGDP00_23" = "旅游收入",
"Tpop00_23" = "旅游人口",
"WZL00_23" = "农业产品",
"WZC00_23" = "林产品",
"WZM00_23" = "牧产品",
"WZY00_23" = "渔产品",
"WZS00_23" = "生态能源",
"WZW00_23" = "水资源"
)
# 定义因变量显示名称
dep_var_names <- c(
"GEP00_23" = "Gross Ecosystem Product",
"TJ00_23" = "调节服务",
"WH00_23" = "文化服务",
"WZ00_23" = "物质供给"
)
mydata <- read_xlsx(data_excel) %>% # 读取Excel数据
#select(4:30) %>%# 获取前八列作为分析变量
#log1p() %>% # 对数变换 有负数的话就会存在nodata值
na.omit() %>% # 移除缺失值
as.data.frame() # 转换为数据框
# 打印NaNs值的行
mydata <- as.data.frame(mydata)
set.seed(123)
# for i in ["TJ00_23", "WH00_23", "WZ00_23", "GEP00_23"]:
# print(i)
formula_str <- paste(dep_var, formulaTXT)#+ wind00_23 + csh00_23
treat_rf <- randomForest(as.formula(formula_str), data = mydata, importance = TRUE, proximity = TRUE)
# 置换检验显著性
set.seed(123)
treat_perm <- rf.significance(treat_rf, mydata, nperm = 99, ntree = 500)
treat_perm
# 检查 treat_perm 的结构,提取 p-value
p_value <- as.numeric(treat_perm$pValue) # 如果出错,请用 `str(treat_perm)` 确认字段名
# 如果 p-value 提取失败,提示检查数据结构
if (is.na(p_value)) {
stop("Unable to extract or convert p-value. Check treat_perm structure.")
}
# 提取模型的 R-squared
r_squared <- round(treat_rf$rsq[500], 4)
# 变量重要性分析并转换数据
set.seed(123)
ANPP_rfP <- rfPermute(as.formula(formula_str), data = mydata, ntree = 500, na.action = na.omit, nrep = 100, num.cores = 1)
ANPP_dat <- importance(ANPP_rfP, sort.by = NULL, decreasing = TRUE)
# 数据转换并添加显著性标记
plot_data <- ANPP_dat %>%
as_tibble(rownames = "names") %>%
data.frame() %>%
mutate(
label = if_else(X.IncMSE.pval < 0.001, "***",
if_else(X.IncMSE.pval < 0.01, "**",
if_else(X.IncMSE.pval < 0.05, "*", "ns"))),
X.IncMSE = as.numeric(X.IncMSE),
group = if_else(label == "ns", "Non-Significant", "Significant") ) %>%
arrange(X.IncMSE) %>% # 按 X.IncMSE 值升序排序
mutate(names = factor(names, levels = names)) # 确保因子水平顺序为升序
# 自定义颜色
custom_colors <- c("Non-Significant" = "#FD7D6D", "Significant" = "#008CB7")
# 应用变量名重命名
plot_data <- plot_data %>%
mutate(
display_names = ifelse(
names %in% names(var_name_mapping),
var_name_mapping[as.character(names)],
as.character(names)
),
display_names = factor(display_names, levels = unique(display_names))
)
# 自定义颜色
custom_colors <- c("Non-Significant" = "#FD7D6D", "Significant" = "#008CB7")
# 获取因变量显示名称
dep_display_name <- ifelse(dep_var %in% names(dep_var_names),
dep_var_names[dep_var], dep_var)
# 绘图
p <- ggplot(plot_data, aes(x = display_names, y = X.IncMSE, fill = group)) +
geom_bar(stat = "identity") + scale_fill_manual(values = custom_colors) +
geom_text(aes(y = X.IncMSE + 1, label = label), hjust = 0, size = 3) +
labs(
x = "",
y = "%IncMSE",
fill = "Significance",
title = paste("Variable Importance for", dep_display_name)
) +
coord_flip() +
theme_bw() +
theme(legend.position = "right") + # 在右下角添加注释
annotate("text", x = 1, y = max(plot_data$X.IncMSE) * 0.5, hjust = 0,
label=paste0("p-value:",round(p_value,4),"\nR-squared:",r_squared))
# 保存图形(可选)
ggsave(imagename, p, width = 12, height = 9, dpi = 300)
#print(p)
运行的代码。
python
data_excel<- "change_analysis.xlsx"
dep_var <- "WZ00_23"
formulaTXT <- "~ tem00_23 + wetland00_23 + csh00_23 + TGDP00_23 + Tpop00_23 + WZL00_23 + WZC00_23 + WZM00_23 + WZY00_23 + WZS00_23 + WZW00_23"
rfCalulate2(data_excel,dep_var,formulaTXT,imagename="image/变化-WZ00_23.png")
目前这个存在的问题
1.目前这个模型还存在一个问题,就是这个模型有些因子加进去可能是增加其均方差的,删除其后其的排序又会有所变化。
2.另外,就是当调整变量顺序的时候,其影响因子的排序和效果也会发生改变。
参考文献