原理是依据生存数据来区分基因高低表达的阈值,用来判别基因高表达组和低表达组挺好用的,还有个数据库专门做这个PrognoScan: A new database for meta-analysis of the prognostic value of genes.
# ZNF503基因表达与生存分析完整流程
# 本代码用于分析ZNF503基因表达水平与患者生存率之间的关系
# ----------------------------
# 1. 加载所需的R包
# ----------------------------
# 如果尚未安装这些包,请先运行安装命令
# install.packages(c("jsonlite", "tidyr", "dplyr", "survival", "survminer", "ggplot2"))
# 用于处理JSON格式数据
library(jsonlite)
# 用于数据处理和清洗
library(tidyr)
library(dplyr)
# 用于生存分析
library(survival)
# 用于生存分析可视化
library(survminer)
# 用于数据可视化
library(ggplot2)
# ----------------------------
# 2. 读取和准备数据
# ----------------------------
# 读取基因表达数据(FPKM矩阵)
# check.names = F 确保列名不会被自动修改
exprSet <- read.table("G:/冷热肿瘤/data/数据清洗/fpkm_Matrix.txt", check.names = F)
# 读取临床数据(JSON格式)
clinical_json <- jsonlite::fromJSON('G:/冷热肿瘤/data/clinical.cart.2025-01-03.json')
# 提取人口统计学信息并转换为数据框
demographic_data <- as.data.frame(clinical_json$demographic)
# 选择需要的临床特征列
# 3:性别, 5:死亡状态, 7:出生日期(天数), 9:随访时间
demographic_data <- demographic_data[, c(3, 5, 7, 9)]
# 计算年龄(取绝对值,因为天数通常以负数表示)
demographic_data$days_to_birth <- abs(demographic_data$days_to_birth)
# 重命名列名,使其更直观
colnames(demographic_data)[1:4] <- c("sex", "fustat", "age", "futime")
# 提取临床数据中的样本ID信息
clinical_samples <- clinical_json[, 4]
# 将样本ID与人口统计学数据合并
clinical <- cbind(clinical_samples, demographic_data)
# 重命名第一列为"id",便于后续合并
colnames(clinical)[1] <- "id"
# ----------------------------
# 3. 处理样本ID,区分肿瘤和正常样本
# ----------------------------
# 从基因表达数据的列名(样本ID)中提取信息
# TCGA的样本ID格式通常为:TCGA-XX-XXXX-XXX-XXX,我们需要拆分它
condition_table <- tidyr::separate(
data = data.frame('ids' = colnames(exprSet)),
col = 'ids',
sep = '-',
into = c('c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7')
)
# 将原始样本ID与拆分后的信息合并
condition_table <- cbind(
'ids' = colnames(exprSet),
condition_table,
# 提取前12个字符作为submitter_ID(用于匹配临床数据)
'submitter_IDs' = substr(colnames(exprSet), 1, 12)
)
# 根据TCGA ID规则,c4字段的前三位表示样本类型
# 0开头的通常是肿瘤样本,1开头的通常是正常样本
condition_table$c4 <- gsub(condition_table$c4, pattern = '^0..', replacement = 'cancer')
condition_table$c4 <- gsub(condition_table$c4, pattern = '^1..', replacement = 'normal')
# 选择需要的列并重新命名
condition_table <- condition_table[, c('ids', 'c4', 'submitter_IDs')]
names(condition_table) <- c('sample', 'group', 'submitter_id')
# 确保ID列是字符型,避免后续匹配出错
condition_table$submitter_id <- as.character(condition_table$submitter_id)
condition_table$sample <- as.character(condition_table$sample)
# 保存处理好的样本信息,方便以后复用
save(condition_table, file = 'condition_table.rdata')
# ----------------------------
# 4. 合并临床数据和基因表达数据
# ----------------------------
# 重命名列名,使两个数据框有相同的键用于合并
colnames(condition_table)[3] <- "id"
# 通过id列合并样本信息和临床数据
clinical <- condition_table %>% left_join(clinical, by = "id")
# 保存合并后的临床数据
write.table(
clinical,
file = "G:/冷热肿瘤/data/数据清洗/clinical.txt",
sep = "\t",
row.names = TRUE,
col.names = TRUE,
quote = FALSE
)
# 重新读取保存的临床数据,确保格式正确
clinical <- read.table(
"G:/冷热肿瘤/data/数据清洗/clinical.txt",
sep = "\t",
row.names = TRUE,
header = TRUE
)
# 确保样本名是字符型,便于后续匹配
clinical$sample <- as.character(clinical$sample)
colnames(exprSet) <- as.character(colnames(exprSet))
# ----------------------------
# 5. 提取ZNF503基因的表达值
# ----------------------------
# 创建新列存储ZNF503的表达值,先初始化为NA
clinical$ZNF503expression <- NA
# 循环匹配样本,将对应的ZNF503表达值填入临床数据
for (i in 1:nrow(clinical)) {
# 获取当前样本名
sample_name <- clinical$sample[i]
# 检查样本是否在表达数据中存在
if (sample_name %in% colnames(exprSet)) {
# 将表达数据中ZNF503基因对应样本的值赋给临床数据
clinical$ZNF503expression[i] <- exprSet["ZNF503", sample_name]
}
}
# 查看前几行数据,确认ZNF503表达值已正确添加
head(clinical)
# ----------------------------
# 6. 准备生存分析数据
# ----------------------------
# 处理生存状态:Alive(存活)设为0,Dead(死亡)设为1
clinical$fustat <- ifelse(clinical$fustat == "Alive", 0, 1)
# 查看处理后的生存状态
head(clinical$fustat)
# 去除含有缺失值的行(生存分析不能有缺失值)
clinical <- clinical[complete.cases(clinical[, c("futime", "fustat", "ZNF503expression")]), ]
# 按ZNF503表达值从小到大排序
cli <- clinical[order(clinical$ZNF503expression), ]
rownames(cli) <- cli$sample
# ----------------------------
# 7. 寻找最佳的高低表达分组阈值
# ----------------------------
# 初始化存储P值和HR值的向量
pvals <- c()
hrs <- c()
# 尝试不同的分组点,从第1个样本到倒数第1个样本
for(n in 1:(nrow(cli)-1)) {
# 将样本分为两组:前n个为低表达,其余为高表达
ssdf <- cbind(
cli,
data.frame(type = rep(c("ZNF503Low", "ZNF503High"), c(n, nrow(cli)-n)))
)
# 将分组设为因子型变量
ssdf$type <- factor(ssdf$type, levels = c("ZNF503Low", "ZNF503High"))
# 进行生存差异分析
fitd <- survdiff(
Surv(futime, fustat) ~ type, # 生存时间和状态 ~ 分组
data = ssdf,
na.action = na.exclude # 排除缺失值
)
# 计算P值
pValue <- 1 - pchisq(fitd$chisq, length(fitd$n) - 1)
pvals <- c(pvals, pValue)
# 计算风险比(HR)
hr <- (fitd$obs[1] / fitd$exp[1]) / (fitd$obs[2] / fitd$exp[2])
hrs <- c(hrs, hr)
}
# 将结果整理成数据框
dat <- data.frame(
Num = 1:(nrow(cli)-1), # 分组点位置
HR = hrs, # 风险比
Pvalue = pvals # P值
)
# 查看结果的前几行
head(dat)
# ----------------------------
# 8. 可视化不同分组点的结果
# ----------------------------
# 绘制HR和P值的关系图
ggplot(dat, aes(x = log2(HR), y = -log10(Pvalue))) +
geom_point(shape = 21, aes(fill = Num), size = 3) + # 点的大小和形状
scale_fill_continuous(name = "分组点位置") + # 颜色图例
geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "gray50") + # 参考线
geom_hline(yintercept = (-log10(0.05)), linetype = "dashed", color = "gray50") + # 显著性阈值线
labs(
x = "log2(HR)",
y = "-log10(P值)",
title = "不同分组点的生存分析结果"
) +
theme_bw() + # 白色背景主题
theme(plot.title = element_text(hjust = 0.5)) # 标题居中
# ----------------------------
# 9. 确定最佳分组并进行最终生存分析
# ----------------------------
# 找到P值最小的分组点(最佳分组)
bestNum <- dat$Num[which.min(pvals)]
cat("最佳分组点位置:", bestNum, "\n")
# 根据最佳分组点创建分组
ssdf2 <- cbind(
cli,
data.frame(type = rep(c("ZNF503Low", "ZNF503High"), c(bestNum, nrow(cli)-bestNum)))
)
# 设置分组因子水平
ssdf2$type <- factor(ssdf2$type, levels = c("ZNF503Low", "ZNF503High"))
# 查看两组的样本数量
cat("分组样本数量:\n")
print(table(ssdf2$type))
# 进行生存差异分析
fitd2 <- survdiff(
Surv(futime, fustat) ~ type,
data = ssdf2,
na.action = na.exclude
)
print(fitd2)
# 计算最终P值
pValue2 <- 1 - pchisq(fitd2$chisq, length(fitd2$n) - 1)
cat("生存差异P值:", pValue2, "\n")
# 格式化P值标签
p.lab2 <- paste0("P", ifelse(pValue2 < 0.001, " < 0.001", paste0(" = ", round(pValue2, 3))))
# 拟合生存曲线模型
fit2 <- survfit(Surv(futime, fustat) ~ type, data = ssdf2)
# 保存分组结果
write.csv(ssdf2, file = "G:/ZNF503/RNA-seq/生存分析/group.csv", row.names = FALSE)
# ----------------------------
# 10. 计算HR值和95%置信区间
# ----------------------------
# 使用Cox比例风险模型计算HR
cox.obj <- coxph(Surv(futime, fustat) ~ type, data = ssdf2, na.action = na.exclude)
cox.sum <- summary(cox.obj)
# 提取HR值和95%置信区间
HR <- round(exp(cox.sum$coef[1]), 2)
HR.ci <- round(exp(confint(cox.obj))[1, ], 2)
hr.lab <- paste0("HR = ", HR, " (95% CI: ", HR.ci[1], "-", HR.ci[2], ")")
cat("风险比(HR):", HR, ",95%置信区间:", HR.ci[1], "-", HR.ci[2], "\n")
# ----------------------------
# 11. 绘制最终的生存曲线
# ----------------------------
# 创建生存曲线
g <- ggsurvplot(
fit2, # 生存分析结果
data = ssdf2, # 数据
conf.int = TRUE, # 显示置信区间
pval = p.lab2, # 显示P值
pval.size = 5, # P值字体大小
pval.coord = c(0, 0.2),# P值位置
palette = c("#377EB8", "#E41A1C"), # 曲线颜色
legend.title = "ZNF503表达水平", # 图例标题
legend.labs = c("低表达", "高表达"), # 图例标签
legend = "right", # 图例位置
legend.linetype = FALSE,
surv.median.line = "hv", # 显示中位生存时间线
xlab = "时间 (天)", # X轴标签
ylab = "生存率", # Y轴标签
risk.table = TRUE, # 显示风险人数表
risk.table.col = "strata", # 风险表按分组着色
risk.table.height = 0.25, # 风险表高度
ggtheme = theme_bw(base_size = 14) # 图表主题
)
# 添加HR值到生存曲线图
g$plot <- g$plot +
annotate("text",
x = 0, y = 0.15, # 文本位置
label = hr.lab, # 文本内容
hjust = 0, # 水平对齐方式
size = 4.5) # 文本大小
# 显示生存曲线
print(g)
# 保存生存曲线为PDF文件
ggsave(
filename = "ZNF503生存曲线.pdf",
plot = g,
width = 10,
height = 8,
device = "pdf"
)