组内有生物学重复的差异表达分析+火山图绘制
原始表达矩阵,即count计数表格见附件资源。
脚本如下:
R
library(DESeq2)
library(ggplot2)
library(openxlsx)
# Read the gene expression (raw count) matrix: tab-separated, with a header
# row, and the first column used as row names.
dat <- read.table(
  "counts.txt",
  header = TRUE,
  sep = "\t",
  row.names = 1,
  stringsAsFactors = FALSE
)
head(dat)

# Sample annotation: 4 "control" samples followed by 4 "treat" samples, with
# "control" as the first factor level.
# IMPORTANT: the sample columns of the expression matrix must appear in
# exactly this order, one-to-one with the rows built here.
coldata <- data.frame(
  condition = gl(2, 4, labels = c("control", "treat"))
)

# Step 1: build the DESeqDataSet object, modelling counts by condition.
dds <- DESeqDataSetFromMatrix(
  countData = dat,
  colData = coldata,
  design = ~condition
)

# Step 2: compute fold changes and p-values.
# Tip: parallel = TRUE runs multi-threaded; consider enabling it for large
# data sets.
dds1 <- DESeq(dds, parallel = FALSE)

# Put "treat" first and "control" second so that the result reads as
# "which genes are up-/down-regulated in treat relative to control".
res <- results(dds1, contrast = c("condition", "treat", "control"))
# Write the full result table to disk ----
# Turn the results object into a plain data frame and move the gene IDs
# (held as row names) into an explicit first column named GeneID.
res1 <- data.frame(res, stringsAsFactors = FALSE, check.names = FALSE)
res1$GeneID <- rownames(res1)
rownames(res1) <- NULL
res1 <- res1[, c("GeneID", setdiff(names(res1), "GeneID"))]
# Drop every row that contains an NA in any column.
res1 <- na.omit(res1)
# showWarnings = FALSE: re-running the script should not warn just because
# the output directory already exists.
dir.create("04_Differential_Expression_Analysis/0401_DEG_raw", recursive = TRUE, showWarnings = FALSE)
# NOTE(review): row names were reset above, so the unnamed leading column
# produced by col.names = NA is only a numeric row index, not the gene ID.
# The format is kept as-is for compatibility; confirm whether it is wanted.
write.table(res1, "04_Differential_Expression_Analysis/0401_DEG_raw/control_vs_treat.DESeq2.txt", col.names = NA, sep = '\t', quote = FALSE)
# GeneID is already a regular column, so no row-name option is passed here.
write.xlsx(res1, "04_Differential_Expression_Analysis/0401_DEG_raw/control_vs_treat.DESeq2.xlsx")
# Select differentially expressed genes (DEGs) ----
# showWarnings = FALSE on every dir.create(): re-running the script should
# not warn just because the directory already exists.
dir.create("04_Differential_Expression_Analysis/0402_DEG", recursive = TRUE, showWarnings = FALSE)
# Sort by padj ascending; ties in padj are broken by log2FoldChange
# descending. A per-key `decreasing` vector is only supported by the radix
# method, so it is requested explicitly.
res1 <- res1[order(res1$padj, res1$log2FoldChange, decreasing = c(FALSE, TRUE), method = "radix"), ]
# Classification:
#   log2FC >=  1 & padj < 0.05 -> 'up'   (significantly up-regulated)
#   log2FC <= -1 & padj < 0.05 -> 'down' (significantly down-regulated)
#   everything else            -> 'none' (not differentially expressed)
# Start from 'none' and only then mark up/down, so that genes sitting exactly
# on |log2FC| == 1 keep their up/down label instead of being reset to 'none'.
res1$sig <- rep('none', nrow(res1))
res1[which(res1$log2FoldChange >= 1 & res1$padj < 0.05), 'sig'] <- 'up'
res1[which(res1$log2FoldChange <= -1 & res1$padj < 0.05), 'sig'] <- 'down'
# Write the annotated full table and the table of all selected DEGs.
# NOTE(review): as res1 carries no meaningful row names, the unnamed leading
# column written by col.names = NA is only a row index; format kept as-is.
dir.create("04_Differential_Expression_Analysis/0402_DEG/all/Gene", recursive = TRUE, showWarnings = FALSE)
res1_select <- subset(res1, sig %in% c("up", "down"))
write.table(res1, "04_Differential_Expression_Analysis/0402_DEG//all/Gene/control_vs_treat.all.txt", sep = '\t', col.names = NA, quote = FALSE)
write.xlsx(res1, "04_Differential_Expression_Analysis/0402_DEG//all/Gene/control_vs_treat.all.xlsx")
write.table(res1_select, "04_Differential_Expression_Analysis/0402_DEG//all/Gene/control_vs_treat.all_sig.txt", sep = '\t', col.names = NA, quote = FALSE)
write.xlsx(res1_select, "04_Differential_Expression_Analysis/0402_DEG//all/Gene/control_vs_treat.all_sig.xlsx")
# Write the up- and down-regulated genes separately.
res1_up <- subset(res1, sig == 'up')
dir.create("04_Differential_Expression_Analysis/0402_DEG/up/Gene", recursive = TRUE, showWarnings = FALSE)
res1_down <- subset(res1, sig == 'down')
dir.create("04_Differential_Expression_Analysis/0402_DEG/down/Gene", recursive = TRUE, showWarnings = FALSE)
write.table(res1_up, "04_Differential_Expression_Analysis/0402_DEG/up/Gene/control_vs_treat.up.txt", sep = '\t', col.names = NA, quote = FALSE)
write.xlsx(res1_up, "04_Differential_Expression_Analysis/0402_DEG/up/Gene/control_vs_treat.up.xlsx")
write.table(res1_down, "04_Differential_Expression_Analysis/0402_DEG/down/Gene/control_vs_treat.down.txt", sep = '\t', col.names = NA, quote = FALSE)
write.xlsx(res1_down, "04_Differential_Expression_Analysis/0402_DEG/down/Gene/control_vs_treat.down.xlsx")
# Volcano plot with ggplot2 ----
# showWarnings = FALSE: re-running the script should not warn just because
# the output directory already exists.
dir.create("04_Differential_Expression_Analysis/0402_DEG/all/Gene/Volcano_Plot", recursive = TRUE, showWarnings = FALSE)

# Layers shared by the PDF and PNG versions of the plot.
volcano_layers <- list(
  # Scatter points, one per gene.
  geom_point(size = 1),
  # Custom point colours per significance class.
  scale_color_manual(values = c('red', 'gray', 'green'), limits = c('up', 'none', 'down')),
  # Axis titles, plot title, and an empty legend title.
  labs(x = 'log2 Fold Change', y = '-log10 adjust p-value', title = 'control_vs_treat', color = ''),
  # Theme tweaks: centred title, no grid lines, white panel with black border.
  theme(plot.title = element_text(hjust = 0.5, size = 14), panel.grid = element_blank(),
        panel.background = element_rect(color = 'black', fill = 'white'),
        legend.key = element_blank()),
  # Threshold lines. They mirror the cut-offs used to label genes as up/down:
  # |log2FC| = 1 on the x axis and padj = 0.05 on the y axis.
  geom_vline(xintercept = c(-1, 1), lty = 3, color = 'black'),
  geom_hline(yintercept = -log10(0.05), lty = 3, color = 'black')
  # Optionally append xlim(-12, 12) and ylim(0, 35) to fix the axis limits.
)

# PDF version: x axis shows log2FoldChange, y axis shows -log10(padj).
# theme_minimal() goes before the shared layers so their theme() overrides
# are applied on top of it.
p <- ggplot(data = res1, aes(x = log2FoldChange, y = -log10(padj), color = sig)) +
  theme_minimal() +
  volcano_layers
pdf("04_Differential_Expression_Analysis/0402_DEG/all/Gene/Volcano_Plot/Volcano_Plot.pdf", width = 8, height = 6)
print(p)
dev.off()

# PNG version: same layers on ggplot2's default theme (no theme_minimal()).
# ggsave() opens and closes its own graphics device, so no dev.off() follows;
# an extra dev.off() here would have no device of its own left to close.
p <- ggplot(data = res1, aes(x = log2FoldChange, y = -log10(padj), color = sig)) +
  volcano_layers
ggsave("04_Differential_Expression_Analysis/0402_DEG/all/Gene/Volcano_Plot/Volcano_Plot.png", plot = p, width = 8, height = 6, dpi = 720)
输出结果包含去除 NA 值后的全部基因差异分析结果矩阵,以及按显著性(padj)和差异倍数(log2FC)阈值筛选得到的差异表达基因矩阵,示例结果矩阵如下:

以及示例火山图如下:

至于比较方向是“对照 vs 实验”还是“实验 vs 对照”,在代码中调整 results() 的 contrast 参数即可;火山图的样式可以在“R-带指定基因标签的火山图_火山图r语言代码-CSDN博客”这篇文章的基础上进行修改调整。
很久之前写的脚本,仅供参考 !!!
后续可能会推出进一步升级的可执行脚本:只需提供关键文件和阈值,即可自动输出带基因 symbol、GO、KEGG 等注释信息的矩阵结果,以及高阶版本的火山图。