【单细胞-第三节 多样本数据分析】

文件在单细胞\5_GC_py\1_single_cell\1.GSE183904.Rmd
GSE183904
数据原文

1.获取临床信息

筛选样本可以参考临床信息

c 复制代码
rm(list = ls())
library(tinyarray)
a = geo_download("GSE183904")$pd
head(a)
table(a$Characteristics_ch1) #统计各样本有多少

2.批量读取

学会如何读取特定的样本

c 复制代码
if(!file.exists("f.Rdata")){
  #untar("GSE183904_RAW.tar",exdir = "GSE183904_RAW")
  fs = dir("GSE183904_RAW/")[c(2,7)] #dir("GSE183904_RAW/"),列出所有文件
  #为了省点内存只做2个样本,去掉[c(2,7)]即做全部样本
  f = lapply(paste0("GSE183904_RAW/",fs),read.csv,row.names = 1)
  #row.names = 1写在lapply的括号里,但是它是read.csv的参数
  fs = stringr::str_split_i(fs,"_",1)
  names(f) = fs
  save(f,file = "f.Rdata")
}
load("f.Rdata")
library(Seurat)
scelist = list()
for(i in 1:length(f)){
  scelist[[i]] <- CreateSeuratObject(counts = f[[i]], 
                                     project = names(f)[[i]])
  print(dim(scelist[[i]]))
}
sce.all = merge(scelist[[1]],scelist[-1])
sce.all = JoinLayers(sce.all)  #连接数据

head(sce.all@meta.data)
table(sce.all$orig.ident)

3.质控指标

c 复制代码
sce.all[["percent.mt"]] <- PercentageFeatureSet(sce.all, pattern = "^MT-")
sce.all[["percent.rp"]] <- PercentageFeatureSet(sce.all, pattern = "^RP[SL]")
sce.all[["percent.hb"]] <- PercentageFeatureSet(sce.all, pattern = "^HB[^(P)]")

head(sce.all@meta.data, 3)

VlnPlot(sce.all, 
        features = c("nFeature_RNA",
                     "nCount_RNA", 
                     "percent.mt",
                     "percent.rp",
                     "percent.hb"),
        ncol = 3,pt.size = 0, group.by = "orig.ident")

4.整合降维聚类分群

c 复制代码
f = "obj.Rdata"
library(harmony)
if(!file.exists(f)){
  sce.all = sce.all %>% 
    NormalizeData() %>%  
    FindVariableFeatures() %>%  
    ScaleData(features = rownames(.)) %>%  
    RunPCA(pc.genes = VariableFeatures(.))  %>%
    RunHarmony("orig.ident") %>% #RunHarmony 包,整合多个样本,处理多样本的必备步骤
    FindNeighbors(dims = 1:15, reduction = "harmony") %>% 
    FindClusters(resolution = 0.5) %>% 
    RunUMAP(dims = 1:15,reduction = "harmony") %>% 
    #reduction = "harmony"必须写上
    RunTSNE(dims = 1:15,reduction = "harmony")
  save(sce.all,file = f)
}
load(f)
ElbowPlot(sce.all)
UMAPPlot(sce.all,label = T)
TSNEPlot(sce.all,label = T)

5.手动注释

c 复制代码
markers = read.delim("GCmarker.txt",header = F,sep = ";")
library(tidyr)
markers = separate_rows(markers,V2,sep = ",") #拆分marker
markers = split(markers$V2,markers$V1)
DotPlot(sce.all,features = markers,cols = "RdYlBu")+
  RotatedAxis()
ggplot2::ggsave("dotplot.png",height = 10,width = 25)
writeLines(paste0(as.character(0:13),","))
names(markers)

celltype = read.csv("celltype.csv",header = F) #自己照着DotPlot图填的
celltype


new.cluster.ids <- celltype$V2
names(new.cluster.ids) <- levels(sce.all)
seu.obj <- RenameIdents(sce.all, new.cluster.ids)
save(seu.obj,file = "seu.obj.Rdata")
p1 <- DimPlot(seu.obj, 
              reduction = "tsne", 
              label = TRUE, 
              pt.size = 0.5) + NoLegend()
p1

6.自动注释

SingleR完成自主注释,不同的是scRNA = sce.all

c 复制代码
library(celldex)
library(SingleR)
ls("package:celldex")
f = "ref_BlueprintEncode.RData"
if(!file.exists(f)){
  ref <- celldex::BlueprintEncodeData()
  save(ref,file = f)
}
ref <- get(load(f))
library(BiocParallel)
scRNA = sce.all
test = scRNA@assays$RNA@layers$data
rownames(test) = Features(scRNA)
colnames(test) = Cells(scRNA)
pred.scRNA <- SingleR(test = test, 
                      ref = ref,
                      labels = ref$label.main, 
                      clusters = scRNA@active.ident)
pred.scRNA$pruned.labels
#查看注释准确性 
plotScoreHeatmap(pred.scRNA, clusters=pred.scRNA@rownames, fontsize.row = 9,show_colnames = T)
new.cluster.ids <- pred.scRNA$pruned.labels
names(new.cluster.ids) <- levels(scRNA)
levels(scRNA)
scRNA <- RenameIdents(scRNA,new.cluster.ids)
levels(scRNA)
p2 <- DimPlot(scRNA, reduction = "tsne",label = T,pt.size = 0.5) + NoLegend()
p1+p2

7.marker基因

找不同细胞类型间的差异基因

c 复制代码
f = "markers.Rdata"
if(!file.exists(f)){
  allmarkers <- FindAllMarkers(seu.obj, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
  save(allmarkers,file = f)
}
load(f)
head(allmarkers)

如果想自行修改orig.ident:使用下边的代码:

c 复制代码
sce.all@meta.data$orig.ident=rep(c("a","b"),times= c(ncol(scelist[[1]]),
ncol(scelistl[[2]])))
相关推荐
饼干哥哥5 天前
开源Skills|搭建亚马逊动态关键词库系统,每天抓SSS级机会词
人工智能·深度学习·数据分析
倔强的石头_6 天前
企业工商数据源站点:无验证无拦截,批量获取工商数据完整方案
数据分析
hboot12 天前
AI工程师第二课 - 数据处理
人工智能·python·数据分析
王小王-12314 天前
基于 Hive 的网易云音乐数据分析及可视化系统
hive·hadoop·数据分析·音乐数据分析·网易云音乐分析·hive音乐分析·hadoop网易云
Database_Cool_14 天前
大规模数据分析降本指南:AnalyticDB Serverless 弹性架构实战
数据仓库·阿里云·架构·数据分析·serverless
YangYang9YangYan14 天前
2026初入职场学习数据分析的价值
学习·数据挖掘·数据分析
有Li14 天前
PTCMIL:基于提示 token 聚类的全切片图像多实例学习分析文献速递/多模态医学影像最新进展
论文阅读·学习·数据挖掘·聚类·文献·医学生
数睿数据无代码开发14 天前
打破数据孤岛:深度解析 smardaten 数据连接器核心功能
数据挖掘·无代码
砚底藏山河14 天前
沪深A股:如何获取基金持股数据
java·python·数据分析·maven
jarreyer14 天前
【数据分析绘图】excel绘图和bi工具区别
数据挖掘·数据分析·excel