【单细胞-第三节 多样本数据分析】

文件在单细胞\5_GC_py\1_single_cell\1.GSE183904.Rmd
GSE183904
数据原文

1.获取临床信息

筛选样本可以参考临床信息

c 复制代码
rm(list = ls())
library(tinyarray)
a = geo_download("GSE183904")$pd
head(a)
table(a$Characteristics_ch1) #统计各样本有多少

2.批量读取

学会如何读取特定的样本

c 复制代码
if(!file.exists("f.Rdata")){
  #untar("GSE183904_RAW.tar",exdir = "GSE183904_RAW")
  fs = dir("GSE183904_RAW/")[c(2,7)] #dir("GSE183904_RAW/"),列出所有文件
  #为了省点内存只做2个样本,去掉[c(2,7)]即做全部样本
  f = lapply(paste0("GSE183904_RAW/",fs),read.csv,row.names = 1)
  #row.names = 1写在lapply的括号里,但是它是read.csv的参数
  fs = stringr::str_split_i(fs,"_",1)
  names(f) = fs
  save(f,file = "f.Rdata")
}
load("f.Rdata")
library(Seurat)
scelist = list()
for(i in 1:length(f)){
  scelist[[i]] <- CreateSeuratObject(counts = f[[i]], 
                                     project = names(f)[[i]])
  print(dim(scelist[[i]]))
}
sce.all = merge(scelist[[1]],scelist[-1])
sce.all = JoinLayers(sce.all)  #连接数据

head(sce.all@meta.data)
table(sce.all$orig.ident)

3.质控指标

c 复制代码
sce.all[["percent.mt"]] <- PercentageFeatureSet(sce.all, pattern = "^MT-")
sce.all[["percent.rp"]] <- PercentageFeatureSet(sce.all, pattern = "^RP[SL]")
sce.all[["percent.hb"]] <- PercentageFeatureSet(sce.all, pattern = "^HB[^(P)]")

head(sce.all@meta.data, 3)

VlnPlot(sce.all, 
        features = c("nFeature_RNA",
                     "nCount_RNA", 
                     "percent.mt",
                     "percent.rp",
                     "percent.hb"),
        ncol = 3,pt.size = 0, group.by = "orig.ident")

4.整合降维聚类分群

c 复制代码
f = "obj.Rdata"
library(harmony)
if(!file.exists(f)){
  sce.all = sce.all %>% 
    NormalizeData() %>%  
    FindVariableFeatures() %>%  
    ScaleData(features = rownames(.)) %>%  
    RunPCA(pc.genes = VariableFeatures(.))  %>%
    RunHarmony("orig.ident") %>% #RunHarmony 包,整合多个样本,处理多样本的必备步骤
    FindNeighbors(dims = 1:15, reduction = "harmony") %>% 
    FindClusters(resolution = 0.5) %>% 
    RunUMAP(dims = 1:15,reduction = "harmony") %>% 
    #reduction = "harmony"必须写上
    RunTSNE(dims = 1:15,reduction = "harmony")
  save(sce.all,file = f)
}
load(f)
ElbowPlot(sce.all)
UMAPPlot(sce.all,label = T)
TSNEPlot(sce.all,label = T)

5.手动注释

c 复制代码
markers = read.delim("GCmarker.txt",header = F,sep = ";")
library(tidyr)
markers = separate_rows(markers,V2,sep = ",") #拆分marker
markers = split(markers$V2,markers$V1)
DotPlot(sce.all,features = markers,cols = "RdYlBu")+
  RotatedAxis()
ggplot2::ggsave("dotplot.png",height = 10,width = 25)
writeLines(paste0(as.character(0:13),","))
names(markers)

celltype = read.csv("celltype.csv",header = F) #自己照着DotPlot图填的
celltype


new.cluster.ids <- celltype$V2
names(new.cluster.ids) <- levels(sce.all)
seu.obj <- RenameIdents(sce.all, new.cluster.ids)
save(seu.obj,file = "seu.obj.Rdata")
p1 <- DimPlot(seu.obj, 
              reduction = "tsne", 
              label = TRUE, 
              pt.size = 0.5) + NoLegend()
p1

6.自动注释

SingleR完成自主注释,不同的是scRNA = sce.all

c 复制代码
library(celldex)
library(SingleR)
ls("package:celldex")
f = "ref_BlueprintEncode.RData"
if(!file.exists(f)){
  ref <- celldex::BlueprintEncodeData()
  save(ref,file = f)
}
ref <- get(load(f))
library(BiocParallel)
scRNA = sce.all
test = scRNA@assays$RNA@layers$data
rownames(test) = Features(scRNA)
colnames(test) = Cells(scRNA)
pred.scRNA <- SingleR(test = test, 
                      ref = ref,
                      labels = ref$label.main, 
                      clusters = scRNA@active.ident)
pred.scRNA$pruned.labels
#查看注释准确性 
plotScoreHeatmap(pred.scRNA, clusters=pred.scRNA@rownames, fontsize.row = 9,show_colnames = T)
new.cluster.ids <- pred.scRNA$pruned.labels
names(new.cluster.ids) <- levels(scRNA)
levels(scRNA)
scRNA <- RenameIdents(scRNA,new.cluster.ids)
levels(scRNA)
p2 <- DimPlot(scRNA, reduction = "tsne",label = T,pt.size = 0.5) + NoLegend()
p1+p2

7.marker基因

找不同细胞类型间的差异基因

c 复制代码
f = "markers.Rdata"
if(!file.exists(f)){
  allmarkers <- FindAllMarkers(seu.obj, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
  save(allmarkers,file = f)
}
load(f)
head(allmarkers)

如果想自行修改orig.ident:使用下边的代码:

c 复制代码
sce.all@meta.data$orig.ident=rep(c("a","b"),times= c(ncol(scelist[[1]]),
ncol(scelistl[[2]])))
相关推荐
CoovallyAIHub1 小时前
首届AI交易大赛对决!中国模型包揽冠亚军,GPT-5亏损62%垫底
人工智能·google·数据分析
学掌门1 小时前
用Python做数据分析之数据表清洗
数据挖掘·数据分析
开发者工具分享4 小时前
用户调研样本不具代表性时怎么办
人工智能·数据挖掘
搞科研的小刘选手5 小时前
【多所高校合作】第四届图像处理、计算机视觉与机器学习国际学术会议(ICICML 2025)
图像处理·人工智能·机器学习·计算机视觉·数据挖掘·人脸识别·人机交互
人大博士的交易之路5 小时前
今日行情明日机会——20251104
大数据·数据挖掘·数据分析·缠论·涨停回马枪·道琼斯结构
蒋星熠7 小时前
多模态技术深度探索:融合视觉与语言的AI新范式
人工智能·python·深度学习·机器学习·分类·数据挖掘·多分类
甄心爱学习7 小时前
数据挖掘6-AI总结
人工智能·数据挖掘
api_1800790546010 小时前
请求、认证与响应数据解析:1688 商品 API 接口深度探秘
java·大数据·开发语言·mysql·数据挖掘
hweiyu0019 小时前
Hive 技术深度解析与 P7 数据分析架构师多行业全场景实战课程合集(视频教程)
hive·数据分析
sensen_kiss1 天前
INT303 Big Data Analysis 大数据分析 Pt.3 数据挖掘(Data Mining)
大数据·数据挖掘·数据分析