目录
之前发过一篇用 biomaRt 进行在线转换的帖子:https://blog.csdn.net/XMQ_MOLLY/article/details/144807207?spm=1001.2014.3001.5502
但发现在线转换相当不稳定。为了方便,我直接下载了常用的人鼠官方同源基因对照数据库(MGI),下载地址为 http://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt 。亲测开个简单的VPN,在浏览器中打开后另存即可,文件大小约 15 MB。
导入待转换基因列表
这里以小鼠转换人类同源基因为例,首先导入需要转换的小鼠基因列表
R
# Export the feature (gene) names of the Seurat object `datamu` to a CSV
# backup, then read them back as a clean character vector `mouse_genes`.

# `output_path` is expected to be set by the user beforehand; fall back to a
# file in the working directory so the snippet can run on its own.
if (!exists("output_path")) {
  output_path <- "mouse_gene_list.csv"
}

# Collect the gene names (row names of the Seurat object `datamu`).
gene_list <- data.frame(
  Mouse_Gene_Name = rownames(datamu),
  stringsAsFactors = FALSE
)

# Save a CSV backup.
write.csv(
  x = gene_list,
  file = output_path,
  row.names = FALSE, # do not write row numbers (redundant)
  quote = FALSE      # no quoting, easier for downstream tools to read
)

# Read the mouse gene list back in. write.csv() always writes a header line,
# so it must be read with header = TRUE; with header = FALSE the column title
# "Mouse_Gene_Name" would be treated as a gene symbol.
mouse_genes_df <- read.csv(
  output_path,
  header = TRUE,
  stringsAsFactors = FALSE
)
mouse_genes <- as.character(mouse_genes_df$Mouse_Gene_Name)

# Drop missing and empty entries.
mouse_genes <- mouse_genes[!is.na(mouse_genes) & mouse_genes != ""]
读取离线数据库RPT
接着读取最开始下载的离线数据库(注意:另存时文件名有时会被自动加上 .txt 后缀,读取时的文件名需与实际保存的文件名一致)
数据库所有表头如下

R
# Load the offline homology report (saved here as HOM_MouseHumanSequence.rpt).
# The MGI report is tab-delimited (sep = "\t"), not comma-separated.
homology_table <- read.delim(
  "HOM_MouseHumanSequence.rpt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)
colnames(homology_table)

# Lower-case the organism column once so that both species filters below are
# case-insensitive (Mouse / MOUSE / mouse).
organism_lower <- tolower(homology_table$Common.Organism.Name)

# Mouse rows: keep the homology class key (used to pair with human genes) and
# the gene symbol, then rename the columns for readability.
is_mouse_row <- grepl("mouse", organism_lower)
mouse_genes_part <- homology_table[is_mouse_row, c("DB.Class.Key", "Symbol")]
names(mouse_genes_part) <- c("Homology_ID", "Mouse_Symbol")

# Human rows: same two columns, renamed for the human side.
is_human_row <- grepl("human", organism_lower)
human_genes_part <- homology_table[is_human_row, c("DB.Class.Key", "Symbol")]
names(human_genes_part) <- c("Homology_ID", "Human_Symbol")
分别提取出人和小鼠的基因名列(Symbol)以及两者共有的同源ID列("DB.Class.Key")后,按同源ID将两张表合并
R
# Join the mouse and human tables on the shared homology class ID.
# all.x = TRUE keeps every mouse gene; those without a human partner get NA.
mouse_human_merge <- merge(
  x = mouse_genes_part,
  y = human_genes_part,
  by = "Homology_ID",
  all.x = TRUE
)

# Reduce to a two-column symbol mapping and drop repeated pairs.
symbol_pairs <- mouse_human_merge[, c("Mouse_Symbol", "Human_Symbol")]
homology_simple <- symbol_pairs[!duplicated(symbol_pairs), , drop = FALSE]

# Quick sanity check of the mapping table.
cat("\n 小鼠→人类基因映射表前10行:\n")
head(homology_simple, n = 10)
cat("\n映射表总行数:", nrow(homology_simple), "\n")
批量匹配同源基因
接下来转换我自己的数据,即前面已保存到变量 mouse_genes 中的小鼠基因列表
R
# Conversion table for the user's own mouse gene list (`mouse_genes`).
# Human_Gene_Upper starts as NA and is filled in below.
gene_conversion <- data.frame(
  Mouse_Gene = mouse_genes,
  Human_Gene_Upper = NA,
  stringsAsFactors = FALSE
)

# Row of the mapping table for each input gene (NA when the gene is absent).
# `match_idx` is kept under this name because the statistics step reuses it.
match_idx <- match(gene_conversion$Mouse_Gene, homology_simple$Mouse_Symbol)

# Upper-cased human symbol for every gene that has one.
human_upper <- toupper(homology_simple$Human_Symbol[match_idx])

# Genes without a human ortholog keep their own mouse name, upper-cased.
lacks_ortholog <- is.na(human_upper)
human_upper[lacks_ortholog] <- toupper(gene_conversion$Mouse_Gene[lacks_ortholog])

gene_conversion$Human_Gene_Upper <- human_upper
验证导出
最后验证常见的人鼠同源基因,并导出同源对照表
R
# Spot-check a few well-known genes (e.g. Calb2 / Gja1) in the result.
core_genes <- c("Calb2", "Gja1", "Cux2", "0610007C21Rik")
print(subset(gene_conversion, Mouse_Gene %in% core_genes))

# Export the final conversion table as CSV so it can be reused directly.
write.csv(
  gene_conversion,
  "mouse_to_human_gene_conversion_final.csv",
  row.names = FALSE,
  quote = FALSE
)
cat("\n转换表已导出:mouse_to_human_gene_conversion_final.csv\n")

# Conversion statistics. `match_idx` comes from the matching step; a gene
# counts as unmatched when the looked-up human symbol is NA.
# The factor with fixed levels guarantees that both the "FALSE" and "TRUE"
# cells exist, so a count of zero is printed as 0 rather than NA when every
# gene matched (or none did).
has_no_ortholog <- is.na(homology_simple$Human_Symbol[match_idx])
conversion_stats <- table(factor(has_no_ortholog, levels = c(FALSE, TRUE)))
cat("\n转换统计:\n")
cat("成功匹配人类同源基因:", conversion_stats["FALSE"], "个\n")
cat("无同源基因(保留原小鼠名):", conversion_stats["TRUE"], "个\n")
替换Seurat对象
最后可以将转换后的同源基因名替换回 Seurat 对象(rds),以便后续分析或可视化
R
# Rebuild the Seurat object `datamu` with human gene names as feature names.
# NOTE(review): the direct slot access below (@assays$RNA@counts / @data /
# @scale.data) assumes a Seurat v3/v4-style Assay -- confirm for other versions.

# Load the conversion table written by the export step.
gene_conversion <- read.csv(
  "mouse_to_human_gene_conversion_final.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Original feature names of the Seurat object.
original_genes <- rownames(datamu)

# Keep only conversion rows for genes present in the object, without repeats.
gene_conversion_filtered <- gene_conversion[gene_conversion$Mouse_Gene %in% original_genes, ]
gene_conversion_filtered <- unique(gene_conversion_filtered)

# Named lookup vector: names are mouse symbols, values are human symbols.
gene_map <- setNames(
  object = gene_conversion_filtered$Human_Gene_Upper,
  nm = gene_conversion_filtered$Mouse_Gene
)

# Start from the original names and replace only where a usable mapping exists
# (a missing or empty mapped value keeps the original name).
new_gene_names <- original_genes
match_pos <- match(original_genes, names(gene_map))
mapped_names <- unname(gene_map[match_pos])
replace_pos <- which(!is.na(mapped_names) & nzchar(mapped_names))
new_gene_names[replace_pos] <- mapped_names[replace_pos]

# Several mouse genes can end up with the same human name. Feature names must
# be unique, so disambiguate explicitly (suffixes .1, .2, ...) and report it.
n_duplicated <- sum(duplicated(new_gene_names))
if (n_duplicated > 0) {
  message(n_duplicated, " duplicated gene name(s) were made unique with make.unique()")
  new_gene_names <- make.unique(new_gene_names)
}

# A slot is usable only if it is a (sparse) matrix that actually has rows;
# an empty 0 x 0 placeholder matrix must not be selected.
has_values <- function(m) {
  !is.null(m) && (is.matrix(m) || inherits(m, "dgCMatrix")) && nrow(m) > 0
}

# Pick the expression matrix: counts first, otherwise data.
if (has_values(datamu@assays$RNA@counts)) {
  expr_matrix <- datamu@assays$RNA@counts
  cat(" Using counts matrix for reconstruction\n")
} else if (has_values(datamu@assays$RNA@data)) {
  expr_matrix <- datamu@assays$RNA@data
  cat(" Using data matrix for reconstruction\n")
} else {
  stop(" No valid expression matrix found in Seurat object!")
}

# Rename the rows. A dgCMatrix accepts new row names directly, so the matrix
# is never expanded to dense form; only a base matrix is converted to sparse.
expr_matrix_new <- expr_matrix
if (!inherits(expr_matrix_new, "dgCMatrix")) {
  expr_matrix_new <- as(expr_matrix_new, "dgCMatrix")
}
rownames(expr_matrix_new) <- new_gene_names

# Rebuild the Seurat object from the renamed matrix, keeping the original
# cell metadata and project name.
datamu_new <- CreateSeuratObject(
  counts = expr_matrix_new,
  meta.data = datamu@meta.data,
  project = datamu@project.name
)

# Carry over scale.data when present. It may contain only a subset of the
# features, so its rows are renamed by looking up each original row name
# rather than by assigning the full name vector. The names are taken from the
# rebuilt object so they include any adjustment made during object creation.
scale_data_old <- datamu@assays$RNA@scale.data
if (ncol(scale_data_old) > 0) {
  final_names <- rownames(datamu_new)
  scale_row_pos <- match(rownames(scale_data_old), original_genes)
  if (length(final_names) == length(original_genes) && !anyNA(scale_row_pos)) {
    scale_data_new <- as.matrix(scale_data_old)
    rownames(scale_data_new) <- final_names[scale_row_pos]
    # The scale.data slot holds a base matrix, so it is assigned as such.
    datamu_new@assays$RNA@scale.data <- scale_data_new
    cat(" Scale data retained and renamed\n")
  } else {
    warning(
      "scale.data could not be aligned with the renamed features; skipped",
      call. = FALSE
    )
  }
}
最后验证新的 Seurat 对象,并保存为新的 rds 文件
R
# Show the first ten feature names of the rebuilt object as a sanity check.
cat("\n First 10 rownames of new Seurat object:\n")
preview_names <- head(rownames(datamu_new), n = 10)
print(preview_names)

# Print the object summary.
cat("\n New Seurat object info:\n")
print(datamu_new)

# Write the rebuilt object to disk and report where it went.
output_rds <- "mouse_with_human_genes_final.rds"
saveRDS(object = datamu_new, file = output_rds)
cat("\n Replacement completed! Final object saved to:\n", output_rds, "\n")