我们生信技能树的马拉松授课群里有个学员在分析单细胞,然后这个单细胞数据是个h5ad,但是呢他又不会python,可急坏了:
一开始想挺简单的吧,随手搜了一个帖子:读取h5ad格式的单细胞文件
library(SeuratDisk)
错误: package or namespace load failed for ‘SeuratDisk’ in loadNamespace(j <- i[[1L]], c(lib.loc, .libPaths()), versionCheck = vI[[j]]):
不存在叫‘hdf5r’这个名字的程辑包
## 使用西湖大学的 Bioconductor镜像
options(BioC_mirror="https://mirrors.westlake.edu.cn/bioconductor")
options("repos"=c(CRAN="https://mirrors.westlake.edu.cn/CRAN/"))
# 安装
devtools::install_github("hhoeflin/hdf5r")
# h5ad读取方式
# 自己安装 mojaveazure/seurat-disk 这个GitHub包:
# remotes::install_github("mojaveazure/seurat-disk")
library(SeuratDisk)
library(patchwork)
#~~~~~开始读数据~~~~~
## h5ad是python的Scanpy读取文件格式,需要转换
#~~~~读取adipose~~~~
Convert('./GSE222427/GSM6923183_MC_scRNA.h5ad', "h5seurat", overwrite = TRUE,assay = "RNA")
Warning: Unknown file type: h5ad
Creating h5Seurat file for version 3.1.5.9900
Adding X as data
Adding X as counts
Adding meta.features from var
Adding X_pca as cell embeddings for pca
Adding X_pca_harmony as cell embeddings for pca_harmony
Adding X_umap as cell embeddings for umap
Adding miscellaneous information for pca
Adding standard deviations for pca
Adding miscellaneous information for umap
Adding Major_cluster_colors to miscellaneous data
Adding Phenotype_colors to miscellaneous data
Adding donor_id_colors to miscellaneous data
Adding hvg to miscellaneous data
Adding leiden to miscellaneous data
Adding leiden_colors to miscellaneous data
Adding log1p to miscellaneous data
Adding majority_voting_colors to miscellaneous data
Adding predicted_labels_NC2022_colors to miscellaneous data
Adding predicted_labels_colors to miscellaneous data
Adding rank_genes_groups to miscellaneous data
Adding layer ambiguous as data in assay ambiguous
Adding layer ambiguous as counts in assay ambiguous
Adding layer counts as data in assay counts
Adding layer counts as counts in assay counts
Adding layer matrix as data in assay matrix
Adding layer matrix as counts in assay matrix
Adding layer spliced as data in assay spliced
Adding layer spliced as counts in assay spliced
Adding layer unspliced as data in assay unspliced
Adding layer unspliced as counts in assay unspliced
目录下多了一个文件:GSM6923183_MC_scRNA.h5seurat
sce.all <- LoadH5Seurat("./GSE222427/GSM6923183_MC_scRNA.h5seurat")
Validating h5Seurat file
错误: Ambiguous assays
随手一搜,找到一个帖子:https://zhuanlan.zhihu.com/p/12861008987#:~:text=%E4%BD%BF%E7%94%A8%20SeuratDisk%20%E5%8C%85%E4%B8%AD%E7%9A%84%20LoadH5Seurat%20%E5%87%BD%E6%95%B0%E6%97%B6%E6%8A%A5%E9%94%99%EF%BC%9A,%E8%A7%A3%E5%86%B3%E5%8A%9E%E6%B3%95%EF%BC%9A%20overwrite%20%3D%20TRUE%2Cassay%20%3D%20%22RNA%22%29
Validating h5Seurat file
Initializing RNA with data
Adding counts for RNA
Adding feature-level metadata for RNA
错误: Missing required datasets 'levels' and 'values'
但是上面的链接里面的方法不奏效,去github上看了一下,这个问题吵得真热闹啊!!!:https://github.com/mojaveazure/seurat-disk/issues/109,超多人一样的报错,就是没有人出来解决!试试看这个页面最后提到的办法吧:
library("Seurat")
library("anndata")
print("Convert from Scanpy to Seurat...")
data <- read_h5ad("example.hd5ad")
data <- CreateSeuratObject(counts = t(data$X), meta.data = data$obs)
print(str(data))
还是报错!如果是初学者,这个时候可能已经崩溃了!@!
然后又搜了一个办法,生信技能树的优秀学徒写的:单细胞Seruat和h5ad数据格式互换(R与python)方法学习和整理
这里需要创建python的conda环境sc,以及R环境中的Seurat为V4:
# 其他方法:R代码
# install.packages("anndata")
library(sceasy)
library(reticulate)
library(Seurat)
# v4
packageVersion("Seurat")
# [1] ‘4.4.0’
library(BiocParallel)
register(MulticoreParam(workers = 4, progressbar = TRUE))
use_condaenv(condaenv = '/nas2/zhangj/biosoft/miniconda3/envs/sc')
# h5ad转为Seurat
sceasy::convertFormat(obj = "GSE222427/GSM6923183_MC_scRNA.h5ad", from="anndata",to="seurat",outFile = 'scRNA.rds')
#
# X -> counts
# An object of class Seurat
# 20320 features across 53748 samples within 1 assay
# Active assay: RNA (20320 features, 0 variable features)
# 2 layers present: counts, data
# 3 dimensional reductions calculated: pca, pca_harmony, umap
R环境中的Seurat为V4
sce.all <- readRDS("scRNA.rds")
sce.all
# 查看特征
as.data.frame(sce.all@assays$RNA$counts[1:10, 1:2])
head(sce.all@meta.data, 10)
colnames(sce.all@meta.data)
table(sce.all$donor_id)
table(sce.all$batch)
sce.all$orig.ident <- sce.all$donor_id
An object of class Seurat
20320 features across 53748 samples within 1 assay
Active assay: RNA (20320 features, 0 variable features)
2 layers present: counts, data
3 dimensional reductions calculated: pca, pca_harmony, umap
这里读取到seuart v5的R环境中:
# 变成v5
packageVersion("Seurat")
# [1] ‘5.1.0’
sce.all <- readRDS("scRNA.rds")
sce.all
sce.all_v5 <- CreateSeuratObject(counts = sce.all@assays$RNA@counts, meta.data = sce.all@meta.data)
sce.all_v5
sce.all_v5$orig.ident <- sce.all_v5$donor_id
head(sce.all_v5@meta.data)
library(qs)
qsave(sce.all_v5, file="GSE222427/sce.all_v5.qs")
那就超级简单了:你需要首先在Python里面,先导出python中的annadata主要数据。
创建文件:touch h5ad2rds.ipynb,使用vscode打开h5ad2rds.ipynb
# python中导出数据
import scipy.sparse as sparse
import scipy.io as sio
import scipy.stats as stats
import numpy as np
import scanpy as sc
import os
all_data = sc.read_h5ad("./GSE222427/GSM6923183_MC_scRNA.h5ad")
all_data
all_data.var.head()
cellinfo = all_data.obs
cellinfo = all_data.obs
geneinfo = all_data.var
mtx=all_data.X
# 保存
cellinfo.to_csv("cellinfo.csv")
geneinfo.to_csv("geneinfo.csv")
sio.mmwrite("sparse_matrix.mtx",mtx)
library(Seurat)
library(clustree)
library(cowplot)
library(data.table)
library(ggplot2)
library(patchwork)
library(stringr)
library(qs)
library(Matrix)
mtx <- readMM( "sparse_matrix.mtx" )
mtx[1:4,1:4]
dim(mtx)
cl <- fread( "cellinfo.csv", header = T,data.table = F )
head(cl)
dim(cl)
rl <- fread( "geneinfo.csv", header = T,data.table = F )
head(rl)
dim(rl)
rownames(mtx) <- cl$V1
colnames(mtx) <- rl$V1 #paste0('c',cl$V2)
mtx[1:4,1:4]
mtx <- t(mtx)
dim(mtx)
meta <- cl
rownames(meta) <- cl$V1
identical(rownames(meta),colnames(mtx))
library(Seurat)
sce.all <- CreateSeuratObject(counts = mtx, meta.data = meta, min.cells = 5)
as.data.frame(sce.all@assays$RNA$counts[1:10, 1:2])
head(sce.all@meta.data, 10)