常规的读入10x的3个文件,需要自己根据下面的网址去下载 pbmc3k_filtered_gene_bc_matrices.tar.gz 文件,并且解压哦,然后 Read10X 函数读入解压后的文件夹目录即可,代码如下所示:
library(Seurat)
# https://cf.10xgenomics.com/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz
## Load the PBMC dataset
pbmc.data <- Read10X(data.dir = "filtered_gene_bc_matrices/hg19/")
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k",
min.cells = 3, min.features = 200)
首先查看表达量矩阵,是稀疏矩阵格式,如下所示:
然后做一个简单的转换:
代码如下所示:
ct=pbmc@assays$RNA@counts
ct
ct[ct>0]=1
ct
代码如下所示;
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize",
scale.factor = 10000)
pbmc <- NormalizeData(pbmc)
## Identify the 2000 most highly variable genes
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
## In addition we scale the data
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc),
verbose = FALSE)
pbmc <- FindNeighbors(pbmc, dims = 1:10, verbose = FALSE)
pbmc <- FindClusters(pbmc, resolution = 0.5, verbose = FALSE)
pbmc <- RunUMAP(pbmc, dims = 1:10, umap.method = "uwot", metric = "cosine")
table(pbmc$seurat_clusters)
# pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25, verbose = FALSE)
接下来可以进行可视化:
library(patchwork)
library(ggplot2)
p1=DimPlot(pbmc, reduction = "umap", group.by = 'seurat_clusters',
label = TRUE, pt.size = 0.5)
p2=DotPlot(pbmc, features = c("MS4A1", "GNLY", "CD3E",
"CD14", "FCER1A", "FCGR3A",
"LYZ", "PPBP", "CD8A"),
group.by = 'seurat_clusters')+theme(axis.text.x = element_text(angle = 45,
vjust = 0.5, hjust=0.5))
p1+p2
如下所示:
0-1矩阵的降维聚类分群
如果我们不进行这样的0-1矩阵转换,得到的图表是:
原始矩阵的降维聚类分群
这样的肉眼查看差异还是有点挑战,我们选择如下所示的代码:
load(file = 'phe-by-basic-seurat.Rdata')
phe_basic=phe
load(file = 'phe-by-0-1-matrix.Rdata')
phe_0_1=phe
identical(rownames(phe_0_1),rownames(phe_basic))
library(gplots)
balloonplot(table(phe_basic$seurat_clusters,phe_0_1$seurat_clusters))
有意思的事情是,仍然是可以很大程度维持降维聚类分群结果的一致性哦!
可以看到:
代码如下所示:
phe_0_1$type_by_0_1 = ifelse(phe_0_1$seurat_clusters %in% c(0,1,2,5,8),'Tcells',
ifelse(phe_0_1$seurat_clusters %in% c(3),'Bcells','myeoloid'
))
table(phe_0_1$type_by_0_1)
phe_basic$type_by_basic = ifelse(phe_basic$seurat_clusters %in% c(0,2,4,6),'Tcells',
ifelse(phe_basic$seurat_clusters %in% c(3),'Bcells','myeoloid'
))
table(phe_basic$type_by_basic)
table(phe_basic$type_by_basic,phe_0_1$type_by_0_1)
结果确实很有意思:
Bcells myeoloid Tcells
Bcells 338 0 11
myeoloid 0 675 26
Tcells 2 0 1648
也就是说,我们的单细胞表达量矩阵里面,每个基因在每个细胞的表达量具体是多少其实并不重要,表达量高低也不是很重要,我们只需要知道它是否表达即可!
当然了,我说的是在降维聚类分群这个层面,并不是说后续差异分析,细胞通讯,转录因子分析哦!