本次走一遍Scanpy全流程,数据集仍为GSE188711,该数据集可至GEO官网下载。
该内容包含:1.10X数据整理和读取;2.Scanpy全流程(过滤降维聚类分群可视化);3.harmonypy整合数据;4.细胞周期矫正;4.scrublet检测双细胞及去除
上一次推文进行了Seurat和h5ad数据相互转化,以及10X多样本数据整理和读取(Python)。推文可见:https://mp.weixin.qq.com/s/JCGnkNKsQhGgfcazj63Mhw
此外,可以向“生信技能树”公众号发送关键词‘单细胞’,直接获取Seurat V5版本的完整代码。
import os
import shutil
import re
os.getcwd()
fs = [os.path.join('GSE188711_RAW', f) for f in os.listdir('GSE188711_RAW') if 'features' in f]
print(fs)
samples1 = fs
# ['GSE188711_RAW/GSM5688707_features_JCA.tsv.gz', 'GSE188711_RAW/GSM5688706_features_WGC.tsv.gz',
# 'GSE188711_RAW/GSM5688711_features_R_CRC4.tsv.gz', 'GSE188711_RAW/GSM5688709_features_RS-CRC1.tsv.gz',
# 'GSE188711_RAW/GSM5688710_features_R_CRC3.tsv.gz', 'GSE188711_RAW/GSM5688708_features_LS-CRC3.tsv.gz']
# samples2 = [re.sub(r'_features', '', f) for f in fs]
# samples2 = [re.sub(r'\.tsv\.gz$', '', f) for f in samples2]
# print(samples2)
samples2 = [re.findall(r'GSM\d+', f)[0] for f in fs]
print(samples2)
sample_to_file = {}
for filepath in samples1:
# 从文件路径里提取样本ID,和samples2格式对应
# 这里用正则匹配 'GSM' + 数字,比如 GSM12345
match = re.findall(r'GSM\d+', filepath)
if match:
sample_id = match[0]
sample_to_file[sample_id] = filepath
print(sample_to_file)
# 创建 outputs 文件夹(如果不存在)
os.makedirs('outputs', exist_ok=True)
# 遍历 samples2 ,根据样本ID找到对应文件路径进行复制
for sample_id in samples2:
if sample_id not in sample_to_file:
print(f"警告:找不到样本 {sample_id} 对应的文件!")
continue
y = sample_to_file[sample_id]
output_dir = os.path.join('outputs', sample_id)
os.makedirs(output_dir, exist_ok=True)
# 复制并重命名 features 文件
shutil.copy(y, os.path.join(output_dir, 'features.tsv.gz'))
# 对应的 matrix 文件名替换,构建出matrix文件,并转移
matrix_file = y.replace('features', 'matrix').replace('.tsv', '.mtx')
shutil.copy(matrix_file, os.path.join(output_dir, 'matrix.mtx.gz'))
# 对应的 barcodes 文件名替换,构建出barcodes文件,并转移
barcodes_file = y.replace('features', 'barcodes')
shutil.copy(barcodes_file, os.path.join(output_dir, 'barcodes.tsv.gz'))
以上就是整理文件的步骤了。
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import numpy as np
import anndata as ad
import pooch
import os
sc.settings.set_figure_params(dpi=100, facecolor="white")
os.makedirs("1-DataPre",exist_ok=True)
os.getcwd()
# '/Users/zaneflying/Desktop/PythonGSE188711'
dir_path = '/Users/zaneflying/Desktop/PythonGSE188711/outputs/'
os.listdir(dir_path)
#['.DS_Store',
# 'GSM5688711',
# 'GSM5688710',
# 'GSM5688709',
# 'GSM5688707',
# 'GSM5688706',
# 'GSM5688708']
# 这里的目的是不读取以.开头的文件
samples = [s for s in os.listdir(dir_path) if not s.startswith('.')]
samples
# ['GSM5688711',
# 'GSM5688710',
# 'GSM5688709',
# 'GSM5688707',
# 'GSM5688706',
# 'GSM5688708']
sce_list = []
# 对于具有多个批次的数据集,应单独对每个样品进行质量控制
for s in samples :
sample_path = os.path.join(dir_path, s)
#使用gene_symbols作为ID,使用10x
adata = sc.read_10x_mtx(sample_path, var_names='gene_symbols', cache=True)
# 过滤:至少5个细胞的基因,至少300个基因的细胞 (对应min.cells和min.features)
sc.pp.filter_genes(adata, min_cells=5)
sc.pp.filter_cells(adata, min_genes=300)
# 给AnnData添加项目名称
adata.obs['project'] = s
sce_list.append(adata)
# sce_list 是包含多个AnnData对象的列表
sce_list
# [AnnData object with n_obs × n_vars = 6277 × 18958
# obs: 'n_genes', 'project'
# var: 'gene_ids', 'feature_types', 'n_cells',
# AnnData object with n_obs × n_vars = 4398 × 17683
# obs: 'n_genes', 'project'
# var: 'gene_ids', 'feature_types', 'n_cells',
# AnnData object with n_obs × n_vars = 6057 × 18601
# obs: 'n_genes', 'project'
# var: 'gene_ids', 'feature_types', 'n_cells',
# AnnData object with n_obs × n_vars = 4760 × 19390
# obs: 'n_genes', 'project'
# var: 'gene_ids', 'feature_types', 'n_cells',
# AnnData object with n_obs × n_vars = 4431 × 18278
# obs: 'n_genes', 'project'
# var: 'gene_ids', 'feature_types', 'n_cells',
# AnnData object with n_obs × n_vars = 6371 × 18581
# obs: 'n_genes', 'project'
# var: 'gene_ids', 'feature_types', 'n_cells']
adata = sc.concat(sce_list,label="sampleid",index_unique='_')
adata.obs_names_make_unique
# AnnData object with n_obs × n_vars = 32294 × 16174
# obs: 'n_genes', 'project', 'sampleid'
adata.write('./1-DataPre/GSE188711.h5ad')
os.chdir("./1-DataPre/")
os.getcwd()
# '/Users/zaneflying/Desktop/PythonGSE188711/1-DataPre'
adata = sc.read_h5ad("GSE188711.h5ad")
# mitochondrial genes, "MT-" for human, "Mt-" for mouse
adata.var["mt"] = adata.var_names.str.startswith("MT-")
# ribosomal genes
adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes
adata.var["hb"] = adata.var_names.str.contains("^HB[^(P)]")
adata.var # 可以看到每个基因的元数据
adata.obs # 可以看到每个细胞的元数据
# 计算每个细胞针对指定基因集的质量控制指标
sc.pp.calculate_qc_metrics(
adata, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True
)
width_cm = 50
height_cm = 10
plt.figure(figsize=(width_cm / 2.54, height_cm / 2.54))
sc.pl.violin(
adata,
["pct_counts_mt", "pct_counts_ribo","pct_counts_hb","n_genes_by_counts", "total_counts"],
jitter=0.4,# 横向抖动防止重叠
multi_panel=True, # 绘制在单独的子图中
show=False
)
plt.savefig("过滤前.pdf",bbox_inches='tight') # 不显示就能保存了。。。。show =False
plt.show()
# 对MT、Ribo、Hb、n_cells_by_counts和total_counts数值进行过滤
adata = adata[
(adata.obs['n_genes_by_counts'] > 500) &
(adata.obs['n_genes_by_counts'] < 6000) &
(adata.obs['total_counts'] < 10000) &
(adata.obs['pct_counts_mt'] < 10) &
(adata.obs['pct_counts_ribo'] < 15) &
(adata.obs['pct_counts_hb'] < 1),
:
]
# 这里的:是保留所有的列
width_cm = 50
height_cm = 10
plt.figure(figsize=(width_cm / 2.54, height_cm / 2.54))
sc.pl.violin(
adata,
["pct_counts_mt", "pct_counts_ribo","pct_counts_hb","n_genes_by_counts", "total_counts"],
jitter=0.4,# 横向抖动防止重叠
multi_panel=True, # 绘制在单独的子图中
#groupby='sampleid', # 或其他 group 字段
palette=["#ff7f0e"], #["#1f77b4", "#2ca02c", "#d62728", "#9467bd"],
show=False
)
plt.savefig("过滤后.pdf",bbox_inches='tight') # 不显示就能保存了。。。。show =False
plt.show()
颜色自己改
width_cm = 10
height_cm = 10
plt.figure(figsize=(width_cm / 2.54, height_cm / 2.54))
sc.pl.scatter(adata, "total_counts", "n_genes_by_counts", color="pct_counts_mt",show=False)
plt.savefig("scatterPlot.pdf",bbox_inches='tight')
plt.show()
这里做了之后后面再考虑去不去除
# 使用scrublet进行双细胞检测
sc.pp.scrublet(adata, batch_key="sampleid")
adata.obs
# 保存原始count矩阵数据
# 这一步也可以不用
adata.layers["counts"] = adata.X.copy()
adata.layers
# Layers with keys: counts
# 说明目前layers中已经保存了原始的counts文件,这里就是跟Seurat自动生成不一样
归一化
# Normalizing to median total counts
sc.pp.normalize_total(adata,target_sum=1e4)
# Logarithmize the data
sc.pp.log1p(adata)
把Normallizing和log后的数据赋值到raw中去
# 保存一个原始数据
adata.raw = adata.copy()
找高变基因
# 全部
#sc.pp.highly_variable_genes(adata, n_top_genes=adata.shape[1], batch_key="sampleid")
#sc.pl.highly_variable_genes(adata)
# 官网推荐
sc.pp.highly_variable_genes(
adata,
min_mean= 0.0125,
max_mean= 3,
min_disp= 0.5
)
sc.pl.highly_variable_genes(adata)
# 保留高变基因,去除其他基因,但其实可以不去除
# adata = adata[:, adata.var.highly_variable]
# adata
# 基因集来自Seurat
s_genes = [
"MCM5", "PCNA", "TYMS", "FEN1", "MCM2", "MCM4", "RRM1",
"UNG", "GINS2", "MCM6", "CDCA7", "DTL", "PRIM1", "UHRF1",
"HELLS", "RFC2", "RPA2", "NASP", "RAD51AP1", "GMNN", "WDR76",
"SLBP", "CCNE2", "UBR7", "POLD3", "MSH2", "ATAD2", "RAD51",
"RRM2", "CDC45", "CDC6", "EXO1", "TIPIN", "DSCC1", "BLM",
"CASP8AP2", "USP1", "CLSPN", "POLA1", "CHAF1B", "BRIP1", "E2F8"
]
g2m_genes = [
"HMGB2", "CDK1", "NUSAP1", "UBE2C", "BIRC5", "TPX2", "TOP2A", "NDC80",
"CKS2", "NUF2", "CKS1B", "MKI67", "TMPO", "CENPF", "TACC3", "SMC4",
"CCNB2", "CKAP2L", "CKAP2", "AURKB", "BUB1", "KIF11", "ANP32E", "TUBB4B",
"GTSE1", "KIF20B", "HJURP", "CDCA3", "CDC20", "TTK", "CDC25C", "KIF2C",
"RANGAP1", "NCAPD2", "DLGAP5", "CDCA2", "CDCA8", "ECT2", "KIF23", "HMMR",
"AURKA", "PSRC1", "ANLN", "LBR", "CKAP5", "CENPE", "CTCF", "NEK2",
"G2E3", "GAS2L3", "CBX5", "CENPA"
]
# 细胞周期矫正
sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)
# 整合基因
cell_cycle_genes = s_genes+g2m_genes
矫正前
adata_cc_genes = adata[:, cell_cycle_genes]
sc.pp.scale(adata_cc_genes, max_value=10) # 也可以把10设定为none
sc.tl.pca(adata_cc_genes)
sc.pl.pca_scatter(adata_cc_genes, color='phase')
关键的矫正代码
sc.pp.regress_out(adata, ['S_score', 'G2M_score'])
矫正后
adata_cc_genes = adata[:, cell_cycle_genes]
sc.pp.scale(adata_cc_genes, max_value=10) # 也可以把10设定为none
sc.tl.pca(adata_cc_genes)
sc.pl.pca_scatter(adata_cc_genes, color='phase')
# 整体的scale Data
sc.pp.scale(adata, max_value=10)
# pca降维
sc.tl.pca(adata)
# 这个图就可以选择合适的PCA数值了
sc.pl.pca_variance_ratio(adata,n_pcs=50,log=True)
# 按照不同分组检查pca的情况
sc.pl.pca(
adata,
color = ["project","project","pct_counts_mt","pct_counts_mt"],# 按照什么参数进行上色
dimensions=[(0,1),(2,3),(0,1),(2,3)], # 按照不同的pca进行展示
ncols =2,
size =2,
)
# 图片不展示
构建邻接图
sc.pp.neighbors(adata, n_neighbors=20, n_pcs=15)
# umap处理
sc.tl.umap(adata)
width_cm = 10
height_cm = 10
plt.figure(figsize=(width_cm / 2.54, height_cm / 2.54))
sc.pl.umap(
adata,
color="project",
# Setting a smaller point size to get prevent overlap
size=2,
show=False
)
plt.savefig("umap_non.pdf",bbox_inches='tight') # 不显示就能保存了。。。。show =False
plt.show()
import scanpy.external as sce
import harmonypy as hm
sce.pp.harmony_integrate(adata, 'project')
# Then we calculate a new umap and tsne.
sc.pp.neighbors(adata, n_neighbors=20, n_pcs=15, use_rep='X_pca_harmony')
sc.tl.umap(adata)
# sc.tl.tsne(adata, use_rep='X_pca_harmony')
# sc.tl.leiden(adata, resolution=0.5)
width_cm = 10
height_cm = 10
plt.figure(figsize=(width_cm / 2.54, height_cm / 2.54))
sc.pl.umap(
adata,
color="project",
# Setting a smaller point size to get prevent overlap
size=2,
show=False
)
plt.savefig("umap_scanorama.pdf",bbox_inches='tight')
plt.show()
聚类
# 荐莱顿图聚类方法
# 生成0.1到2.0的分辨率列表,步长0.1
resolutions = np.arange(0.1, 2.1, 0.1)
# 对每个分辨率运行Leiden聚类,结果存入adata.obs对应列
for res in resolutions:
sc.tl.leiden(
adata,
resolution=res,
key_added=f'leiden_{res:.1f}',
flavor='igraph',
n_iterations=2
)
# 生成对应的key列表,供绘制聚类树时使用
all_keys = [f'leiden_{res:.1f}' for res in resolutions]
width_cm = 40
height_cm = 10
plt.figure(figsize=(width_cm / 2.54, height_cm / 2.54))
sc.pl.umap(adata, color=["leiden_0.2","leiden_0.5", "leiden_0.8","leiden_1.0", "leiden_1.2","leiden_1.5","leiden_1.8","leiden_2.0"], show=False)
plt.savefig("umap_leiden.pdf",bbox_inches='tight')
plt.show()
是否需要过滤双细胞
width_cm = 30
height_cm = 10
plt.figure(figsize=(width_cm / 2.54, height_cm / 2.54))
sc.pl.umap(
adata,
color=["leiden", "predicted_doublet", "doublet_score"],
# increase horizontal space between panels
wspace=0.5,
size=3,
show =False
)
plt.savefig("Doublet_plot.pdf",bbox_inches='tight')# 不显示就能保存了。。。。show =False
plt.show()
有一点双细胞
# 过滤双细胞
adata = adata[
(adata.obs['predicted_doublet'] == False),
:
]
adata
# View of AnnData object with n_obs × n_vars = 19410 × 16174
# obs: 'n_genes', 'project', 'sampleid', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'doublet_score', 'predicted_doublet', 'S_score', 'G2M_score', 'phase', 'leiden_0.1', 'leiden_0.2', 'leiden_0.3', 'leiden_0.4', 'leiden_0.5', 'leiden_0.6', 'leiden_0.7', 'leiden_0.8', 'leiden_0.9', 'leiden_1.0', 'leiden_1.1', 'leiden_1.2', 'leiden_1.3', 'leiden_1.4', 'leiden_1.5', 'leiden_1.6', 'leiden_1.7', 'leiden_1.8', 'leiden_1.9', 'leiden_2.0', 'leiden'
# var: 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'mean', 'std'
# uns: 'scrublet', 'log1p', 'hvg', 'pca', 'project_colors', 'neighbors', 'umap', 'leiden_0.1', 'leiden_0.2', 'leiden_0.3', 'leiden_0.4', 'leiden_0.5', 'leiden_0.6', 'leiden_0.7', 'leiden_0.8', 'leiden_0.9', 'leiden_1.0', 'leiden_1.1', 'leiden_1.2', 'leiden_1.3', 'leiden_1.4', 'leiden_1.5', 'leiden_1.6', 'leiden_1.7', 'leiden_1.8', 'leiden_1.9', 'leiden_2.0', 'leiden', 'leiden_colors', 'leiden_0.2_colors', 'leiden_0.5_colors', 'leiden_1.0_colors', 'leiden_1.2_colors', 'leiden_0.1_colors', 'leiden_2.0_colors', 'leiden_0.8_colors', 'leiden_1.5_colors', 'leiden_1.8_colors', 'predicted_doublet_colors'
# obsm: 'X_pca', 'X_umap', 'Scanorama'
# varm: 'PCs'
# layers: 'counts'
# obsp: 'distances', 'connectivities'
marker_genes = {
"Malignant": ["KRT14","KRT8","EPCAM"], # "KRT5",
"B/Plasma": ["CD79A", "MS4A1", "TNFRSF13C", "TNFRSF17"],
"T/NK": ["CD3D", "CD3E", "CD4", "CD8A", "GZMB", "NKG7"],
"Mono/Macro": ["CD68", "C1QA", "C1QB", "C1QC", "CD14", "FCGR3A", "FCN1", "SPI1"],
"Pericytes": ["RGS5", "MYH11"],
"Mast": ["TPSB2", "KIT"],
"Dendritic": ["CD1C", "FCER1A", "LILRA4"], # "CLEC9A"
"Fibroblasts": ["LUM", "DCN","COL1A1"],
"Endothelial": ["RAMP2", "CD34"],
"VSMCs":["ACTA2","MYH11"]
}
width_cm = 30
height_cm = 10
plt.figure(figsize=(width_cm / 2.54, height_cm / 2.54))
sc.pl.dotplot(adata,
marker_genes,
groupby="leiden_1.0",
standard_scale="var",
show =False
)
plt.savefig("leiden_1.0_marker.pdf",bbox_inches='tight')
plt.show()
获取集群特异性差异表达基因
sc.tl.rank_genes_groups(adata,
groupby="leiden_1.0",
method="wilcoxon")
width_cm = 30
height_cm = 10
plt.figure(figsize=(width_cm / 2.54, height_cm / 2.54))
sc.pl.rank_genes_groups_dotplot(
adata, groupby="leiden_1.0",
standard_scale="var",
n_genes=5,
show =False
)
plt.savefig("leiden_1.0_DEGs.pdf",bbox_inches='tight')
plt.show()
# 取出所有组别的名字,比如 '0', '1', '2' 等
groups = adata.uns['rank_genes_groups']['names'].dtype.names
all_dfs = []
for group in groups:
# 用scanpy自带的函数导出每个组的差异基因结果为DataFrame
df = sc.get.rank_genes_groups_df(adata, group=group)
df['group'] = group # 加一列表示是哪个组
all_dfs.append(df)
# 合并所有组的DataFrame
all_results = pd.concat(all_dfs,axis=0) # 0是行合并,1是列合并
# 保存为Excel文件
all_results.to_csv('rank_genes_groups_results.csv', index=False)
sc.get.rank_genes_groups_df(adata, group="12").head(15)
dc_cluster_genes = sc.get.rank_genes_groups_df(adata, group="12").head(5)["names"]
sc.pl.umap(
adata,
color=[*dc_cluster_genes, "leiden_1.0"],
legend_loc="on data",
frameon=False,
ncols=3,
)
细胞群映射
# 获取所有cluster ID,排序并转成字符串列表(方便后续匹配)
cluster_ids = sorted(adata.obs['leiden_1.0'].unique().tolist())
cluster_ids = [str(c) for c in cluster_ids] # 转成字符串
# 初始化 DataFrame,所有celltype先设为'Unknown'
celltype = pd.DataFrame({
'ClusterID': cluster_ids,
'celltype': ['Unknown'] * len(cluster_ids)
})
celltype
adata.obs['leiden_1.0'].value_counts()
# leiden_1.0
# 16 2594
# 4 1897
# 0 1775
# 13 1702
# 3 1566
# 15 1479
# 14 1193
# 8 1104
# 1 1039
# 7 1019
# 10 809
# 6 698
# 11 606
# 5 477
# 17 290
# 18 258
# 19 231
# 12 222
# 20 200
# 9 179
# 2 72
# Name: count, dtype: int64
# 根据cluster ID分组赋值细胞类型(字符串类型匹配)
# .loc[行条件, 列名] = 新值 语法表示:对满足“行条件”的那些行,在指定列名的位置赋新值
celltype.loc[celltype['ClusterID'].isin(['10']), 'celltype'] = 'Malignant cells'
celltype.loc[celltype['ClusterID'].isin(['3',"4","12"]), 'celltype'] = 'B cells'
celltype.loc[celltype['ClusterID'].isin(["2","13"]), 'celltype'] = 'Plasma'
celltype.loc[celltype['ClusterID'].isin(['0','1',"11","14","15","16","19"]), 'celltype'] = 'T/NK cells'
celltype.loc[celltype['ClusterID'].isin(['7','8','9',"20"]), 'celltype'] = 'Mono/Macro cells'
celltype.loc[celltype['ClusterID'].isin(['6']), 'celltype'] = 'Fibroblast'
celltype.loc[celltype['ClusterID'].isin(['5']), 'celltype'] = 'Mast cells'
celltype.loc[celltype['ClusterID'].isin(['18']), 'celltype'] = 'Eendritic cells'
celltype.loc[celltype['ClusterID'].isin(['17']), 'celltype'] = 'VSMCs'
# 接下来,将celltype信息映射回adata.obs
# 先确保adata.obs里的cluster列也是字符串形式
adata.obs['leiden_1.0_str'] = adata.obs['leiden_1.0'].astype(str)
# 建立映射字典:cluster_id -> celltype
mapping = dict(zip(celltype['ClusterID'], celltype['celltype']))
# 用映射字典给每个cell添加celltype信息
adata.obs['celltype'] = adata.obs['leiden_1.0_str'].map(mapping)
# 查看一下分布
print(adata.obs['celltype'].value_counts())
#celltype
#T/NK cells 8917
#B cells 3685
#Mono/Macro cells 2502
#Plasma 1774
#Malignant cells 809
#Fibroblast 698
#Mast cells 477
#VSMCs 290
#Eendritic cells 258
#Name: count, dtype: int64
width_cm = 10
height_cm = 10
plt.figure(figsize=(width_cm / 2.54, height_cm / 2.54))
fig = sc.pl.umap(
adata,
color=["celltype"],
legend_loc="on data",
frameon=True,
size=8, # 调整点大小,可根据需要改
legend_fontsize=12, # 调整图例字体大小
show=False)
# 遍历所有文本对象,调节字体大小
for txt in fig.axes.texts:
txt.set_fontsize(8) # 你想要的字体大小
plt.savefig("celltype.pdf",bbox_inches='tight')
plt.show()
width_cm = 30
height_cm = 10
plt.figure(figsize=(width_cm / 2.54, height_cm / 2.54))
sc.pl.dotplot(adata,
marker_genes,
groupby="celltype",
standard_scale="var",
show =False
)
plt.savefig("celltype_marker.pdf",bbox_inches='tight')
plt.show()
width_cm = 30
height_cm = 10
plt.figure(figsize=(width_cm / 2.54, height_cm / 2.54))
sc.pl.stacked_violin(adata,
marker_genes,
groupby="celltype")
plt.savefig("celltype_marker2.pdf",bbox_inches='tight',dpi=300)
plt.show()
# 对celltype做一下差异分析
sc.tl.rank_genes_groups(adata,
groupby="celltype",
method="wilcoxon")
width_cm = 30
height_cm = 10
plt.figure(figsize=(width_cm / 2.54, height_cm / 2.54))
sc.pl.rank_genes_groups_dotplot(
adata, groupby="celltype",
standard_scale="var",
n_genes=5,
show =False
)
plt.savefig("celltype_DEGs.pdf",bbox_inches='tight',dpi=300)
plt.show()
# UMAP基因图
width_cm = 30
height_cm = 10
plt.figure(figsize=(width_cm / 2.54, height_cm / 2.54))
sc.pl.umap(adata,
color=["CST3", "NKG7", "CD3E"],
show =False
)
plt.savefig("genes.pdf",bbox_inches='tight',dpi=300)
plt.show()
# 基因-小提琴图
width_cm = 60
height_cm = 10
plt.figure(figsize=(width_cm / 2.54, height_cm / 2.54))
sc.pl.violin(adata,
["CST3", "NKG7", "CD3E"],
groupby="celltype",
show = False)
fig = plt.gcf() # 获取当前figure
# 遍历所有文本对象,调节字体大小
for ax in fig.axes:
for label in ax.get_xticklabels():
label.set_rotation(45)
label.set_fontsize(12)
plt.savefig("ViolinPlot.pdf",bbox_inches='tight',dpi=300)
plt.show()
results_file = "./sce.all.h5ad"
adata.write(results_file, compression="gzip")
以上是Scanpy的完整分析流程。接下来有一个问题:在Scanpy中,如何实现类似Seurat中FindSubCluster函数的功能?
致谢:感谢曾老师以及生信技能树团队全体成员。更多精彩内容可关注公众号:生信技能树,单细胞天地,生信菜鸟团等公众号。
注:若对内容有疑惑或者有发现明确错误的朋友,请联系后台(欢迎交流)。更多相关内容可关注公众号:生信方舟。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。