值得注意的是本次投稿并不是基于R语言,而是python哦!
下面申小忱的投稿
作者:xjtu申小忱
这次我们来复现一篇单细胞的文章。这篇我们只来复现细胞图谱和拟时序分析 像细胞通讯,还有富集分析还是很简单的。大家可以继续走下去,然后我们来交流讨论! 这篇全篇基于python复现。
import pandas as pd
import numpy as np
import pandas as pd
import scanpy as sc
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
from matplotlib import rcParams
import scanpy as sc
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
#sc.logging.print_versions()
#results_file = './write/pa.h5ad'
sc.settings.set_figure_params(dpi=300, frameon=False, figsize=(3, 3), facecolor='white')
raw_UMIcounts = pd.read_table("./GSM4798908_B2019-1.expression_matrix.txt", header=0, index_col=0)
#raw_UMIcounts.head(10)
raw_UMIcounts_pl = pd.read_table("./GSM4798909_B2019-2.expression_matrix.txt", header=0, index_col=0)
raw_UMIcounts_nl= pd.read_table("./GSM4798910_B2019-3.expression_matrix.txt", header=0, index_col=0)
pi_gene_bar= raw_UMIcounts.columns
pi_gene_bar=pd.Series(pi_gene_bar)+"_primary"
raw_UMIcounts.columns= pi_gene_bar
raw_UMIcounts_pl.columns = pd.Series(raw_UMIcounts_pl.columns)+"_positive"
raw_UMIcounts_nl.columns = pd.Series(raw_UMIcounts_nl.columns)+"_negtive"
counts_join=raw_UMIcounts.join(raw_UMIcounts_pl,how="inner")
counts_join= counts_join.join(raw_UMIcounts_nl,how="inner")
print(11323+11356+11467)
一共是34146个细胞!
34146
counts_join.T.to_csv("./counts_join.csv")
adata = sc.read_csv("./counts_join.csv", first_column_names=True)
adata.write('./join_adata.h5ad')
adata=sc.read_h5ad("./join_adata.h5ad")
sc.pl.highest_expr_genes(adata, n_top=20, )
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=5)
adata.var['mt'] = adata.var_names.str.startswith('MT-') # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
jitter=0.4, multi_panel=True)
sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')
adata = adata[adata.obs.n_genes_by_counts <3000, :]
adata = adata[adata.obs.pct_counts_mt <20, :]
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
jitter=0.4, multi_panel=True)
a=[]
for i in adata.obs.index:
a.append(i[13:])
from collections import Counter
count = Counter(a)
print(count)
Counter({'positive': 11147, 'negtive': 10036, 'primary': 9621})
adata.obs["disease_group"]=a
Trying to set attribute `.obs` of view, copying.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata)
adata = adata[:, adata.var.highly_variable]
Counter(adata.var.highly_variable)
Counter({True: 3907})
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
C:\Users\shenxiaochen\anaconda3\lib\site-packages\anndata\_core\anndata.py:1228: ImplicitModificationWarning: Initializing view as actual.
warnings.warn(
Trying to set attribute `.obs` of view, copying.
... storing 'disease_group' as categorical
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')
sc.pl.pca_variance_ratio(adata, log=True)
pca 降维结果。
results_file= './first_ana.h5ad'
adata.write(results_file)
adata=sc.read_h5ad("./first_ana.h5ad")
sc.pp.neighbors(adata, n_neighbors=15, n_pcs=20)
computing neighbors
using 'X_pca' with = 20
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
sc.tl.umap(adata,min_dist=0.3)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:00:14)
adata
AnnData object with n_obs × n_vars = 30804 × 3907
obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'disease_group'
var: 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'pca', 'neighbors', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'distances', 'connectivities'
sc.pl.umap(adata, color="disease_group",)
import scanpy as sc
import pandas as pd
from matplotlib.pyplot import rc_context
with rc_context({'figure.figsize': (5, 5)}):
sc.pl.umap(adata, color='disease_group', add_outline=True,
legend_fontsize=12, legend_fontoutline=2,frameon=False,
title='clustering of cells', palette='Paired')
sc.tl.leiden(adata)
running Leiden clustering
finished: found 23 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:04)
sc.tl.louvain(adata)
running Louvain clustering
using the "louvain" package of Traag (2017)
finished: found 20 clusters and added
'louvain', the cluster labels (adata.obs, categorical) (0:00:05)
#with rc_context({'figure.figsize': (10, 5)}):
sc.pl.umap(adata, color=['leiden','louvain'], add_outline=True,legend_loc='on data',
legend_fontsize=12, legend_fontoutline=2,frameon=False,
title=['clustering of cells','clustering of cells'], palette='Set1')
sc.tl.rank_genes_groups(adata, 'louvain', method='wilcoxon')
#sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)
ranking genes
finished: added to `.uns['rank_genes_groups']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:17)
C:\Users\shenxiaochen\anaconda3\lib\site-packages\scanpy\tools\_rank_genes_groups.py:420: RuntimeWarning: invalid value encountered in log2
self.stats[group_name, 'logfoldchanges'] = np.log2(
result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names
marker=pd.DataFrame(
{group + '_' + key[:1]: result[key][group]
for group in groups for key in ['names', 'pvals']})
marker.to_csv("./marker_louvain.csv")
cluster2annotation = {
'0':'Bcells',
'1':'myofibroblasts',
'2': 'NaiveT',
'3':'NaiveT',
'4':'cancercells',
'5': 'Bcells',
'6':'NaiveT',
'7': 'NaiveT',
'8':'myofibroblasts',
'9':'CD8Effector',
'10':'myeloid',
'11':'Bcells',
'12': 'endothelial',
'13':'Macrophages',
'14': 'cancerstemcells',
'15':'myeloid',
'16':'CXCL14cancer',
'17':'plasma',
'18':'matureDC',
'19':'pericytes'
}
# add a new `.obs` column called `cell type` by mapping clusters to annotation using pandas `map` function
adata.obs['cell type'] = adata.obs['louvain'].map(cluster2annotation).astype('category')
marker_genes_dict = {
'cancercells': ['KRT19'],
'cancerstemcells': ['KRT19','TOP2A'],
'CXCL14cancer': ['CXCL14'],
#'NaiveT': [],
#'CD8Effector': [],
'Bcells': ['CD79A','CD79B'],
'Macrophages': ['LYZ','IL1B'],
'myeloid':['LYZ'],
'matureDC':['LAMP3','CCR7'],
'plasma':['JCHAIN','IGHG3','MZB1'],
'endothelial':['MCAM','PECAM1'],
'pericytes':['ACTA2','TAGLN','MCAM'],
'myofibroblasts':['LUM','DCN','TAGLN'],
'cyclingcells':['TOP2A']
}
sc.pl.dotplot(adata, marker_genes_dict, 'louvain', dendrogram=True)
WARNING: dendrogram data not found (using key=dendrogram_louvain). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.
using 'X_pca' with n_pcs = 50
Storing dendrogram info using `.uns['dendrogram_louvain']`
WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different.
categories: 0, 1, 2, etc.
var_group_labels: cancercells, cancerstemcells, CXCL14cancer, etc.
png
ax = sc.pl.stacked_violin(adata, marker_genes_dict, groupby='louvain', swap_axes=False, dendrogram=True,cmap='Paired_r')
WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different.
categories: 0, 1, 2, etc.
var_group_labels: cancercells, cancerstemcells, CXCL14cancer, etc.
sc.pl.tracksplot(adata, marker_genes_dict, groupby='louvain', swap_axes=False, dendrogram=True,cmap='Paired_r')
WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different.
categories: 0, 1, 2, etc.
var_group_labels: cancercells, cancerstemcells, CXCL14cancer, etc.
adata.var_names
Index(['CCND1', 'ACTA2', 'POSTN', 'GADD45G', 'PSMA7', 'PDCD11', 'IGFBP7',
'ING5', 'RPL32P3', 'AC241952.1',
...
'AC125807.1', 'CD1A', 'H2AFZP3', 'IGLV1-44', 'CLEC17A', 'EEF1A1P9',
'AC073111.2', 'AL355076.2', 'AC103563.7', 'IGHV5-78'],
dtype='object', length=3907)
with rc_context({'figure.figsize': (4.5, 3)}):
sc.pl.violin(adata, ['CD79A', 'CD79B'], groupby='louvain' )
sc.pl.umap(adata, color='louvain', legend_loc='on data',
frameon=False, legend_fontsize=10, legend_fontoutline=2,title="")
sc.pl.umap(adata, color='cell type', legend_loc='on data',
frameon=False, legend_fontsize=5, legend_fontoutline=0.5,save="jmzeng.jpg")
WARNING: saving figure to file figures\umapjmzeng.jpg.pdf
adata.obs['cell type'].cat.categories
Index(['Bcells', 'CD8Effector', 'CXCL14cancer', 'Macrophages', 'NaiveT',
'cancercells', 'cancerstemcells', 'endothelial', 'matureDC', 'myeloid',
'myofibroblasts', 'pericytes', 'plasma'],
dtype='object')
adata_new=adata[(adata.obs['cell type']== 'CXCL14cancer')|(adata.obs['cell type']=='cancercells')|(adata.obs['cell type']== 'cancerstemcells'), :]
adata_new.obs['cell type'].cat.categories
Index(['CXCL14cancer', 'cancercells', 'cancerstemcells'], dtype='object')
adata_new
View of AnnData object with n_obs × n_vars = 2883 × 3907
obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'disease_group', 'leiden', 'louvain', 'cell type'
var: 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'pca', 'neighbors', 'umap', 'disease_group_colors', 'leiden', 'louvain', 'leiden_colors', 'louvain_colors', 'rank_genes_groups', 'dendrogram_louvain', 'cell type_colors'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'distances', 'connectivities'
sc.tl.paga(adata_new, groups='cell type')
running PAGA
Trying to set attribute `.uns` of view, copying.
finished: added
'paga/connectivities', connectivities adjacency (adata.uns)
'paga/connectivities_tree', connectivities subtree (adata.uns) (0:00:00)
sc.pl.paga(adata_new, color=['cell type','CXCL14'])
--> added 'pos', the PAGA positions (adata.uns['paga'])
#adata_new.uns['iroot'] = np.flatnonzero(adata_new.obs['cell type'] == 'cancerstemcells')[0]
sc.tl.draw_graph(adata_new, init_pos='paga')
drawing single-cell graph using layout 'fa'
finished: added
'X_draw_graph_fa', graph_drawing coordinates (adata.obsm) (0:00:17)
adata_new.uns['iroot'] = np.flatnonzero(adata_new.obs['cell type'] == 'cancerstemcells')[0]
sc.tl.dpt(adata_new)
WARNING: Trying to run `tl.dpt` without prior call of `tl.diffmap`. Falling back to `tl.diffmap` with default parameters.
computing Diffusion Maps using n_comps=15(=n_dcs)
computing transitions
finished (0:00:00)
eigenvalues of transition matrix
[1. 0.9954424 0.9901249 0.98625064 0.98440194 0.9764193
0.97149456 0.96843475 0.9636642 0.9555143 0.94893146 0.9446058
0.94269824 0.9330055 0.9283923 ]
finished: added
'X_diffmap', diffmap coordinates (adata.obsm)
'diffmap_evals', eigenvalues of transition matrix (adata.uns) (0:00:00)
computing Diffusion Pseudotime using n_dcs=10
finished: added
'dpt_pseudotime', the pseudotime (adata.obs) (0:00:00)
sc.pl.draw_graph(adata_new, color=['cell type', 'dpt_pseudotime','disease_group'], legend_loc='on data')
sc.tl.paga(adata, groups='cell type')
sc.pl.paga(adata, color=['cell type'],node_size_scale=0.5)
running PAGA
finished: added
'paga/connectivities', connectivities adjacency (adata.uns)
'paga/connectivities_tree', connectivities subtree (adata.uns) (0:00:00)
--> added 'pos', the PAGA positions (adata.uns['paga'])
sc.tl.draw_graph(adata, init_pos='paga')
drawing single-cell graph using layout 'fa'
finished: added
'X_draw_graph_fa', graph_drawing coordinates (adata.obsm) (0:05:47)
sc.pl.draw_graph(adata, color=['cell type'], legend_loc='on data',legend_fontsize='xx-small',legend_fontweight='normal')
adata.uns['iroot'] = np.flatnonzero(adata.obs['cell type'] == 'NaiveT')[0]
sc.tl.dpt(adata)
WARNING: Trying to run `tl.dpt` without prior call of `tl.diffmap`. Falling back to `tl.diffmap` with default parameters.
computing Diffusion Maps using n_comps=15(=n_dcs)
computing transitions
finished (0:00:00)
eigenvalues of transition matrix
[1. 0.9967204 0.9959651 0.9953239 0.9936516 0.9933531
0.98944426 0.98545295 0.9851446 0.9843129 0.9827176 0.98058397
0.9788297 0.97733855 0.97462684]
finished: added
'X_diffmap', diffmap coordinates (adata.obsm)
'diffmap_evals', eigenvalues of transition matrix (adata.uns) (0:00:00)
computing Diffusion Pseudotime using n_dcs=10
finished: added
'dpt_pseudotime', the pseudotime (adata.obs) (0:00:00)
sc.pl.draw_graph(adata, color=['cell type', 'dpt_pseudotime','disease_group'], legend_loc='on data',legend_fontsize='xx-small',legend_fontweight='normal')