Source code for pbmc3k_tutorial

import os
import requests
import tarfile
import numpy as np
import pandas as pd
import scanpy as sc
import json
from mdvtools.mdvproject import MDVProject

# Import the chart classes
from mdvtools.charts.row_chart import RowChart
from mdvtools.charts.dot_plot import DotPlot
from mdvtools.charts.histogram_plot import HistogramPlot
from mdvtools.charts.scatter_plot_3D import ScatterPlot3D
from mdvtools.charts.scatter_plot import ScatterPlot
from mdvtools.charts.table_plot import TablePlot

try:
    os.mkdir("data")
    os.mkdir("write")
except FileExistsError:
    pass
[docs] url = "http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz"
[docs] r = requests.get(url)
with open("data/pbmc3k_filtered_gene_bc_matrices.tar.gz", "wb") as f: f.write(r.content)
[docs] tar = tarfile.open("data/pbmc3k_filtered_gene_bc_matrices.tar.gz")
tar.extractall("data") tar.close() print("Downloaded and extracted data") # https://scanpy-tutorials.readthedocs.io/en/latest/pbmc3k.html # * * * Data Analysis section * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3) sc.logging.print_header()
[docs] dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(dir) adata = sc.read_10x_mtx( "data/filtered_gene_bc_matrices/hg19/", # the directory with the `.mtx` file var_names="gene_symbols", # use gene symbols for the variable names (variables-axis index) cache=True, # write a cache file for faster subsequent reading ) # adata.var_names_make_unique() # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx` # annotate the group of mitochondrial genes as "mt" adata.var["mt"] = adata.var_names.str.startswith("MT-") sc.pp.calculate_qc_metrics( adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True ) sc.pp.filter_cells(adata, min_genes=200) sc.pp.filter_genes(adata, min_cells=3) # pylance in vscode is giving type errors for the following lines - even though the code not only runs correctly, # but pyright when called from terminal or in CI action also doesn't give any errors. adata = adata[adata.obs.n_genes_by_counts < 2500, :] # type: ignore adata = adata[adata.obs.pct_counts_mt < 5, :].copy() # type: ignore sc.pp.normalize_total(adata, target_sum=1e4) sc.pp.log1p(adata) sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5) adata.raw = adata
[docs] adata = adata[:, adata.var.highly_variable] # type: ignore
sc.pp.regress_out(adata, ["total_counts", "pct_counts_mt"]) sc.pp.scale(adata, max_value=10) sc.tl.pca(adata, svd_solver="arpack") sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40) sc.tl.umap(adata) sc.tl.leiden( adata, resolution=0.9, random_state=0, # flavor="igraph", n_iterations=2, directed=False, ) sc.tl.rank_genes_groups(adata, "leiden", method="t-test") sc.settings.verbosity = 2 # reduce the verbosity sc.tl.rank_genes_groups(adata, "leiden", method="wilcoxon") sc.tl.rank_genes_groups(adata, "leiden", method="logreg", max_iter=1000) # * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
[docs] cells_df = pd.DataFrame(adata.obs)
[docs] pca_np = np.array(adata.obsm["X_pca"])
cells_df["X_pca_1"] = pca_np[:, 0] cells_df["X_pca_2"] = pca_np[:, 1] cells_df["X_pca_3"] = pca_np[:, 2]
[docs] umap_np = np.array(adata.obsm["X_umap"])
cells_df["X_umap_1"] = umap_np[:, 0] cells_df["X_umap_2"] = umap_np[:, 1] cells_df["cell_id"] = adata.obs.index # cells_df["n_cells"] = adata.obs[["Sample", "leiden"]].value_counts().reset_index() """adata.layers['counts'] = adata.X.copy() marker_genes = [ *["IL7R", "CD79A", "MS4A1", "CD8A", "CD8B", "LYZ", "CD14"], *["LGALS3", "S100A8", "GNLY", "NKG7", "KLRB1"], *["FCGR3A", "MS4A7", "FCER1A", "CST3", "PPBP"], ] #cells_df[marker_genes] = adata.var["gene_ids"][adata.var["gene_ids"] == marker_genes] print(adata.uns["rank_genes_groups"]["names"].field(1)) result = adata.uns["rank_genes_groups"] groups = result["names"].dtype.names print(pd.DataFrame( { group + "_" + key[:1]: result[key][group] for group in groups for key in ["names", "scores"] } )) """ assert isinstance(adata.X, np.ndarray) adata.layers["counts"] = adata.X.copy() ## these can be added dynamically with the 'Add Gene Expr' UI in the frontend assert isinstance(adata.layers["counts"], np.ndarray)
[docs] counts = np.array(adata.layers["counts"])
[docs] transpose_counts = np.transpose(counts)
cells_df["ARVCF"] = transpose_counts[adata.var.index.get_loc("ARVCF")] # cells_df['CTB-113I20.2']=transpose_counts[adata.var.index.get_loc("CTB-113I20.2")]) cells_df["DOK3"] = transpose_counts[adata.var.index.get_loc("DOK3")] cells_df["FAM210B"] = transpose_counts[adata.var.index.get_loc("FAM210B")] cells_df["GBGT1"] = transpose_counts[adata.var.index.get_loc("GBGT1")] cells_df["NFE2L2"] = transpose_counts[adata.var.index.get_loc("NFE2L2")] # cells_df['PPBP']=transpose_counts[adata.var.index.get_loc("PPBP")]) cells_df["UBE2D4"] = transpose_counts[adata.var.index.get_loc("UBE2D4")] cells_df["YPEL2"] = transpose_counts[adata.var.index.get_loc("YPEL2")] # * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * # Αdd a second datasource 'genes'
[docs] gene_table = adata.var
gene_table["gene_ids"] = gene_table.index
[docs] varm_np = np.array(adata.varm["PCs"])
gene_table["PCs_1"] = varm_np[:, 0] gene_table["PCs_2"] = varm_np[:, 1] # print(adata.var) # print(gene_table.columns.values.tolist()) # gene_table["PCs_1"] = rna.varm["PCs"][:, 0] # gene_table["PCs_2"] = rna.varm["PCs"][:, 1] # * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * # CELLS GRAPHS # Create a RowChart instance with configuration based on the JSON
[docs] row_chart = RowChart( title="leiden", param="leiden", position=[380, 270], size=[260, 260] )
# Configure the row chart row_chart.set_axis_properties("x", {"textSize": 13, "label": "", "tickfont": 10}) # * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * # Create a DotPlot instance with configuration based on the JSON
[docs] dot_plot = DotPlot( title="leiden", params=["leiden", "ARVCF", "DOK3", "FAM210B", "GBGT1", "NFE2L2", "UBE2D4", "YPEL2"], size=[400, 250], position=[10, 10], )
# Configure the dot plot dot_plot.set_axis_properties("x", {"label": "", "textSize": 13, "tickfont": 10}) dot_plot.set_axis_properties("y", {"label": "", "textSize": 13, "tickfont": 10}) dot_plot.set_axis_properties("ry", {"label": "", "textSize": 13, "tickfont": 10}) dot_plot.set_color_scale(log_scale=False) dot_plot.set_color_legend(True, [40, 10]) dot_plot.set_fraction_legend(True, [0, 0]) # * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
[docs] histogram_plot = HistogramPlot( title="n_genes_by_counts", param="n_genes_by_counts", bin_number=17, display_min=0, display_max=2500, size=[360, 250], position=[10, 280], )
histogram_plot.set_x_axis(size=30, label="n_genes_by_counts", textsize=13, tickfont=10) histogram_plot.set_y_axis( size=45, label="frequency", textsize=13, tickfont=10, rotate_labels=False ) # * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * # Create a ScatterPlot3D instance
[docs] scatter_plot = ScatterPlot3D( title="X_pca_1 x X_pca_2 x X_pca_3", params=["X_pca_1", "X_pca_2", "X_pca_3"], size=[250, 250], position=[420, 10], default_color="#377eb8", brush="default", center=[0, 0, 0], on_filter="hide", radius=5, opacity=0.8, axis_scales=[220, 220, 220], camera={"distance": 37543.999999999956, "theta": -1.038, "phi": 0.261}, # color_by="leiden" )
scatter_plot.set_color_by("leiden") # * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * # Genes graphs # Create a ScatterPlot instance
[docs] scatter2D_plot = ScatterPlot( title="PCs_1 x PCs_2", params=["PCs_1", "PCs_2"], size=[250, 250], position=[270, 10], )
scatter2D_plot.set_default_color("#377eb8") scatter2D_plot.set_brush("poly") scatter2D_plot.set_opacity(0.8) scatter2D_plot.set_radius(2) scatter2D_plot.set_color_legend(display=True, position=[20, 1]) scatter2D_plot.set_axis_properties( "x", {"label": "PCs_1", "textSize": 13, "tickfont": 10} ) scatter2D_plot.set_axis_properties( "y", {"label": "PCs_2", "textSize": 13, "tickfont": 10} ) scatter2D_plot.set_color_by("dispersions") # * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
[docs] table_plot = TablePlot( title="Data table", params=gene_table.columns.values.tolist(), size=[250, 500], position=[10, 10], )
# * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
[docs] histogram_plot_2 = HistogramPlot( title="n_cells", param="n_cells", bin_number=17, display_min=0, display_max=2500, size=[240, 240], position=[270, 270], )
histogram_plot_2.set_x_axis(size=30, label="n_cells", textsize=13, tickfont=10) histogram_plot_2.set_y_axis( size=45, label="frequency", textsize=13, tickfont=10, rotate_labels=False ) # * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * # Set up and serve the MDV project
[docs] base = os.path.expanduser(os.path.join("~", "mdv"))
[docs] project_path = os.path.join(base, "pbmc3k_test")
[docs] p = MDVProject(os.path.expanduser(project_path), delete_existing=True)
# Add data to the project p.add_datasource("cells", cells_df) p.add_datasource("genes", gene_table) # Convert plot data to JSON and setup the project view
[docs] list_charts_cells = []
[docs] list_charts_genes = []
[docs] row_chart_data = json.loads(json.dumps(row_chart.plot_data, indent=2).replace("\\", ""))
[docs] dot_chart_data = json.loads(json.dumps(dot_plot.plot_data, indent=2).replace("\\", ""))
[docs] histogram_chart_data = json.loads( json.dumps(histogram_plot.plot_data, indent=2).replace("\\", "") )
[docs] scatter_chart = json.loads( json.dumps(scatter_plot.plot_data, indent=2).replace("\\", "") )
[docs] table_chart = json.loads(json.dumps(table_plot.plot_data, indent=2).replace("\\", ""))
[docs] scatter2D_chart = json.loads( json.dumps(scatter2D_plot.plot_data, indent=2).replace("\\", "") )
[docs] histogram_chart_data_2 = json.loads( json.dumps(histogram_plot_2.plot_data, indent=2).replace("\\", "") )
list_charts_cells.append(row_chart_data) list_charts_cells.append(dot_chart_data) list_charts_cells.append(histogram_chart_data) list_charts_cells.append(scatter_chart) list_charts_genes.append(table_chart) list_charts_genes.append(scatter2D_chart) list_charts_genes.append(histogram_chart_data_2)
[docs] view_config = { "initialCharts": {"cells": list_charts_cells, "genes": list_charts_genes} }
# link the two datasets p.add_rows_as_columns_link("cells", "genes", "gene_ids", "Gene Expr") # add the gene expression p.add_rows_as_columns_subgroup( "cells", "genes", "gs", adata.X, name="scores", label="Gene Scores" ) p.set_view("default", view_config) p.set_editable(True) p.serve()