Source code for cellrefiner.preprocessing._ligand_receptor_database

import numpy as np
import pandas as pd
from anndata import AnnData
import pkgutil
import io
from typing import Optional, Union, List, Iterable

def ligand_receptor_database(
    database: str = "CellChat",
    species: str = "mouse",
    signaling_types: Optional[Union[str, Iterable[str]]] = (
        "Secreted Signaling",
        "Cell-Cell Contact",
        "ECM-Receptor",
    ),
):
    """
    Extract ligand-receptor pairs from LR database.

    Parameters
    ----------
    database
        The name of the ligand-receptor database.
        Use 'CellChat' for CellChatDB [Jin2021]_ or 'CellPhoneDB' for
        CellPhoneDB_v4.0 [Efremova2020]_.
    species
        The species of the ligand-receptor pairs. One of 'mouse', 'human'
        or 'zebrafish'. The table is read from the packaged file
        ``_data/LRdatabase/<database prefix>.<species>.csv``.
        NOTE(review): not every database/species combination is necessarily
        shipped with the package -- confirm against the bundled data files.
    signaling_types
        The type(s) of signaling to keep, as a single name or an iterable of
        names. Choose from 'Secreted Signaling', 'Cell-Cell Contact', and
        'ECM-Receptor' for CellChatDB or 'Secreted Signaling' and
        'Cell-Cell Contact' for CellPhoneDB_v4.0.
        If None, all pairs in the database are returned.
        An empty iterable yields an empty DataFrame.

    Returns
    -------
    df_ligrec : pandas.DataFrame
        A pandas DataFrame of the LR pairs with the columns 'Ligand',
        'Receptor', 'Pathway', 'Type' and 'interaction_name'
        (``Ligand + '_' + Receptor``). Rows are grouped by signaling type in
        the order the types were requested.

    Raises
    ------
    AssertionError
        If ``database``, ``species`` or any signaling type is not one of the
        accepted names.
    FileNotFoundError
        If the packaged database file cannot be located.

    References
    ----------

    .. [Jin2021] Jin, S., Guerrero-Juarez, C. F., Zhang, L., Chang, I., Ramos, R., Kuan, C. H., ... & Nie, Q. (2021).
        Inference and analysis of cell-cell communication using CellChat. Nature communications, 12(1), 1-20.
    .. [Efremova2020] Efremova, M., Vento-Tormo, M., Teichmann, S. A., & Vento-Tormo, R. (2020).
        CellPhoneDB: inferring cell–cell communication from combined expression of multi-subunit
        ligand–receptor complexes. Nature protocols, 15(4), 1484-1506.

    """
    valid_types = ("Secreted Signaling", "Cell-Cell Contact", "ECM-Receptor")

    assert database in ("CellChat", "CellPhoneDB"), \
        "database must be 'CellChat' or 'CellPhoneDB'"
    assert species in ("mouse", "human", "zebrafish"), \
        "species must be 'mouse', 'human' or 'zebrafish'"

    # Validate every requested type before touching the data file so that a
    # bad argument fails fast and independently of what is on disk.
    if signaling_types is None:
        wanted_types = None
    else:
        if isinstance(signaling_types, str):
            signaling_types = [signaling_types]
        wanted_types = list(signaling_types)
        for signaling_type in wanted_types:
            assert signaling_type in valid_types, \
                "signaling type must be one of " + ", ".join(valid_types)

    prefix = "CellChatDB.ligrec" if database == "CellChat" else "CellPhoneDBv4.0"
    resource = f"_data/LRdatabase/{prefix}.{species}.csv"
    data = pkgutil.get_data(__name__, resource)
    if data is None:
        # pkgutil.get_data returns None when the loader cannot serve data.
        raise FileNotFoundError(f"Cannot load packaged resource '{resource}'.")

    # Parse the table once; it does not depend on the signaling type.
    df_all = pd.read_csv(io.BytesIO(data), index_col=0)

    if wanted_types is None:
        df_ligrec = df_all.copy()
    elif wanted_types:
        # The fourth column holds the signaling type of each pair.
        type_column = df_all.iloc[:, 3]
        df_ligrec = pd.concat(
            [df_all[type_column == signaling_type] for signaling_type in wanted_types]
        )
    else:
        df_ligrec = df_all.iloc[0:0].copy()

    df_ligrec.columns = ['Ligand', 'Receptor', 'Pathway', 'Type']
    df_ligrec['interaction_name'] = df_ligrec['Ligand'] + '_' + df_ligrec['Receptor']
    return df_ligrec
def filter_lr_database(
    df_ligrec: pd.DataFrame,
    adata: "AnnData",
    heteromeric: bool = True,
    heteromeric_delimiter: str = "_",
    heteromeric_rule: str = "min",
    filter_criteria: str = "min_cell_pct",
    min_cell: int = 100,
    min_cell_pct: float = 0.05,
):
    """
    Filter ligand-receptor pairs.

    A pair is kept only when both its ligand and its receptor are present in
    ``adata.var_names`` and detected (expression > 0) in enough cells.

    Parameters
    ----------
    df_ligrec
        The pandas dataframe of ligand-receptor database. The first column is
        read as the ligand and the second as the receptor; any further columns
        are carried through unchanged.
    adata
        The AnnData object of gene expression.
        Unscaled data (minimum being zero) is expected.
    heteromeric
        Whether the ligands and receptors are described as heteromeric.
        If False, every ligand/receptor name is looked up as a single gene.
    heteromeric_delimiter
        If heteromeric notations are used for ligands and receptors,
        the character separating the heteromeric units.
    heteromeric_rule
        When heteromeric is True, the rule to quantify the level of a
        heteromeric ligand or receptor.
        Choose from minimum ('min') and average ('ave').
    filter_criteria
        Use either cell percentage ('min_cell_pct') or cell numbers
        ('min_cell') to filter genes. The misspelling 'min_cell_prc' is
        accepted as an alias of 'min_cell_pct'.
    min_cell
        If filter_criteria is 'min_cell', the LR-pairs with both ligand and
        receptor detected in greater than or equal to min_cell cells are kept.
    min_cell_pct
        If filter_criteria is 'min_cell_pct', the LR-pairs with both ligand
        and receptor detected in greater than or equal to this fraction of
        cells are kept (the detected cell count divided by the total number
        of cells is compared to it, e.g. 0.05 means 5% of cells).

    Returns
    -------
    df_ligrec_filtered: pd.DataFrame
        A pandas DataFrame of the filtered ligand-receptor pairs, with the
        same columns as ``df_ligrec`` and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``filter_criteria`` is not recognised, or if ``heteromeric`` is
        True and ``heteromeric_rule`` is neither 'min' nor 'ave'.

    """
    # Accept the historical misspelling so callers that pass it keep working.
    if filter_criteria == "min_cell_prc":
        filter_criteria = "min_cell_pct"
    if filter_criteria not in ("min_cell_pct", "min_cell"):
        raise ValueError(
            "filter_criteria must be 'min_cell_pct' or 'min_cell', "
            f"got {filter_criteria!r}"
        )
    if heteromeric and heteromeric_rule not in ("min", "ave"):
        raise ValueError(
            f"heteromeric_rule must be 'min' or 'ave', got {heteromeric_rule!r}"
        )

    ncell = adata.shape[0]
    # Number of cells with non-zero expression per gene; np.array(...).reshape
    # flattens the 2-D matrix that sparse inputs return from .sum(axis=0).
    gene_ncell = np.array((adata.X > 0).sum(axis=0)).reshape(-1)

    # Name -> detected-cell count. setdefault keeps the first occurrence if a
    # gene name is duplicated in var_names.
    gene_count = {}
    for gene, count in zip(adata.var_names, gene_ncell):
        gene_count.setdefault(gene, count)

    def _is_detected(units):
        # Summarise the subunit counts, then compare against the threshold.
        counts = [gene_count[unit] for unit in units]
        if heteromeric and heteromeric_rule == "ave":
            level = np.mean(counts)
        else:
            level = min(counts)
        if filter_criteria == "min_cell":
            return level >= min_cell
        # With no cells at all the detected fraction is taken to be zero
        # rather than the NaN that 0/0 would produce.
        fraction = level / ncell if ncell else 0.0
        return fraction >= min_cell_pct

    ligands = df_ligrec.iloc[:, 0]
    receptors = df_ligrec.iloc[:, 1]

    names_keep = set()
    for name in set(ligands).union(receptors):
        units = name.split(heteromeric_delimiter) if heteromeric else [name]
        # Every subunit must exist in the data before it can be quantified.
        if all(unit in gene_count for unit in units) and _is_detected(units):
            names_keep.add(name)

    mask = ligands.isin(names_keep).to_numpy() & receptors.isin(names_keep).to_numpy()
    return df_ligrec[mask].reset_index(drop=True)