Source code for cellrefiner.preprocessing._ligand_receptor_database

import numpy as np
import pandas as pd
from anndata import AnnData
import pkgutil
import io
from typing import Optional, Union, List, Iterable


[docs]
def ligand_receptor_database(
    database: str = "CellChat",
    species: str = "mouse",
    signaling_types: Optional[Union[str, Iterable[str]]] = (
        "Secreted Signaling",
        "Cell-Cell Contact",
        "ECM-Receptor",
    ),
):
    """
    Extract ligand-receptor pairs from LR database.

    Parameters
    ----------
    database
        The name of the ligand-receptor database. Use 'CellChat' for CellChatDB [Jin2021]_ or 'CellPhoneDB' for CellPhoneDB_v4.0 [Efremova2020]_.
    species
        The species of the ligand-receptor pairs. Choose between 'mouse' and 'human'.
        'zebrafish' also passes validation; whether a data file is packaged for it depends on the installation.
    signaling_types
        The type(s) of signaling, given as a single string or an iterable of strings.
        Choose from 'Secreted Signaling', 'Cell-Cell Contact', and 'ECM-Receptor' for CellChatDB or 'Secreted Signaling' and 'Cell-Cell Contact' for CellPhoneDB_v4.0.
        The selected pairs are returned grouped in the order in which the types are listed.
        If None, all pairs in the database are returned in the order of the database file.

    Returns
    -------
    df_ligrec : pandas.DataFrame
        A pandas DataFrame of the LR pairs with the columns 'Ligand', 'Receptor', 'Pathway', 'Type',
        and 'interaction_name' (the ligand and receptor names joined by '_').

    Raises
    ------
    ValueError
        If ``database``, ``species`` or any entry of ``signaling_types`` is not one of the supported values.
    FileNotFoundError
        If the packaged database file cannot be located.

    References
    ----------

    .. [Jin2021] Jin, S., Guerrero-Juarez, C. F., Zhang, L., Chang, I., Ramos, R., Kuan, C. H., ... & Nie, Q. (2021). 
        Inference and analysis of cell-cell communication using CellChat. Nature communications, 12(1), 1-20.
    .. [Efremova2020] Efremova, M., Vento-Tormo, M., Teichmann, S. A., & Vento-Tormo, R. (2020). 
        CellPhoneDB: inferring cell–cell communication from combined expression of multi-subunit ligand–receptor complexes. Nature protocols, 15(4), 1484-1506.

    """
    resource_templates = {
        "CellChat": "_data/LRdatabase/CellChatDB.ligrec.{}.csv",
        "CellPhoneDB": "_data/LRdatabase/CellPhoneDBv4.0.{}.csv",
    }
    valid_species = ("mouse", "human", "zebrafish")
    valid_types = ("Secreted Signaling", "Cell-Cell Contact", "ECM-Receptor")

    # Validate with explicit exceptions: `assert` statements vanish under `python -O`.
    if database not in resource_templates:
        raise ValueError(
            f"database must be one of {tuple(resource_templates)}, got {database!r}"
        )
    if species not in valid_species:
        raise ValueError(f"species must be one of {valid_species}, got {species!r}")

    if signaling_types is None:
        selected_types = None
    elif isinstance(signaling_types, str):
        selected_types = [signaling_types]
    else:
        # Materialize once so that one-shot iterables (e.g. generators) are supported.
        selected_types = list(signaling_types)
    if selected_types is not None:
        unknown_types = [t for t in selected_types if t not in valid_types]
        if unknown_types:
            raise ValueError(
                f"signaling_types must be chosen from {valid_types}, got {unknown_types!r}"
            )

    # The file depends only on (database, species), so load and parse it a single time.
    resource = resource_templates[database].format(species)
    data = pkgutil.get_data(__name__, resource)
    if data is None:
        raise FileNotFoundError(f"Could not load packaged resource {resource!r}")
    df_all = pd.read_csv(io.BytesIO(data), index_col=0)

    if selected_types is None:
        df_ligrec = df_all.copy()
    elif selected_types:
        # The fourth column holds the signaling type of each pair.
        type_column = df_all.iloc[:, 3]
        df_ligrec = pd.concat([df_all[type_column == t] for t in selected_types])
    else:
        # No type requested: return an empty table with the expected columns.
        df_ligrec = df_all.iloc[:0].copy()

    df_ligrec.columns = ['Ligand', 'Receptor', 'Pathway', 'Type']
    df_ligrec['interaction_name'] = df_ligrec['Ligand'] + '_' + df_ligrec['Receptor']
    return df_ligrec



[docs]
def filter_lr_database(
    df_ligrec: pd.DataFrame,
    adata: AnnData,
    heteromeric: bool = True,
    heteromeric_delimiter: str = "_",
    heteromeric_rule: str = "min",
    filter_criteria: str = "min_cell_pct",
    min_cell: int = 100,
    min_cell_pct: float = 0.05
):
    """
    Filter ligand-receptor pairs.

    A pair is kept only when both its ligand and its receptor are detected
    (expression > 0) in enough cells of ``adata``.

    Parameters
    ----------
    df_ligrec
        The pandas dataframe of ligand-receptor database whose first two columns are the ligand and the receptor. Any further columns (e.g. pathway name) are carried through unchanged.
    adata
        The AnnData object of gene expression. Unscaled data (minimum being zero) is expected.
    heteromeric
        Whether the ligands and receptors are described as heteromeric.
        If False, every ligand/receptor name is looked up as a single gene.
    heteromeric_delimiter
        If heteromeric notations are used for ligands and receptors, the character separating the heteromeric units.
    heteromeric_rule
        When heteromeric is True, the rule to quantify the level of a heteromeric ligand or receptor. Choose from minimum ('min') and average ('ave').
        With 'min' every unit has to pass the threshold; with 'ave' the mean number of expressing cells over the units has to pass it.
    filter_criteria
        Use either cell percentage ('min_cell_pct') or cell numbers ('min_cell') to filter genes.
    min_cell
        If filter_criteria is 'min_cell', the LR-pairs with both ligand and receptor detected in greater than or equal to min_cell cells are kept.
    min_cell_pct
        If filter_criteria is 'min_cell_pct', the LR-pairs with both ligand and receptor detected in greater than or equal to this fraction of cells are kept (0.05 means 5% of the cells).

    Returns
    -------
    df_ligrec_filtered: pd.DataFrame
        A pandas DataFrame of the filtered ligand-receptor pairs, with the same columns and row order as ``df_ligrec`` and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``filter_criteria`` is not 'min_cell_pct' or 'min_cell', or if ``heteromeric`` is True and
        ``heteromeric_rule`` is not 'min' or 'ave'. Such values would otherwise silently disable the filter.

    """
    if filter_criteria not in ("min_cell_pct", "min_cell"):
        raise ValueError(
            f"filter_criteria must be 'min_cell_pct' or 'min_cell', got {filter_criteria!r}"
        )
    if heteromeric and heteromeric_rule not in ("min", "ave"):
        raise ValueError(
            f"heteromeric_rule must be 'min' or 'ave', got {heteromeric_rule!r}"
        )

    ncell = adata.shape[0]
    # Number of cells with non-zero expression per gene; np.array(...) also flattens
    # the matrix type returned when adata.X is sparse.
    gene_ncell = np.array((adata.X > 0).sum(axis=0)).reshape(-1)
    # O(1) lookups instead of repeated list.index(); setdefault keeps the first
    # occurrence of a duplicated gene name, as list.index() would.
    ncell_by_gene = {}
    for gene, count in zip(adata.var_names, gene_ncell):
        ncell_by_gene.setdefault(gene, count)

    def is_detected(count):
        # Apply the selected threshold to a (possibly averaged) expressing-cell count.
        if filter_criteria == "min_cell_pct":
            return count / ncell >= min_cell_pct
        return count >= min_cell

    ligands = df_ligrec.iloc[:, 0]
    receptors = df_ligrec.iloc[:, 1]
    genes_keep = set()
    for name in set(ligands).union(receptors):
        if heteromeric and isinstance(name, str):
            units = name.split(heteromeric_delimiter)
        else:
            units = [name]
        # Every unit has to be measured in adata, otherwise the name cannot be evaluated.
        if any(unit not in ncell_by_gene for unit in units):
            continue
        counts = [ncell_by_gene[unit] for unit in units]
        if heteromeric and heteromeric_rule == "ave":
            keep = is_detected(np.mean(counts))
        else:
            keep = all(is_detected(count) for count in counts)
        if keep:
            genes_keep.add(name)

    # Positional boolean mask so that a non-unique index on df_ligrec is harmless.
    mask = (ligands.isin(genes_keep) & receptors.isin(genes_keep)).to_numpy()
    return df_ligrec[mask].reset_index(drop=True)