Source code for openomics.database.interaction

import logging
from abc import abstractmethod
from typing import List, Dict

import networkx as nx
from Bio import SeqIO

from openomics.database.annotation import *
from openomics.database.base import Database
from openomics.database.sequence import SequenceDatabase


class Interactions(Database):
    """Base class for interaction databases that are exposed as a NetworkX graph.

    Subclasses implement `load_network()`, which reads the files listed in
    `file_resources` and returns a `networkx.Graph`/`DiGraph`. The constructor
    stores that graph in `self.network`.
    """

    def __init__(self, path, file_resources: Dict, source_col_name: str = None, target_col_name: str = None,
                 source_index: str = None, target_index: str = None, edge_attr: List[str] = None, filters: dict = None,
                 directed: bool = True, relabel_nodes: dict = None, verbose: bool = False):
        """
        Validate the file resources, build the interaction network through `load_network()` and
        optionally relabel its nodes.

        Args:
            path (str):
                The folder path containing the data files.
            file_resources (dict):
                A dictionary where keys are required filenames and values are file paths.
                Subclass constructors build this dict themselves when they receive None.
            source_col_name (str):
                Column name of DataFrame to be used as the source node names.
            target_col_name (str):
                Column name of DataFrame to be used as the target node names.
            source_index (str):
                One of {"gene_name", "gene_id", "transcript_name", "transcript_id", "protein_name", "protein_id"}
            target_index (str):
                One of {"gene_name", "gene_id", "transcript_name", "transcript_id", "protein_name", "protein_id"}
            edge_attr (list):
                A list of column names to be included as attributes for each edge (source-target pairs).
            filters (dict):
                Optional. A dict with key matching the data table (from load_network()) columns and values for the
                filtering on that column.
            directed (bool): default True,
                Whether to create a directed or an undirected network.
            relabel_nodes (dict): default None,
                A dictionary to rename nodes in the network, where the nodes with name <dict[key]> will be renamed
                to <dict[value]>
            verbose (bool): default False,
                If True, print a summary of the network once it is built.

        Raises:
            Exception: if `load_network()` does not return a NetworkX graph.
        """
        # This class should NOT call super's __init__()
        # validate_file_resources() is not defined in this class; presumably inherited from Database.
        self.validate_file_resources(path, file_resources, verbose=verbose)

        self.data_path = path
        self.file_resources = file_resources
        self.source_index = source_index
        self.target_index = target_index
        self.network = self.load_network(file_resources=file_resources, source_col_name=source_col_name,
                                         target_col_name=target_col_name,
                                         edge_attr=edge_attr, directed=directed, filters=filters)

        # Validate BEFORE touching the graph, so a bad load_network() produces the
        # intended error message instead of an AssertionError/AttributeError.
        if not isinstance(self.network, nx.Graph):
            raise Exception(
                "Make sure load_network() returns a Networkx Graph and is called with super().__init__() in the constructor.")

        self.network.name = self.name()

        if relabel_nodes is not None:
            self.network = nx.relabel_nodes(self.network, mapping=relabel_nodes)

        self.verbose = verbose
        if verbose:
            self.info()

    def info(self):
        """Print a summary of `self.network`."""
        # NOTE(review): nx.info() was removed in NetworkX 3.0 -- confirm the pinned NetworkX version.
        print("{}".format(nx.info(self.network)))

    @classmethod
    def name(cls):
        """Return the class name, which is also used as the network's name."""
        return cls.__name__

    @abstractmethod
    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters):
        """
        Handles data processing from `file_resources` to a Pandas DataFrame which contain edgelist data, then constructs
        and return a NetworkX Graph.
        Args:
            file_resources: a dict of file name and file path/object
            source_col_name (str): column name of the dataframe for source in the edge
            target_col_name (str): column name of the dataframe for target in the edge
            edge_attr (list): list of str for column data to include in each edge
            directed (bool): True to return a DiGraph(), else Graph()
            filters: A dict of {column name: column values} to filter the dataframe
        Returns:
            network: a NetworkX Graph or DiGraph
        """
        raise NotImplementedError

    def get_interactions(self, nodelist=None, data=False, inclusive=True):
        """
        Fetch edges from the network.

        Args:
            nodelist (list):
                A list of nodes to fetch edges from. If None, all edges are returned.
            data (bool): default False
                Whether to include edge attributes
            inclusive (bool): default True
                If True, only return edges of the subgraph induced by `nodelist` (both endpoints in
                `nodelist`). If False, return every edge incident to a node in `nodelist`.

        Returns:
            edges: a NetworkX edge view

        Raises:
            Exception: if `self.network` has not been assigned yet.
        """
        if not hasattr(self, "network"):
            raise Exception(
                "{} does not have network interaction data yet. Must run load_network() and assign self.network field first.".format(
                    self.name()))

        if nodelist is None:
            return self.network.edges(data=data)

        if inclusive:
            return self.network.subgraph(nodelist).edges(data=data)
        return self.network.edges(nbunch=nodelist, data=data)

    def filter_values(self, df: pd.DataFrame, filters: dict, case=False):
        """
        Keep only the rows of `df` that satisfy every entry of `filters`.

        Args:
            df (pd.DataFrame): the table to filter.
            filters (dict): {column name: wanted value}. A list value keeps rows whose column value is
                in the list; a str value keeps rows whose column contains it (pandas `str.contains`);
                any other value keeps rows equal to it. Keys that are not columns of `df` are logged
                and skipped. If None, `df` is returned unchanged.
            case (bool): default False. Whether list/str matching is case sensitive.

        Returns:
            pd.DataFrame: the filtered table.

        Raises:
            AssertionError: if no rows are left after filtering.
        """
        if filters is None:
            return df

        for key, values in filters.items():
            if key not in df.columns:
                # logging takes %-style arguments, not print-style ones.
                logging.info("Filter key `%s` must be in one of %s", key, df.columns.tolist())
                continue
            n_rows = df.shape[0]

            if isinstance(values, list):
                if case is False:
                    df = df[df[key].str.upper().isin([val.upper() for val in values])]
                else:
                    df = df[df[key].isin(values)]
            elif isinstance(values, str):
                # na=False: rows with a missing value never match, and the mask stays boolean.
                df = df[df[key].str.contains(values, case=case, na=False)]
            else:
                df = df[df[key] == values]

            logging.info("Removed %d rows with `%s` != %s", n_rows - df.shape[0], key, values)

        assert df.shape[0] > 0, "ERROR: Dataframe is empty because of filter: {}".format(filters)
        return df


class GeneMania(Interactions):
    """Interaction network built from the GeneMania combined-network and identifier-mapping files."""

    def __init__(self, path, file_resources=None, source_col_name="Gene_A", target_col_name="Gene_B",
                 source_index="gene_name", target_index="gene_name",
                 edge_attr=None, filters=None, directed=True, relabel_nodes=None):
        """
        Args:
            path (str): folder containing the GeneMania files.
            file_resources (dict): {file name: file path}. If None, the two required files are looked up
                under `path`.
            edge_attr (list): edge attribute columns. Defaults to ["Weight"].
            Remaining arguments are forwarded unchanged to `Interactions.__init__`.
        """
        if edge_attr is None:
            edge_attr = ["Weight"]
        if file_resources is None:
            file_resources = {
                name: os.path.join(path, name)
                for name in ("COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt", "identifier_mappings.txt")
            }

        super(GeneMania, self).__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                                        target_col_name=target_col_name, source_index=source_index,
                                        target_index=target_index,
                                        edge_attr=edge_attr, filters=filters, directed=directed,
                                        relabel_nodes=relabel_nodes)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters):
        """
        Read the combined network table, translate identifiers to gene names and build the graph.

        Returns:
            network: a DiGraph if `directed` else a Graph.
        """
        interactions = pd.read_table(file_resources["COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt"], low_memory=True)
        identifier = pd.read_table(file_resources["identifier_mappings.txt"])

        # Rename ENSG ID's to gene names: map "Preferred_Name" -> "Name" for rows whose Source is "Gene Name".
        identifier = identifier[identifier["Source"] == "Gene Name"]
        id_mapping = pd.Series(identifier["Name"].values, index=identifier["Preferred_Name"]).to_dict()
        interactions.replace(id_mapping, inplace=True)

        # Honour `filters` like the sibling databases do (no-op when filters is None).
        interactions = self.filter_values(interactions, filters)

        # Honour `directed` instead of always building a DiGraph.
        return nx.from_pandas_edgelist(interactions, source=source_col_name,
                                       target=target_col_name,
                                       edge_attr=edge_attr,
                                       create_using=nx.DiGraph() if directed else nx.Graph())


class BioGRID(Interactions):
    """Interaction network built from the BioGRID "tab2" release archive."""

    def __init__(self, path="https://downloads.thebiogrid.org/Download/BioGRID/Latest-Release/",
                 file_resources=None, source_col_name="Official Symbol Interactor A",
                 target_col_name="Official Symbol Interactor B",
                 source_index="gene_name", target_index="gene_name",
                 edge_attr=['Score', 'Throughput', 'Experimental System', 'Experimental System Type'],
                 filters={"Organism Interactor A": 9606}, directed=False, relabel_nodes=None):
        """
        Args:
            path (str): base location of the BioGRID release files.
            file_resources (dict): {file name: file path}. If None, the latest "tab2" archive under
                `path` is used.
            Remaining arguments are forwarded unchanged to `Interactions.__init__`.
        """
        if file_resources is None:
            archive_name = "BIOGRID-ALL-LATEST.tab2.zip"
            file_resources = {archive_name: os.path.join(path, archive_name)}

        super().__init__(path=path, file_resources=file_resources,
                         source_col_name=source_col_name, target_col_name=target_col_name,
                         source_index=source_index, target_index=target_index,
                         edge_attr=edge_attr, directed=directed,
                         relabel_nodes=relabel_nodes, filters=filters)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters):
        """Read the BioGRID table (with "-" as missing value), apply `filters` and build the graph."""
        # All columns are read; no `usecols` restriction is applied.
        table = pd.read_table(file_resources["BIOGRID-ALL-LATEST.tab2.zip"],
                              na_values=["-"],
                              low_memory=True)

        column_names = table.columns.tolist()
        logging.info("{}: {}".format(self.name(), column_names))

        table = self.filter_values(table, filters)

        if directed:
            graph = nx.DiGraph()
        else:
            graph = nx.Graph()
        return nx.from_pandas_edgelist(table, source=source_col_name, target=target_col_name,
                                       edge_attr=edge_attr, create_using=graph)


class STRING(Interactions, SequenceDatabase):
    """STRING protein interaction network, with access to the matching protein sequences."""

    # Renames applied to the columns of the protein info table stored in `self.data`.
    COLUMNS_RENAME_DICT = {
        "protein_external_id": "protein_id",
        "preferred_name": "protein_name",
    }

    def __init__(self, path="https://stringdb-static.org/download/", file_resources=None,
                 species_id="9606",
                 source_col_name="item_id_a", target_col_name="item_id_b", source_index="protein_name",
                 target_index="protein_name",
                 edge_attr=["score"], directed=False,
                 relabel_nodes=None, verbose=False):
        """
        Args:
            path (str): base location of the STRING download files.
            file_resources (dict): {file name: file path}. If None, the v11.0 actions, links, info and
                sequences files for `species_id` are used.
            species_id (str): Required. Must provide species id number to download the correct STRING dataset.
            Remaining arguments are forwarded unchanged to `Interactions.__init__`.
        """
        if file_resources is None:
            file_resources = {}
            file_resources["protein.actions.txt"] = os.path.join(
                path, "protein.actions.v11.0/{}.protein.actions.v11.0.txt.gz".format(species_id))
            file_resources["protein.links.txt"] = os.path.join(
                path, "protein.links.v11.0/{}.protein.links.v11.0.txt.gz".format(species_id))
            file_resources["protein.info.txt"] = os.path.join(
                path, "protein.info.v11.0/{}.protein.info.v11.0.txt.gz".format(species_id))
            file_resources["protein.sequences.fa"] = os.path.join(
                path, "protein.sequences.v11.0/{}.protein.sequences.v11.0.fa.gz".format(species_id))

        super(STRING, self).__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                                     target_col_name=target_col_name,
                                     source_index=source_index, target_index=target_index, edge_attr=edge_attr,
                                     directed=directed, relabel_nodes=relabel_nodes, verbose=verbose)

        # The info table was already read once in load_network(); rewind before reading it again.
        # NOTE(review): this assumes the entry is a file-like object at this point -- confirm
        # against validate_file_resources().
        self.file_resources["protein.info.txt"].seek(0)
        self.data = pd.read_table(file_resources["protein.info.txt"])
        self.data = self.data.reset_index()
        self.data = self.data.rename(columns=self.COLUMNS_RENAME_DICT)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters):
        """
        Build the network from the protein actions table and relabel nodes from protein ids to
        preferred names. Also stores the id -> name mapping in `self.protein_id2name`.
        """
        protein_interactions = pd.read_table(file_resources["protein.actions.txt"], sep="\t", low_memory=True)
        logging.info("{}: {}".format(self.name(), protein_interactions.columns.tolist()))
        protein_info = pd.read_table(file_resources["protein.info.txt"])

        self.protein_id2name = protein_info.set_index("protein_external_id")["preferred_name"].to_dict()
        network = nx.from_pandas_edgelist(protein_interactions, source=source_col_name, target=target_col_name,
                                          edge_attr=edge_attr,
                                          create_using=nx.DiGraph() if directed else nx.Graph())
        network = nx.relabel_nodes(network, self.protein_id2name)
        return network

    def get_sequences(self, index="protein_name", omic=None, agg_sequences=None):
        """
        Return a dict of protein sequences parsed from the FASTA file resource.

        Args:
            index (str): "protein_name" to key the dict by preferred name, or "protein_id" to key it by
                the FASTA record name.
            omic: unused by this implementation.
            agg_sequences: unused by this implementation.

        Returns:
            dict: {key: sequence string}. When several records share a key the last one wins; the
                number of such collisions is logged.

        Raises:
            ValueError: if `index` is not one of the supported values.
        """
        if index not in ("protein_name", "protein_id"):
            raise ValueError("index must be one of 'protein_name' or 'protein_id', got {!r}".format(index))

        already_parsed = hasattr(self, "seq_dict")
        # The cache is only valid for the index it was built with.
        if already_parsed and getattr(self, "_seq_dict_index", index) == index:
            return self.seq_dict

        fasta = self.file_resources["protein.sequences.fa"]
        if already_parsed and hasattr(fasta, "seek"):
            # A previous call consumed the handle; rewind before parsing again.
            fasta.seek(0)

        # Build into a local dict so a failure midway never leaves a partial cache behind.
        sequences = {}
        collisions = 0
        for record in SeqIO.parse(fasta, "fasta"):
            protein_id = str(record.name)
            key = self.protein_id2name[protein_id] if index == "protein_name" else protein_id

            if key in sequences:
                collisions += 1

            sequences[key] = str(record.seq)

        logging.info("Seq {} collisions: {}".format(index, collisions))
        self.seq_dict = sequences
        self._seq_dict_index = index
        return self.seq_dict


class LncBase(Interactions, Database):
    """miRNA -> gene interaction network built from the LncBase v2 download table."""

    def __init__(self, path, file_resources=None, strip_mirna_name=False,
                 source_col_name="mirna", target_col_name="geneId",
                 source_index="transcript_name", target_index="gene_id",
                 edge_attr=None, filters={"species": "Homo sapiens"}, directed=True,
                 relabel_nodes=None, ):
        """
        Args:
            path (str): folder containing "LncBasev2_download.csv".
            file_resources (dict): {file name: file path}. If None, the file is looked up under `path`.
            strip_mirna_name (bool): if True, miRNA names are lower-cased and everything from
                "-3p"/"-5p" onwards is removed.
            edge_attr (list): edge attribute columns. Defaults to ["tissue", "positive_negative"].
            Remaining arguments are forwarded unchanged to `Interactions.__init__`.
        """
        # Must be set before super().__init__(), which calls load_network().
        self.strip_mirna_name = strip_mirna_name

        if edge_attr is None:
            edge_attr = ["tissue", "positive_negative"]
        if file_resources is None:
            file_resources = {}
            file_resources["LncBasev2_download.csv"] = os.path.join(path, "LncBasev2_download.csv")

        super(LncBase, self).__init__(path=path, file_resources=file_resources,
                                      source_col_name=source_col_name,
                                      target_col_name=target_col_name, source_index=source_index,
                                      target_index=target_index,
                                      edge_attr=edge_attr, directed=directed, relabel_nodes=relabel_nodes,
                                      filters=filters)

    def get_rename_dict(self, from_index="geneId", to_index="geneName"):
        """
        Build a {from_index value: to_index value} mapping from the LncBase table.

        Args:
            from_index (str): column whose values become the dict keys.
            to_index (str): column whose values become the dict values.

        Returns:
            dict
        """
        lncbase_df = pd.read_table(self.file_resources["LncBasev2_download.csv"], low_memory=True)
        # Use the requested columns; the defaults reproduce the previous hard-coded mapping.
        return pd.Series(lncbase_df[to_index].values, index=lncbase_df[from_index]).to_dict()

    # NOTE(review): the default target_col_name here is "gene_id" while __init__ passes "geneId" --
    # confirm which column the table actually has before relying on the default.
    def load_network(self, file_resources, source_col_name="mirna", target_col_name="gene_id",
                     edge_attr=None, directed=True, filters=None):
        """Read the LncBase table, normalize species names, apply `filters` and build the graph."""
        if edge_attr is None:
            edge_attr = ["tissue", "positive_negative"]
        df = pd.read_table(file_resources["LncBasev2_download.csv"], low_memory=True)
        logging.info("%s: %s", self.name(), df.columns.tolist())
        df.replace({"species": {"Homo Sapiens": "Homo sapiens", "Mus Musculus": "Mus musculus"}}, inplace=True)

        df = self.filter_values(df, filters)

        if self.strip_mirna_name:
            # Work on a copy so the assignments below do not target a filtered view.
            df = df.copy()
            df['mirna'] = df['mirna'].str.lower()
            # The pattern is a regular expression; say so explicitly because the pandas
            # default for `regex` differs between versions.
            df['mirna'] = df['mirna'].str.replace("-3p.*|-5p.*", "", regex=True)

        return nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                       edge_attr=edge_attr,
                                       create_using=nx.DiGraph() if directed else nx.Graph())


class LncReg(Interactions):
    """Interaction network built from the LncReg "data.xlsx" table (Homo sapiens rows only)."""

    def __init__(self, path, file_resources,
                 source_col_name='A_name_in_paper', target_col_name='B_name_in_paper',
                 source_index="transcript_name", target_index="gene_name",
                 edge_attr=["relationship", "mechanism", "pmid"], filters=None, directed=True, relabel_nodes=None,
                 verbose=False):
        """
        Args:
            path (str): folder containing "data.xlsx".
            file_resources (dict): {file name: file path}. If None, "data.xlsx" is looked up under `path`.
            Remaining arguments are forwarded unchanged to `Interactions.__init__`.
        """
        if file_resources is None:
            file_resources = {}
            file_resources["data.xlsx"] = os.path.join(path, "data.xlsx")

        super(LncReg, self).__init__(path, file_resources=file_resources, source_col_name=source_col_name,
                                     target_col_name=target_col_name, source_index=source_index,
                                     target_index=target_index,
                                     edge_attr=edge_attr, filters=filters,
                                     directed=directed, relabel_nodes=relabel_nodes, verbose=verbose)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters):
        """
        Read the LncReg table, keep Homo sapiens rows, normalize miRNA target names and build the graph.

        Returns:
            network: a DiGraph if `directed` else a Graph.
        """
        df = pd.read_excel(file_resources["data.xlsx"])
        logging.info("%s: %s", self.name(), df.columns.tolist())

        df = df[df["species"] == "Homo sapiens"]
        # Honour `filters` (no-op when None), then copy so the assignment below targets a real frame.
        df = self.filter_values(df, filters).copy()

        # Normalize the miRNA targets: drop the arm suffix, then add the "hsa-" prefixes.
        is_mirna = df["B_category"] == "miRNA"
        mirna_names = df.loc[is_mirna, "B_name_in_paper"]
        mirna_names = mirna_names.str.replace("-3p.*|-5p.*", "", regex=True)
        mirna_names = mirna_names.str.replace("MIR", "hsa-mir-", regex=False)
        mirna_names = mirna_names.str.replace("let-", "hsa-let-", regex=False)
        df.loc[is_mirna, "B_name_in_paper"] = mirna_names

        return nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                       edge_attr=edge_attr,
                                       create_using=nx.DiGraph() if directed else nx.Graph())


class lncRInter(Interactions):
    """Interaction network built from the lncRInter "human_interactions.txt" table."""

    def __init__(self, path, file_resources=None, source_col_name="lncrna",
                 target_col_name='Interacting partner',
                 source_index="gene_name", target_index="gene_name",
                 edge_attr=None, filters=None,
                 directed=True, relabel_nodes=None):
        """
        Args:
            path (str): folder containing "human_interactions.txt".
            file_resources (dict): {file name: file path}. If None, the file is looked up under `path`.
            edge_attr (list): edge attribute columns. Defaults to
                ["Interaction Class", "Interaction Mode", "Tissue", "Phenotype"].
            Remaining arguments are forwarded unchanged to `Interactions.__init__`.
        """
        if edge_attr is None:
            edge_attr = ["Interaction Class", "Interaction Mode", "Tissue", "Phenotype"]
        if file_resources is None:
            file_resources = {}
            file_resources["human_interactions.txt"] = os.path.join(path, "human_interactions.txt")

        super(lncRInter, self).__init__(path, file_resources, source_col_name=source_col_name,
                                        target_col_name=target_col_name,
                                        source_index=source_index,
                                        target_index=target_index,
                                        edge_attr=edge_attr, directed=directed, relabel_nodes=relabel_nodes,
                                        filters=filters)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters):
        """Read the lncRInter table, apply `filters`, clean miRNA partner names and build the graph."""
        partner = "Interacting partner"

        lncRInter_df = pd.read_table(file_resources["human_interactions.txt"])
        logging.info("%s: %s", self.name(), lncRInter_df.columns.tolist())

        # Copy so the .loc assignments below write to a real frame rather than a filtered view.
        lncRInter_df = self.filter_values(lncRInter_df, filters).copy()

        # Data cleaning
        # 1) Lower-case partners containing "MIR". na=False keeps the mask boolean for missing values.
        has_mir = lncRInter_df[partner].str.contains("MIR", na=False)
        lncRInter_df.loc[has_mir, partner] = lncRInter_df.loc[has_mir, partner].str.lower()

        # 2) Add the "hsa-" prefixes. "mirlet" must be handled before the plain "mir".
        lncRInter_df[partner] = lncRInter_df[partner].str.replace("mirlet", "hsa-let-", regex=False)
        lncRInter_df[partner] = lncRInter_df[partner].str.replace("mir", "hsa-mir-", regex=False)

        # 3) Insert a dash before the last character of matching names.
        # NOTE(review): `[mir|let]` is a character class (one of m, i, r, |, l, e, t), not the
        # alternation "mir" or "let" -- pattern kept as-is; confirm the intent before changing it.
        numbered_pattern = r"[mir|let]\-[\d]+[a-z]+[\d]+"
        needs_dash = lncRInter_df[partner].str.contains(numbered_pattern, na=False)
        # Use .loc instead of chained indexing so the values are actually written back.
        lncRInter_df.loc[needs_dash, partner] = lncRInter_df.loc[needs_dash, partner].apply(
            lambda x: x[:-1] + "-" + x[-1])

        return nx.from_pandas_edgelist(lncRInter_df, source=source_col_name,
                                       target=target_col_name,
                                       edge_attr=edge_attr,
                                       create_using=nx.DiGraph() if directed else nx.Graph())


class LncRNA2Target(Interactions):
    """lncRNA -> target gene network built from one of the two lncRNA2Target tables."""

    def __init__(self, path="http://123.59.132.21/lncrna2target/data/", file_resources=None, source_index="gene_name",
                 target_index="gene_name", edge_attr=None, filters={"species_id": 9606, "Species": "Homo sapiens"},
                 directed=True, relabel_nodes=None, version="high_throughput", ):
        """

        Args:
            path (str): base location of the lncRNA2Target files.
            file_resources (dict): {file name: file path}. If None, both tables are looked up under `path`.
            filters (dict): {column name: wanted value}. The default carries one key per table version:
                the species column in high_throughput is formatted in int (e.g. 9606) and in
                low_throughput is in str (e.g. "Homo sapiens"). Keys that are not columns of the
                loaded table are skipped by `filter_values()`.
            version (str): one of ["high_throughput", "low_throughput"].
                The high_throughput version of lncRNA2Target database is v2.0 and low_throughput is v1.0, according to the database's website.
            Remaining arguments are forwarded unchanged to `Interactions.__init__`.

        Raises:
            ValueError: if `version` is not one of the supported values.
        """
        self.version = version
        if file_resources is None:
            file_resources = {}
            file_resources["lncRNA_target_from_high_throughput_experiments.txt"] = os.path.join(
                path, "lncrna_target.rar")
            file_resources["lncRNA_target_from_low_throughput_experiments.xlsx"] = os.path.join(
                path, "lncRNA_target_from_low_throughput_experiments.xlsx")

        # (source column, target column) of each table version.
        edge_columns = {
            "high_throughput": ("lncrna_symbol", "gene_symbol"),
            "low_throughput": ("GENCODE_gene_name", "Target_official_symbol"),
        }
        if version not in edge_columns:
            # Fail loudly instead of returning an object that never built its network.
            raise ValueError("LncRNA2Target version argument must be one of 'high_throughput' or 'low_throughput'")
        source_col_name, target_col_name = edge_columns[version]

        super(LncRNA2Target, self).__init__(path, file_resources, source_col_name=source_col_name,
                                            target_col_name=target_col_name, source_index=source_index,
                                            target_index=target_index,
                                            edge_attr=edge_attr, filters=filters, directed=directed,
                                            relabel_nodes=relabel_nodes)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters):
        """
        Dispatch to the loader matching `self.version`, forwarding `filters`.

        Raises:
            Exception: if `self.version` is not a supported value.
        """
        if self.version == "high_throughput":
            return self.load_network_high_throughput(file_resources, source_col_name, target_col_name, edge_attr,
                                                     directed, filters=filters)
        elif self.version == "low_throughput":
            return self.load_network_low_throughput(file_resources, source_col_name, target_col_name, edge_attr,
                                                    directed, filters=filters)
        else:
            raise Exception("LncRNA2Target version argument must be one of 'high_throughput' or 'low_throughput'")

    def load_network_high_throughput(self, file_resources, source_col_name="lncrna_symbol",
                                     target_col_name="gene_symbol",
                                     edge_attr=None, directed=True, filters=None):
        """Build the graph from the high-throughput table (symbols upper-cased, "LINC" removed from lncRNA symbols)."""
        table = pd.read_table(file_resources["lncRNA_target_from_high_throughput_experiments.txt"], sep="\t")
        # Copy so the column assignments below write to a real frame rather than a filtered view.
        table = self.filter_values(table, filters).copy()
        logging.info("%s: %s", self.name(), table.columns.tolist())

        table["lncrna_symbol"] = table["lncrna_symbol"].str.upper()
        table["lncrna_symbol"] = table["lncrna_symbol"].str.replace("LINC", "", regex=False)
        table["gene_symbol"] = table["gene_symbol"].str.upper()
        return nx.from_pandas_edgelist(table,
                                       source=source_col_name,
                                       target=target_col_name,
                                       edge_attr=edge_attr,
                                       create_using=nx.DiGraph() if directed else nx.Graph())

    def load_network_low_throughput(self, file_resources, source_col_name="GENCODE_gene_name",
                                    target_col_name="Target_official_symbol",
                                    edge_attr=None, directed=True, filters=None):
        """Build the graph from the low-throughput table after normalizing miRNA and gene symbols."""
        table = pd.read_excel(file_resources["lncRNA_target_from_low_throughput_experiments.xlsx"])
        # Copy so the column assignments below write to a real frame rather than a filtered view.
        table = self.filter_values(table, filters).copy()
        logging.info("%s: %s", self.name(), table.columns.tolist())

        table["Target_official_symbol"] = table["Target_official_symbol"].str.replace("(?i)(mir)", "hsa-mir-",
                                                                                      regex=True)
        table["Target_official_symbol"] = table["Target_official_symbol"].str.replace("--", "-", regex=False)
        # miRNA names are lower-cased, every other symbol upper-cased. The result must be
        # assigned back; non-string values (e.g. missing) are left untouched.
        table["Target_official_symbol"] = table["Target_official_symbol"].apply(
            lambda x: x if not isinstance(x, str) else (x.lower() if "mir" in x.lower() else x.upper()))
        table["GENCODE_gene_name"] = table["GENCODE_gene_name"].str.upper()
        return nx.from_pandas_edgelist(table,
                                       source=source_col_name,
                                       target=target_col_name,
                                       edge_attr=edge_attr,
                                       create_using=nx.DiGraph() if directed else nx.Graph())


class lncRNome(Interactions, Database):
    """lncRNA -> miRNA binding-site network built from the lncRNome tables."""

    def __init__(self, path, file_resources, source_col_name='Gene Name', target_col_name='Binding miRNAs',
                 source_index="gene_name", target_index="gene_name",
                 edge_attr=["miRNA Interaction Site", "Transcript ID"], directed=True, relabel_nodes=None,
                 npartitions=0):
        """
        Args:
            path (str): folder containing the lncRNome files.
            file_resources (dict): {file name: file path}. If None, "miRNA_binding_sites.txt" and
                "general_information.txt" are looked up under `path`.
            npartitions (int): accepted for interface compatibility; not used when building the network.
            Remaining arguments are forwarded unchanged to `Interactions.__init__`.
        """
        if file_resources is None:
            file_resources = {}
            file_resources["miRNA_binding_sites.txt"] = os.path.join(path, "miRNA_binding_sites.txt")
            file_resources["general_information.txt"] = os.path.join(path, "general_information.txt")

        # Interactions.__init__ has no `npartitions` parameter, so it must not be forwarded.
        super(lncRNome, self).__init__(path, file_resources=file_resources, source_col_name=source_col_name,
                                       target_col_name=target_col_name, source_index=source_index,
                                       target_index=target_index, edge_attr=edge_attr,
                                       directed=directed, relabel_nodes=relabel_nodes)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters):
        """
        Read the miRNA binding sites table, normalize miRNA names and build the graph.

        Returns:
            network: a DiGraph if `directed` else a Graph.
        """
        df = pd.read_table(self.file_resources["miRNA_binding_sites.txt"], header=0)
        logging.info("%s: %s", self.name(), df.columns.tolist())

        # Lower-case the miRNA names and drop everything from the "-3p"/"-5p" arm suffix onwards.
        df['Binding miRNAs'] = df['Binding miRNAs'].str.lower()
        df['Binding miRNAs'] = df['Binding miRNAs'].str.replace("-3p.*|-5p.*", "", regex=True)

        return nx.from_pandas_edgelist(df, source=source_col_name,
                                       target=target_col_name,
                                       edge_attr=edge_attr,
                                       create_using=nx.DiGraph() if directed else nx.Graph())

    def load_dataframe(self, file_resources, npartitions=None):
        """Return selected columns of "general_information.txt" as a DataFrame (read from `self.file_resources`)."""
        return pd.read_table(self.file_resources["general_information.txt"], header=0,
                             usecols=["Gene Name", "Transcript Name", "Transcript Type", "Location", "Strand"])


class NPInter(Interactions):
    """ncRNA interaction network built from the NPInter v4 interaction table."""

    def __init__(self, path="http://bigdata.ibp.ac.cn/npinter4/download/", file_resources=None,
                 source_col_name='ncName', target_col_name='tarName',
                 source_index="gene_name", target_index="gene_name",
                 edge_attr=["tarType", "tissueOrCell", "tag", "level"],
                 filters=None,
                 directed=True, relabel_nodes=None, verbose=False):
        """
        Args:
            path (str): base location of the NPInter download files.
            file_resources (dict): {file name: file path}. If None, the v4 interaction table under
                `path` is used.
            Remaining arguments are forwarded unchanged to `Interactions.__init__`.
        """
        if file_resources is None:
            file_resources = {}
            file_resources["interaction_NPInterv4.expr.txt"] = os.path.join(
                path, "file/interaction_NPInterv4.expr.txt.gz")

        super(NPInter, self).__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                                      target_col_name=target_col_name, source_index=source_index,
                                      target_index=target_index, edge_attr=edge_attr, filters=filters,
                                      directed=directed,
                                      relabel_nodes=relabel_nodes, verbose=verbose)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters):
        """Read the NPInter table (with "-" as missing value), normalize names, apply `filters` and build the graph."""
        df = pd.read_table(file_resources["interaction_NPInterv4.expr.txt"], header=0, na_values=["-"])
        logging.info("%s: %s", self.name(), df.columns.tolist())

        df["ncName"] = df["ncName"].str.upper()
        # Remove the "LNCRNA-" prefix. str.strip("LNCRNA-") would instead strip any of those
        # characters from both ends of the name (e.g. "NEAT1" -> "EAT1").
        df["ncName"] = df["ncName"].str.replace("^LNCRNA-", "", regex=True)
        df["ncName"] = df["ncName"].str.replace("MALAT-1", "MALAT1", regex=False)
        df["ncName"] = df["ncName"].str.replace("^MIR-", "hsa-mir-", regex=True)
        df["ncName"] = df["ncName"].str.replace("^MICRORNA-", "hsa-mir-", regex=True)

        df["tarName"] = df["tarName"].str.upper()

        df = self.filter_values(df, filters)

        return nx.from_pandas_edgelist(df, source=source_col_name,
                                       target=target_col_name,
                                       edge_attr=edge_attr,
                                       create_using=nx.DiGraph() if directed else nx.Graph())


class StarBase(Interactions):
    """Interaction network loaded from ``starbase_3.0_lncrna_rna_interactions.csv``.

    Rows are kept only when ``interactionNum`` and ``expNum`` reach the
    configured minimums.
    """

    def __init__(self, path, file_resources, source_col_name="geneName", target_col_name="pairGeneName",
                 source_index="gene_name", target_index="gene_name",
                 min_interactionNum=1, min_expNum=1,
                 edge_attr=None, directed=True, relabel_nodes=None, npartitions=0):
        """
        Args:
            path (str): Folder containing the data file.
            file_resources (dict): Mapping of resource name to file path. When
                None, the CSV entry is derived from ``path``.
            source_col_name (str): Column used for source node names.
            target_col_name (str): Column used for target node names.
            source_index (str): Forwarded to the base class.
            target_index (str): Forwarded to the base class.
            min_interactionNum (int): Minimum ``interactionNum`` for a row to be kept.
            min_expNum (int): Minimum ``expNum`` for a row to be kept.
            edge_attr (list): Columns stored as edge attributes. None selects
                ``["interactionNum"]`` in ``load_network``.
            directed (bool): Build a ``DiGraph`` when True, a ``Graph`` otherwise.
            relabel_nodes (dict): Forwarded to the base class.
            npartitions (int): Accepted for backward compatibility; not
                forwarded, because the base constructor has no such parameter.
        """
        if file_resources is None:
            file_resources = {
                "starbase_3.0_lncrna_rna_interactions.csv":
                    os.path.join(path, "starbase_3.0_lncrna_rna_interactions.csv"),
            }
        # Must be set before the base constructor is invoked.
        self.min_interactionNum = min_interactionNum
        self.min_expNum = min_expNum

        # Keyword arguments on purpose: the base signature has ``filters``
        # between ``edge_attr`` and ``directed``, so a positional call shifts
        # ``directed``/``relabel_nodes``/``npartitions`` into the wrong slots.
        super(StarBase, self).__init__(path=path, file_resources=file_resources,
                                       source_col_name=source_col_name,
                                       target_col_name=target_col_name,
                                       source_index=source_index,
                                       target_index=target_index,
                                       edge_attr=edge_attr,
                                       directed=directed,
                                       relabel_nodes=relabel_nodes)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters):
        """Read the CSV, normalise miRNA partner names, threshold rows and build the graph.

        Args:
            file_resources (dict): Not read here; the CSV path is taken from
                ``self.file_resources``.
            source_col_name (str): Column used for source nodes.
            target_col_name (str): Column used for target nodes.
            edge_attr (list): Columns stored as edge attributes; None selects
                ``["interactionNum"]``.
            directed (bool): Build a ``DiGraph`` when True, a ``Graph`` otherwise.
            filters (dict): Not used by this implementation.

        Returns:
            The networkx graph, also stored on ``self.starBase_RNA_RNA_network``.
        """
        df = pd.read_csv(self.file_resources["starbase_3.0_lncrna_rna_interactions.csv"], header=0)

        # Lower-case miRNA partner names and drop everything from the
        # "-3p"/"-5p" arm suffix onwards. ``regex=True`` is explicit because
        # the default of ``Series.str.replace`` differs between pandas versions.
        is_mirna = df["pairGeneType"] == "miRNA"
        mirna_names = df.loc[is_mirna, "pairGeneName"].str.lower()
        mirna_names = mirna_names.str.replace("-3p.*|-5p.*", "", regex=True)
        df.loc[is_mirna, "pairGeneName"] = mirna_names

        df = df[df["interactionNum"] >= self.min_interactionNum]
        df = df[df["expNum"] >= self.min_expNum]

        if edge_attr is None:
            edge_attr = ["interactionNum"]

        self.starBase_RNA_RNA_network = nx.from_pandas_edgelist(
            df, source=source_col_name, target=target_col_name,
            edge_attr=edge_attr,
            create_using=nx.DiGraph() if directed else nx.Graph())
        return self.starBase_RNA_RNA_network


class MiRTarBase(Interactions):
    """Interaction network loaded from the ``miRTarBase_MTI.xlsx`` spreadsheet.

    By default edges go from the ``miRNA`` column to the ``Target Gene`` column,
    carry ``Support Type`` as edge attribute, and rows are filtered to
    ``Species (Target Gene) == "Homo sapiens"``.
    """

    def __init__(self, path="http://mirtarbase.mbc.nctu.edu.tw/cache/download/7.0/", file_resources=None,
                 source_col_name="miRNA", target_col_name="Target Gene",
                 source_index="transcript_name", target_index="gene_name",
                 edge_attr=None, filters={"Species (Target Gene)": "Homo sapiens"}, directed=True, relabel_nodes=None,
                 strip_mirna_name=False):
        """
        Args:
            path (str): Base location of the data files.
            file_resources (dict): Mapping of resource name to file path. When
                None, the ``miRTarBase_MTI.xlsx`` entry is derived from ``path``.
            source_col_name (str): Column used for source node names.
            target_col_name (str): Column used for target node names.
            source_index (str): Forwarded to the base class.
            target_index (str): Forwarded to the base class.
            edge_attr (list): Columns stored as edge attributes. None selects
                ``["Support Type"]``.
            filters (dict): Forwarded to the base class and later to
                ``filter_values`` in ``load_network``.
            directed (bool): Build a ``DiGraph`` when True, a ``Graph`` otherwise.
            relabel_nodes (dict): Forwarded to the base class.
            strip_mirna_name (bool): When True, miRNA names are lower-cased and
                truncated at the "-3p"/"-5p" suffix.
        """
        if edge_attr is None:
            edge_attr = ["Support Type"]
        # The signature default is one dict shared by every call; hand a
        # private copy downstream so nothing can mutate that shared object.
        if filters is not None:
            filters = dict(filters)
        self.strip_mirna_name = strip_mirna_name

        if file_resources is None:
            file_resources = {"miRTarBase_MTI.xlsx": os.path.join(path, "miRTarBase_MTI.xlsx")}

        super(MiRTarBase, self).__init__(path=path, file_resources=file_resources,
                                         source_col_name=source_col_name,
                                         target_col_name=target_col_name, source_index=source_index,
                                         target_index=target_index,
                                         edge_attr=edge_attr, filters=filters, directed=directed,
                                         relabel_nodes=relabel_nodes, )

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters):
        """Read the spreadsheet, filter it and build the miRNA-target graph.

        Args:
            file_resources (dict): Not read here; the spreadsheet path is taken
                from ``self.file_resources``.
            source_col_name (str): Column used for source nodes.
            target_col_name (str): Column used for target nodes.
            edge_attr (list): Columns stored as edge attributes.
            directed (bool): Build a ``DiGraph`` when True, a ``Graph`` otherwise.
            filters (dict): Passed to ``self.filter_values``.

        Returns:
            The networkx graph built from the filtered table.
        """
        df = pd.read_excel(self.file_resources["miRTarBase_MTI.xlsx"])
        # logging treats the first argument as a %-format string, so the
        # placeholders must be spelled out.
        logging.info("%s: %s", self.name(), df.columns.tolist())

        df = self.filter_values(df, filters)

        if self.strip_mirna_name:
            # ``regex=True`` is explicit because the default of
            # ``Series.str.replace`` differs between pandas versions; without
            # it the pattern may be taken literally and never match.
            stripped = df['miRNA'].str.lower()
            df['miRNA'] = stripped.str.replace("-3p.*|-5p.*", "", regex=True)

        return nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                       edge_attr=edge_attr,
                                       create_using=nx.DiGraph() if directed else nx.Graph())


class TargetScan(Interactions, Database):
    """Interaction network built from the ``miR_Family_Info.txt`` and
    ``Predicted_Targets_Info.default_predictions.txt`` tables.

    The family table maps miR families to MiRBase IDs; the predictions table
    maps miR families to gene symbols. The two are joined on the family name.
    """

    def __init__(self, path, file_resources=None, source_col_name="MiRBase ID", target_col_name="Gene Symbol",
                 source_index="transcript_name", target_index="transcript_name",
                 edge_attr=None, directed=True, relabel_nodes=None, species=9606,
                 strip_mirna_name=False):
        """
        Args:
            path (str): Folder containing the data files.
            file_resources (dict): Mapping of resource name to file path. When
                None, both entries are derived from ``path``.
            source_col_name (str): Column used for source node names.
            target_col_name (str): Column used for target node names.
            source_index (str): Forwarded to the base class.
            target_index (str): Forwarded to the base class.
            edge_attr (list): Columns stored as edge attributes. None selects
                ``["tissue", "positive_negative"]``.
            directed (bool): Build a ``DiGraph`` when True, a ``Graph`` otherwise.
            relabel_nodes (dict): Forwarded to the base class.
            species (int): Value matched against the ``Species ID`` column; a
                falsy value disables the species filter.
            strip_mirna_name (bool): When True, MiRBase IDs are lower-cased and
                truncated at the "-3p"/"-5p" suffix.
        """
        # NOTE(review): the table produced by ``process_interactions_table``
        # only carries "miR Family", "Gene Symbol" and "MiRBase ID"; the
        # default names below are not among them - confirm the intended
        # edge attributes.
        if edge_attr is None:
            edge_attr = ["tissue", "positive_negative"]
        self.strip_mirna_name = strip_mirna_name
        self.species = species
        if file_resources is None:
            file_resources = {
                "miR_Family_Info.txt": os.path.join(path, "miR_Family_Info.txt"),
                "Predicted_Targets_Info.default_predictions.txt":
                    os.path.join(path, "Predicted_Targets_Info.default_predictions.txt"),
            }

        super(TargetScan, self).__init__(path=path, file_resources=file_resources,
                                         source_col_name=source_col_name,
                                         target_col_name=target_col_name, source_index=source_index,
                                         target_index=target_index,
                                         edge_attr=edge_attr, directed=directed, relabel_nodes=relabel_nodes)

    def load_network(self, file_resources, source_col_name, target_col_name,
                     edge_attr, directed, filters):
        """Load both tables, join them and build the miRNA-target graph.

        Args:
            file_resources (dict): Must contain both TargetScan resources.
            source_col_name (str): Column used for source nodes.
            target_col_name (str): Column used for target nodes.
            edge_attr (list): Columns stored as edge attributes.
            directed (bool): Build a ``DiGraph`` when True, a ``Graph`` otherwise.
            filters (dict): Not used by this implementation.

        Returns:
            The networkx graph built from the joined table. The processed
            family table is kept on ``self.df``.
        """
        self.df = self.process_miR_family_info_table(file_resources, self.species)
        interactions_df = self.process_interactions_table(file_resources, self.df, self.species)
        # logging treats the first argument as a %-format string, so the
        # placeholders must be spelled out.
        logging.info("%s: %s", self.name(), interactions_df.columns.tolist())

        return nx.from_pandas_edgelist(interactions_df,
                                       source=source_col_name, target=target_col_name,
                                       edge_attr=edge_attr,
                                       create_using=nx.DiGraph() if directed else nx.Graph())

    def process_miR_family_info_table(self, file_resources, species=None):
        """Read ``miR_Family_Info.txt`` and reduce it to the family/miRNA columns.

        Args:
            file_resources (dict): Must contain ``miR_Family_Info.txt``.
            species: Value matched against ``Species ID``; falsy disables the filter.

        Returns:
            A de-duplicated DataFrame holding whichever of 'miR family',
            'MiRBase ID', 'Seed+m8', 'Mature sequence', 'Family Conservation?'
            and 'MiRBase Accession' exist, with 'MiRBase ID' cast to ``str``.
        """
        family_info = pd.read_table(file_resources["miR_Family_Info.txt"], delimiter='\t')

        if species:
            family_info = family_info[family_info['Species ID'] == species]

        # Work on an independent frame so the column assignment below never
        # targets a slice of the table as read.
        family_info = family_info.copy()

        # Standardize MiRBase ID to miRNA names obtained from RNA-seq hg19
        if self.strip_mirna_name:
            lowered = family_info['MiRBase ID'].str.lower()
            # ``regex=True`` is explicit: the default of ``Series.str.replace``
            # differs between pandas versions.
            family_info['MiRBase ID'] = lowered.str.replace("-3p.*|-5p.*", "", regex=True)

        family_info = family_info.drop_duplicates()
        family_info = family_info.filter(items=['miR family', 'MiRBase ID', 'Seed+m8', 'Mature sequence',
                                                'Family Conservation?', 'MiRBase Accession'],
                                         axis="columns")
        family_info['MiRBase ID'] = family_info['MiRBase ID'].astype(str)
        return family_info

    def process_interactions_table(self, file_resources, family_to_miR_df, species):
        """Join the miR-family/target predictions with the miR-family/MiRBase-ID map.

        Args:
            file_resources (dict): Must contain
                ``Predicted_Targets_Info.default_predictions.txt``.
            family_to_miR_df (DataFrame): Table with 'miR family' and
                'MiRBase ID' columns; it is not modified.
            species: Value matched against ``Species ID``; falsy disables the filter.

        Returns:
            A DataFrame with 'miR Family', 'Gene Symbol' and 'MiRBase ID'
            columns, produced by an outer join on the family name.
        """
        # Load data frame from file
        family_interactions = pd.read_table(file_resources["Predicted_Targets_Info.default_predictions.txt"],
                                            delimiter='\t', low_memory=True)

        # Keep only the miRNA-target pairs of the requested species
        if species:
            family_interactions = family_interactions[family_interactions["Species ID"] == species]

        family_interactions = family_interactions.filter(items=["miR Family", "Gene Symbol"], axis="columns")
        family_to_mir = family_to_miR_df.filter(items=['miR family', 'MiRBase ID'], axis="columns")
        # The two source tables spell the family column differently; align them.
        family_to_mir = family_to_mir.rename(columns={'miR family': 'miR Family'})

        # Map miRBase IDs onto each family/target row with an index-on-index
        # outer join; ``reset_index`` then restores "miR Family" as a column.
        family_to_mir = family_to_mir.set_index("miR Family")
        family_interactions = family_interactions.set_index("miR Family")
        mir_interactions = family_interactions.join(family_to_mir, how='outer').reset_index()

        # Standardize MiRBase ID to miRNA names obtained from RNA-seq hg19
        if self.strip_mirna_name:
            lowered = mir_interactions['MiRBase ID'].str.lower()
            mir_interactions['MiRBase ID'] = lowered.str.replace("-3p.*|-5p.*", "", regex=True)

        return mir_interactions