Source code for openomics.database.interaction

import copy
import os
from abc import abstractmethod
from collections.abc import Iterable
from typing import List, Dict, Any, Union, Optional

import dask.dataframe as dd
import networkx as nx
import pandas as pd
import scipy.sparse as ssp
from Bio import SeqIO
from logzero import logger
from pandas.core.dtypes.common import is_numeric_dtype

from openomics.database.base import Database
from openomics.database.sequence import SequenceDatabase, UniProt
from openomics.transforms.df import filter_rows

# Names exported by `from openomics.database.interaction import *`.
# NOTE(review): the `Interactions` base class defined below is not listed here — confirm that is intentional.
__all__ = ['STRING', 'GeneMania', 'IntAct', 'BioGRID', 'MiRTarBase', 'LncBase', 'TargetScan', 'TarBase',
           'LncReg', 'LncRNA2Target', 'lncRNome', 'NPInter', 'RNAInter', 'StarBase']

class Interactions(Database):
    """Base class for databases that expose a network of pairwise interactions.

    Subclasses implement `load_network()`; the constructor calls it and stores the returned NetworkX graph in
    `self.network`.
    """
    # Edge list table; subclasses that keep one assign it while loading.
    edges: Optional[Union[pd.DataFrame, dd.DataFrame]]

    def __init__(self, path, file_resources: Dict, source_col_name: str = None, target_col_name: str = None,
                 edge_attr: List[str] = None, filters: Union[str, Dict[str, Union[str, List[str]]]] = None,
                 directed: bool = True, relabel_nodes: dict = None, blocksize=None, **kwargs):
        """
        Store the network options, initialize the underlying `Database`, then build `self.network` by calling
        `load_network()`.

        Args:
            path (str):
                The folder path containing the data files.
            file_resources (dict):
                Used to list required files for load_network of the dataset. A dictionary where keys are required
                filenames and value are file paths. If None, then the subclass constructor should automatically
                build the required file resources dict.
            source_col_name (str):
                Column name of DataFrame to be used as the source node names.
            target_col_name (str):
                Column name of DataFrame to be used as the target node names.
            edge_attr (list):
                A list of column names to be included as attributes for each edge (source-target pairs).
            filters (dict):
                Optional. A dict with key matching the data table (from load_network()) columns and values for the
                filtering on that column.
            directed (bool): default True,
                Whether to create a directed or an undirected network.
            relabel_nodes (dict): default None,
                A dictionary to rename nodes in the network, where a node named by a key of the dict is renamed to
                the corresponding value.
            blocksize ():
                Forwarded unchanged to `Database.__init__()` and to `load_network()`.
            **kwargs:
                Forwarded unchanged to `Database.__init__()`.
        """
        # These attributes are assigned before the base constructor runs, so that anything it calls can read them.
        self.filters = filters
        self.source_col_name = source_col_name
        self.target_col_name = target_col_name
        self.directed = directed
        self.edge_attr = edge_attr

        super().__init__(path=path, file_resources=file_resources, blocksize=blocksize, **kwargs)
        self.network = self.load_network(file_resources=self.file_resources, source_col_name=source_col_name,
                                         target_col_name=target_col_name, edge_attr=edge_attr, directed=directed,
                                         filters=filters, blocksize=blocksize)

        if relabel_nodes is not None:
            self.network = nx.relabel_nodes(self.network, mapping=relabel_nodes)

        self.close()

    @classmethod
    def name(cls):
        """Return the name of the class, used to identify the database in messages."""
        return cls.__name__

    @abstractmethod
    def load_network(self, file_resources: Dict, source_col_name: str, target_col_name: str,
                     edge_attr: Union[str, List[str]], directed: bool, filters: Dict[str, Any], blocksize=None) \
        -> nx.Graph:
        """
        Handles data processing from `file_resources` to a Pandas DataFrame which contain edgelist data, then constructs
        and return a NetworkX Graph.

        Args:
            file_resources: a dict of file name and file path/object
            source_col_name (str): column name of the dataframe for source in the edge
            target_col_name (str): column name of the dataframe for target in the edge
            edge_attr (list): list of str for column data to include in each edge
            directed (bool): True to return a DiGraph(), else Graph()
            filters: A dict of {column name: column values} to filter the dataframe
            blocksize ():

        Returns:
            network: a NetworkX Graph or DiGraph
        """
        raise NotImplementedError

    def get_interactions(self, nodelist=None, data=False, inclusive=True, relabel_nodes: Dict[str, str] = None):
        """
        Fetch edges from the loaded network.

        Args:
            nodelist (list):
                A list of nodes to fetch edges from. If None, all edges are returned.
            data (bool): default False
                Whether to include edge attributes
            inclusive (bool): default True
                If True, only retrieve edges whose both endpoints are in `nodelist` (edges of the induced subgraph).
                If False, retrieve every edge incident to a node in `nodelist`.
            relabel_nodes (dict): default None
                Optional mapping of node names applied before the edges are selected, so `nodelist` must use the
                relabeled names. The stored `self.network` is left untouched.

        Returns:
            edges (OutEdgeView): a NetworkX edgelist

        Raises:
            Exception: if `self.network` has not been assigned yet.
        """
        if not hasattr(self, "network"):
            raise Exception(
                "{} does not have network interaction data yet. Must run load_network() and assign self.network field first.".format(
                    self.name()))

        g = self.network
        if relabel_nodes:
            # copy=True: relabeling in place would permanently rename the nodes of `self.network`, so every later
            # query (with or without `relabel_nodes`) would see the renamed graph.
            g = nx.relabel_nodes(g, relabel_nodes, copy=True)

        if nodelist is None:
            return g.edges(data=data)

        if inclusive:
            return g.subgraph(nodelist).edges(data=data)

        return g.edges(nbunch=nodelist, data=data)


class STRING(Interactions, SequenceDatabase):
    """Loads the STRING database from https://string-db.org/ .

    Default path: "https://stringdb-static.org/download/" .
    Default file_resources, one set per species_id: {
        "{species_id}.protein.info.txt.gz": "protein.info.{version}/{species_id}.protein.info.{version}.txt.gz",
        "{species_id}.protein.links.txt.gz": "protein.links.{version}/{species_id}.protein.links.{version}.txt.gz",
        "{species_id}.protein.links.detailed.txt.gz":
            "protein.links.detailed.{version}/{species_id}.protein.links.detailed.{version}.txt.gz",
        "{species_id}.protein.homology.txt.gz":
            "protein.homology.{version}/{species_id}.protein.homology.{version}.txt.gz",
        "{species_id}.clusters.proteins.txt.gz":
            "clusters.proteins.{version}/{species_id}.clusters.proteins.{version}.txt.gz",
        "{species_id}.protein.aliases.txt.gz":
            "protein.aliases.{version}/{species_id}.protein.aliases.{version}.txt.gz",
        "{species_id}.enrichment.terms.txt.gz":
            "enrichment.terms.{version}/{species_id}.enrichment.terms.{version}.txt.gz",
        "{species_id}.protein.sequences.fa.gz":
            "protein.sequences.{version}/{species_id}.protein.sequences.{version}.fa.gz"
    }

    Edge attributes for protein.actions.txt include ["mode", 'action', 'is_directional', 'a_is_acting', "score"]
    Edge attributes for protein.links.txt include ["combined_score"]
    """
    COLUMNS_RENAME_DICT = {
        "#string_protein_id": "string_protein_id",
        "protein_external_id": "protein_id",
        "preferred_name": "gene_name",
        '#ncbi_taxid': 'species_id',
        'string_protein_id_2': 'homologous_protein_id',
    }

    def __init__(self, path="https://stringdb-static.org/download/", file_resources=None,
                 species_id: Union[str, List[str]] = "9606", version="v11.0",
                 source_col_name="protein1", target_col_name="protein2",
                 edge_attr: Union[str, List[str]] = 'combined_score', directed=False,
                 relabel_nodes=None,
                 index_col='#string_protein_id',
                 keys=None,
                 alias_types={'Ensembl_UniProt', 'Ensembl_UniProt_AC'},
                 blocksize=None, **kwargs):
        """
        Build the file resources for the requested species (unless given) and initialize the database.

        Args:
            path (): Base path or URL the default file resources are resolved against.
            file_resources (): Optional dict of file name to file path. When given, it is restricted to the entries
                whose file name starts with one of the requested species ids.
            species_id (): A species_id string or a list of species_id's
                Provide a species_id string or a list of species_id's to download the species-specific STRING dataset, and
                integrate them. If species_id is None, then download the full-dataset version of STRING, which is very
                time-consuming.
            version (): STRING release used in the default file names.
            source_col_name (): Column holding the source node of each edge.
            target_col_name (): Column holding the target node of each edge.
            edge_attr (): Name of the edge score column. Must be a single string.
            directed (): Whether to build a DiGraph instead of a Graph.
            relabel_nodes (): Optional dict used to rename the nodes of the network.
            index_col (): Column of the protein info table to index by.
            keys (): Optional collection of index values; rows not in it are dropped.
            alias_types (): Set of `source` values of the aliases file to keep as columns.
            blocksize (): If truthy, tables are loaded with Dask instead of Pandas.
            **kwargs (): Forwarded to the parent constructor.
        """
        self.version = version
        self.species_id = copy.copy(species_id)
        # Copy so that the shared default set object is never handed out to instances.
        self.alias_types = copy.copy(alias_types)
        # The Dask code path of load_network() uses `edge_attr` as a single column name.
        assert isinstance(edge_attr, str)

        # Normalize to a list. Iterating a bare string would yield its characters, and then e.g. "9606" would
        # match every file name starting with '9', '6' or '0'.
        if isinstance(species_id, str):
            species_list = [species_id] if species_id else []
        elif isinstance(species_id, Iterable):
            species_list = list(species_id)
        else:
            species_list = []

        if file_resources is None:
            file_resources = {}
            if species_list:
                # (file kind, extension) of each species-specific download.
                # NOTE(review): `load_accessory_data()` selects files containing 'protein.enrichment', which the
                # 'enrichment.terms' key below never matches — confirm the intended file name.
                species_files = (
                    ("protein.info", "txt"),
                    ("protein.links", "txt"),
                    ("protein.links.detailed", "txt"),
                    ("protein.homology", "txt"),
                    ("clusters.proteins", "txt"),
                    ("protein.aliases", "txt"),
                    ("enrichment.terms", "txt"),
                    ("protein.sequences", "fa"),
                )
                for species in species_list:
                    for kind, ext in species_files:
                        file_resources[f"{species}.{kind}.{ext}.gz"] = os.path.join(
                            path, f"{kind}.{version}/{species}.{kind}.{version}.{ext}.gz")
            else:
                file_resources["protein.info.txt.gz"] = os.path.join(path, f"protein.info.{version}.txt.gz")
                file_resources["protein.links.txt.gz"] = os.path.join(path, f"protein.links.{version}.txt.gz")
                file_resources["protein.sequences.fa.gz"] = os.path.join(path, f"protein.sequences.{version}.fa.gz")
        elif species_list:
            # Only keep the user-provided files that belong to the requested species.
            prefixes = tuple(str(species) for species in species_list)
            file_resources = {fn: fp for fn, fp in file_resources.items() if fn.startswith(prefixes)}

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, directed=directed,
                         relabel_nodes=relabel_nodes, blocksize=blocksize, index_col=index_col, keys=keys,
                         col_rename=STRING.COLUMNS_RENAME_DICT, **kwargs)

    def load_dataframe(self, file_resources: Dict[str, str], blocksize: int = None) -> pd.DataFrame:
        """
        Load the `protein.info` table of every species and join the accessory annotations onto it.

        Args:
            file_resources: a dict of file name and file path/object
            blocksize: if truthy, load with Dask; otherwise with Pandas

        Returns:
            The concatenated protein info table (a Dask DataFrame if `blocksize` is truthy).

        Raises:
            Exception: if no protein info file was found in `file_resources`.
        """
        na_values = ['annotation not available']
        # Protein lengths exceed 127, so the former 'int8' could not hold them.
        dtypes = {'protein_size': 'int32'}

        # Load nodes
        dfs = []
        if blocksize:
            for filename in [fn for fn, path in file_resources.items()
                             if 'info.txt' in fn and isinstance(path, str)]:
                compression = 'gzip' if filename.endswith(".gz") else None
                info_df = dd.read_table(file_resources[filename], na_values=na_values,
                                        low_memory=True, compression=compression,
                                        dtype=dtypes,
                                        blocksize=None if isinstance(blocksize, bool) else blocksize)

                if self.keys is not None:
                    info_df = info_df.loc[info_df[self.index_col].isin(self.keys)]

                if self.index_col:
                    info_df = info_df.set_index(self.index_col, sorted=True)

                # Join other attributes to node_info
                species_id = filename.split(".")[0]
                attrs = self.load_accessory_data(file_resources, species_id=species_id,
                                                 alias_types=self.alias_types, blocksize=False)
                if attrs is not None:
                    new_cols = attrs.columns.difference(info_df.columns)
                    info_df = info_df.join(attrs[new_cols], on=self.index_col)

                dfs.append(info_df)
        else:
            for filename in file_resources:
                if not filename.endswith("protein.info.txt"):
                    continue

                info_df = pd.read_table(file_resources[filename], na_values=na_values,
                                        dtype=dtypes,
                                        index_col=self.index_col, low_memory=True)

                # When `index_col` is '#string_protein_id' the ids live in the index, not in a column, so
                # indexing the column unconditionally raised a KeyError.
                if '#string_protein_id' in info_df.columns:
                    string_ids = info_df['#string_protein_id']
                elif info_df.index.name == '#string_protein_id':
                    string_ids = info_df.index.to_series()
                else:
                    raise KeyError('#string_protein_id')

                # Ids are split once at the first "." into `species_id` and `protein_embl_id`.
                index_split = string_ids.str.split(".", expand=True, n=1)
                info_df = info_df.assign(species_id=index_split[0], protein_embl_id=index_split[1])

                # Join other attributes to node_info
                species_id = filename.split(".")[0]
                attrs = self.load_accessory_data(file_resources, species_id=species_id,
                                                 alias_types=self.alias_types,
                                                 blocksize=blocksize)
                if attrs is not None:
                    new_cols = attrs.columns.difference(info_df.columns)
                    info_df = info_df.join(attrs[new_cols], on=self.index_col)
                dfs.append(info_df)

        if not len(dfs):
            raise Exception("Must provide at least one 'protein.info.txt' file.")

        if blocksize:
            protein_info: dd.DataFrame = dd.concat(dfs, axis=0, interleave_partitions=True)
        else:
            protein_info = pd.concat(dfs, axis=0)

        return protein_info

    def load_accessory_data(self, file_resources: Dict[str, str], species_id: str,
                            accessory_files=['protein.aliases', 'protein.homology', 'protein.enrichment',
                                             'clusters.proteins'],
                            alias_types={'Ensembl_UniProt', 'Ensembl_UniProt_AC'}, blocksize=False) \
        -> Optional[Union[pd.DataFrame, dd.DataFrame]]:
        """
        Stack the annotations files for the provided `species_id`, such that rows in the annotations are filtered by
        `keys` (if not null), indexed by "#string_protein_id", and with attributes transformed to a dataframe columns.

        Args:
            file_resources (): a dict of filename and filepath
            species_id (str): the species_id string which is used to select only files that have the same prefix.
            accessory_files (List[str]):
                A list of strings that specify which types of annotation files to integrate, i.e., only select files
                having a substring matching one of these.
                Default ['protein.aliases', 'protein.homology', 'protein.enrichment', 'clusters.proteins'].
            alias_types (): a set of string, default {'Ensembl_UniProt', 'Ensembl_UniProt_AC'}
                A set of `source` values in the `protein.aliases` annotation to aggregate `alias`'s for.
                Must be a subset of {'Ensembl_Source', 'Ensembl_gene', 'Ensembl_transcript', 'Ensembl_UniGene',
                    'Ensembl_RefSeq_short', 'Ensembl_RefSeq', 'Ensembl_OTTG', 'Ensembl_OTTP', 'Ensembl_UCSC',
                    'Ensembl_UniProt', 'Ensembl_UniProt_AC', 'Ensembl_EntrezGene', 'Ensembl_EMBL', 'Ensembl_protein_id'}
            blocksize (bool): Recommended to use Pandas to avoid unnecessary overhead.

        Returns:
            dd.Dataframe or pd.DataFrame with one column group per annotation file, or None when no annotation
            file produced any rows.
        """
        allowed_prefixes = {'protein.aliases', 'protein.homology', 'protein.enrichment', 'clusters.proteins'}
        unsupported = set(accessory_files).difference(allowed_prefixes)
        if unsupported:
            # `Logger.warn` is a deprecated alias that was removed in Python 3.13.
            logger.warning(f'{unsupported} files are not supported')

        select_files = [fn for fn in file_resources
                        if fn.startswith(species_id) and any(ftype in fn for ftype in accessory_files)]

        read_args = dict(
            low_memory=True,
            dtype={'cluster_id': 'category', '#ncbi_taxid': 'category', 'category': 'category',
                   'source': 'category'})

        dfs = []
        for filename in select_files:
            compression = 'gzip' if filename.endswith(".gz") else None
            if blocksize:
                # Dask can only read from a path, not from an already opened file object.
                if not isinstance(file_resources[filename], str):
                    continue
                df = dd.read_table(file_resources[filename], compression=compression, **read_args)
            else:
                df = pd.read_table(file_resources[filename], **read_args)

            # Set index for df: the first id column present in this file
            for col in ['#string_protein_id', 'protein_id', '#string_protein_1']:
                if col in df.columns:
                    df = df.set_index(col, sorted=True) if blocksize else df.set_index(col)
                    break

            # Skip files without any id column; otherwise give the index the same name as the info table's
            if df.index.name is None:
                continue
            elif self.index_col and df.index.name != self.index_col:
                df.index = df.index.rename(self.index_col)
            if blocksize:
                assert df.known_divisions

            # Filter rows
            if self.keys is not None:
                df = df.loc[df.index.isin(self.keys)]

            # Groupby on index and perform appropriate transforms depending on the annotation type
            if 'protein.homology' in filename:
                # Drop self-matches, then collect the homologs of each protein
                df = df.loc[df.index != df['string_protein_id_2']]
                df = df.groupby(self.index_col)['string_protein_id_2'].unique().to_frame()
                # TODO ignored column of size of homologous regions

            elif 'clusters.protein' in filename:
                # `unique()` only exists on a single grouped column, not on a multi-column selection, so each
                # column is aggregated on its own and the results are put side by side.
                grouped = df.groupby(self.index_col)
                uniques = [grouped[col].unique() for col in ('cluster_id', '#ncbi_taxid')]
                df = dd.concat(uniques, axis=1) if blocksize else pd.concat(uniques, axis=1)

            elif 'protein.enrichment' in filename:
                df = df.groupby(self.index_col)['term'].unique().to_frame()

            elif 'protein.aliases' in filename:
                df = df.loc[df['source'].isin(alias_types)]
                # `assign` returns a new frame instead of writing into the filtered slice; sorting the
                # categories makes the order of the resulting columns deterministic.
                df = df.assign(source=df['source'].cat.set_categories(sorted(alias_types)))
                if blocksize:
                    # Set alias values to lists so pivot_table(..., aggfunc='sum') will concatenate them
                    df = df.assign(alias=df['alias'].map(lambda x: [x], meta=pd.Series([[""]])))
                    df = dd.pivot_table(df.reset_index(),
                                        index='#string_protein_id', columns='source', values='alias', aggfunc='sum')
                else:
                    df = df.reset_index().groupby([self.index_col, 'source'])['alias'].unique().unstack(level=1)

            if blocksize and not df.known_divisions:
                df.divisions = df.compute_current_divisions()

            if not len(df.index):
                continue

            dfs.append(df)

        if dfs:
            attrs = dd.concat(dfs, axis=1) if blocksize else pd.concat(dfs, axis=1)
        else:
            attrs = None

        return attrs

    def load_network(self, file_resources, source_col_name='protein1', target_col_name='protein2',
                     edge_attr: Union[str, List[str]] = 'combined_score', directed=False, filters=None, blocksize=None):
        """
        Build the protein-protein network from every file in `file_resources` whose name contains "links".

        Only edges whose both endpoints are in the index of `self.data` are kept. Numeric `edge_attr` columns are
        divided by 1000. The edge table is stored in `self.edges`.

        Args:
            file_resources: a dict of file name and file path/object
            source_col_name (str): column name for the source of each edge
            target_col_name (str): column name for the target of each edge
            edge_attr: column name, or list of column names (Pandas path only), to attach to the edges
            directed (bool): True to return a DiGraph(), else Graph()
            filters: passed to `filter_rows()` to filter the edge table
            blocksize: if truthy, load with Dask; otherwise with Pandas

        Returns:
            network: a NetworkX Graph or DiGraph, or None if no edge file could be loaded
        """
        keys = self.data.index.compute() if isinstance(self.data, dd.DataFrame) else self.data.index
        # NOTE(review): this also matches the "links.detailed" files, so the same edges may be loaded twice —
        # confirm that is intended.
        select_files = [fn for fn in file_resources if "links" in fn]

        # STRING scores range from 0 to 1000, so the former 'uint8' could not hold them.
        score_cols = ('neighborhood', 'fusion', 'cooccurence', 'coexpression', 'experimental', 'database',
                      'textmining', 'combined_score')
        dtypes = {'protein1': 'category', 'protein2': 'category'}
        dtypes.update(dict.fromkeys(score_cols, 'uint16'))
        read_args = dict(sep=" ", low_memory=True, dtype=dtypes)

        # Load edges
        edges_dfs = []
        for filename in select_files:
            if blocksize:
                # Dask can only read from a path, not from an already opened file object.
                if not isinstance(file_resources[filename], str):
                    continue
                compression = 'gzip' if filename.endswith(".gz") else None
                df: dd.DataFrame = dd.read_table(file_resources[filename], compression=compression, **read_args,
                                                 blocksize=None if isinstance(blocksize, bool) else blocksize)

                if compression:
                    logger.info(f"Repartitioning {filename} from {df.npartitions} "
                                f"partitions to {blocksize}-size partitions")
                    df = df.repartition(partition_size=blocksize)

            else:
                df = pd.read_table(file_resources[filename], **read_args)

            df = df.loc[df[source_col_name].isin(keys) & df[target_col_name].isin(keys)]
            edges_dfs.append(df)

        if len(edges_dfs) == 0:
            return

        # Concatenate multiple edgelists into dataframe
        edges_df = dd.concat(edges_dfs, axis=0) if blocksize else pd.concat(edges_dfs, axis=0)
        edges_df = edges_df.rename(columns=self.COLUMNS_RENAME_DICT)
        logger.info(f"{self.name()}-{self.species_id}: {edges_df.columns.tolist()}, {edges_df.shape}")

        # Convert edge_attr (edge weights) from 3 digit integer to float
        assignfunc = {}
        for col in (edge_attr if isinstance(edge_attr, list) else [edge_attr]):
            if col in edges_df.columns and is_numeric_dtype(edges_df[col]):
                assignfunc[col] = edges_df[col].astype('float16') / 1000
        if assignfunc:
            edges_df = edges_df.assign(**assignfunc)

        edges_df = filter_rows(edges_df, filters=filters)

        self.edges = edges_df
        # Set ordering for rows and columns
        node2idx = {node: i for i, node in enumerate(keys)}

        if isinstance(edges_df, dd.DataFrame):
            def edgelist2adj(df: pd.DataFrame) -> ssp.coo_matrix:
                # Skip Dask's placeholder ("meta") partition, recognized by its dummy 'foo' value
                if df.shape[0] == 1 and df.iloc[0, 0] == 'foo':
                    return None

                df = df.assign(row=df[source_col_name].map(node2idx).astype('int'),
                               col=df[target_col_name].map(node2idx).astype('int'))
                df = df.dropna(subset=['row', 'col'])

                if df.shape[0] == 0:
                    return None

                coo_adj = ssp.coo_matrix((df[edge_attr], (df['row'], df['col'])),
                                         shape=(len(keys), len(keys)))
                coo_adj.eliminate_zeros()
                return coo_adj

            # Create a sparse adjacency matrix for each partition, then add them to combine
            adj = edges_df.reduction(chunk=edgelist2adj,
                                     aggregate=lambda x: x.dropna().sum() if not x.isna().all() else None,
                                     meta=pd.Series([ssp.coo_matrix])).compute()
            assert len(adj) == 1, f"len(adj) = {len(adj)}"

            # `from_scipy_sparse_matrix` was removed in networkx 3.0; fall back to its replacement there.
            from_sparse = getattr(nx, "from_scipy_sparse_matrix", None) or nx.from_scipy_sparse_array
            G = from_sparse(adj[0], create_using=nx.DiGraph() if directed else nx.Graph(),
                            edge_attribute='weight')
            idx2node = {i: node for i, node in enumerate(keys)}
            G = nx.relabel_nodes(G, mapping=idx2node, copy=True)
            del adj

        else:
            # Determine which edge attr to add
            if isinstance(edge_attr, (list, tuple)):
                # list(...) so that a tuple of column names can be concatenated too
                cols = edges_df.columns.intersection(list(edge_attr) + [source_col_name, target_col_name])
                edges_df = edges_df[cols]
                use_attrs = True
            elif isinstance(edge_attr, str):
                cols = edges_df.columns.intersection([source_col_name, target_col_name, edge_attr])
                edges_df = edges_df[cols]
                use_attrs = edge_attr
            else:
                use_attrs = False
            G = nx.from_pandas_edgelist(edges_df, source=source_col_name, target=target_col_name,
                                        edge_attr=use_attrs, create_using=nx.DiGraph() if directed else nx.Graph())

        return G

    def get_sequences(self, index="protein_id", omic=None, agg=None):
        """
        Read the protein sequences from every "protein.sequences.fa" file resource.

        Args:
            index (str): "protein_id" to key the sequences by the FASTA record name, or "protein_name" to key them
                by `self.protein_id2name[record name]`.
            omic: unused
            agg: unused

        Returns:
            seq_dict (dict): maps each key to its sequence string. When a key occurs more than once, the last
            sequence wins and the number of such collisions is logged.

        Raises:
            ValueError: if `index` is not one of the two supported values.
            KeyError: if there is no "protein.sequences.fa" file resource.
        """
        # Reuse the cache only if it was built for the same `index`
        if hasattr(self, "seq_dict") and getattr(self, "_seq_dict_index", index) == index:
            return self.seq_dict

        if index not in ("protein_id", "protein_name"):
            raise ValueError(f"index must be 'protein_id' or 'protein_name', got {index!r}")

        # Species-specific resources are named "{species_id}.protein.sequences.fa", so match on the suffix
        fasta_files = [fn for fn in self.file_resources if fn.endswith("protein.sequences.fa")]
        if not fasta_files:
            raise KeyError("protein.sequences.fa")

        seq_dict = {}
        collisions = 0
        for filename in fasta_files:
            for record in SeqIO.parse(self.file_resources[filename], "fasta"):
                gene_id = str(record.name)
                key = self.protein_id2name[gene_id] if index == "protein_name" else gene_id

                if key in seq_dict:
                    collisions += 1

                seq_dict[key] = str(record.seq)

        if collisions:
            logger.warning("Seq {} collisions: {}".format(index, collisions))

        self.seq_dict = seq_dict
        self._seq_dict_index = index
        return self.seq_dict


class GeneMania(Interactions):
    """Loads the GeneMania database from local files.

    Default path: local_directory .
    Default file_resources: {
        "COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt": "COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt",
        "identifier_mappings.txt": "identifier_mappings.txt",
    }
    """

    def __init__(self, path, file_resources=None, source_col_name="Gene_A", target_col_name="Gene_B",
                 edge_attr=None, filters=None, directed=True, relabel_nodes=None, **kwargs):
        """
        Args:
            path (): Folder containing the two GeneMania files.
            file_resources (): Optional dict of file name to file path; defaults to the two files under `path`.
            source_col_name (): Column holding the source node of each edge.
            target_col_name (): Column holding the target node of each edge.
            edge_attr (): Columns to attach to the edges, default ["Weight"].
            filters (): Optional filters applied to the interaction table with `filter_rows()`.
            directed (): Whether to build a DiGraph instead of a Graph.
            relabel_nodes (): Optional dict used to rename the nodes of the network.
            **kwargs (): Forwarded to the parent constructor.
        """
        if edge_attr is None:
            edge_attr = ["Weight"]
        if file_resources is None:
            file_resources = {
                filename: os.path.join(path, filename)
                for filename in ("COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt", "identifier_mappings.txt")
            }

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters, directed=directed,
                         relabel_nodes=relabel_nodes, **kwargs)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """
        Build the gene-gene network from the combined network file, after mapping the identifiers it uses to
        gene names.

        Args:
            file_resources: a dict of file name and file path/object
            source_col_name (str): column name for the source of each edge
            target_col_name (str): column name for the target of each edge
            edge_attr (list): columns to attach to the edges
            directed (bool): True to return a DiGraph(), else Graph()
            filters: optional filters applied to the interaction table
            blocksize: unused

        Returns:
            network: a NetworkX Graph or DiGraph
        """
        interactions = pd.read_table(file_resources["COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt"], low_memory=True)
        identifier = pd.read_table(file_resources["identifier_mappings.txt"])

        # Rename ENSG ID's to gene names: map each "Preferred_Name" to its "Name" among the "Gene Name" rows
        identifier = identifier[identifier["Source"] == "Gene Name"]
        id_mapping = dict(zip(identifier["Preferred_Name"], identifier["Name"]))
        interactions = interactions.replace(id_mapping)

        # `filters` used to be silently ignored here, unlike in the other databases of this module.
        if filters:
            interactions = filter_rows(interactions, filters)

        # Honor `directed`; a DiGraph used to be built unconditionally.
        return nx.from_pandas_edgelist(interactions, source=source_col_name,
                                       target=target_col_name,
                                       edge_attr=edge_attr,
                                       create_using=nx.DiGraph() if directed else nx.Graph())


class IntAct(Interactions):
    """Interaction database that forwards its arguments to `Interactions` without adding any loading logic."""

    def __init__(self, path, file_resources: Dict, source_col_name: str = None, target_col_name: str = None,
                 source_index: str = None, target_index: str = None, edge_attr: List[str] = None, filters: dict = None,
                 directed: bool = True, relabel_nodes: dict = None, blocksize=None, **kwargs):
        """
        Args:
            path (): The folder path containing the data files.
            file_resources (): A dict of file name to file path.
            source_col_name (): Column holding the source node of each edge.
            target_col_name (): Column holding the target node of each edge.
            source_index (): Accepted but not forwarded to the parent constructor.
            target_index (): Accepted but not forwarded to the parent constructor.
            edge_attr (): Columns to attach to the edges.
            filters (): Optional filters for the interaction table.
            directed (): Whether to build a directed network.
            relabel_nodes (): Optional dict used to rename the nodes of the network.
            blocksize (): Forwarded to the parent constructor.
            **kwargs (): Forwarded to the parent constructor.
        """
        super().__init__(path=path,
                         file_resources=file_resources,
                         source_col_name=source_col_name,
                         target_col_name=target_col_name,
                         edge_attr=edge_attr,
                         filters=filters,
                         directed=directed,
                         relabel_nodes=relabel_nodes,
                         blocksize=blocksize,
                         **kwargs)


class BioGRID(Interactions):
    """Loads the BioGRID database from https://thebiogrid.org .

    Default path: "https://downloads.thebiogrid.org/Download/BioGRID/Latest-Release/" .
    Default file_resources: {
        "BIOGRID-ALL-LATEST.tab2.zip": "BIOGRID-ALL-LATEST.tab2.zip",
    }
    """

    def __init__(self, path="https://downloads.thebiogrid.org/Download/BioGRID/Latest-Release/",
                 file_resources=None, source_col_name="Official Symbol Interactor A",
                 target_col_name="Official Symbol Interactor B",
                 edge_attr=['Score', 'Throughput', 'Experimental System', 'Experimental System Type'],
                 filters=None, directed=False, relabel_nodes=None, **kwargs):
        """
        Args:
            path (): Base path or URL the default file resource is resolved against.
            file_resources (): Optional dict of file name to file path.
            source_col_name (): Column holding the source node of each edge.
            target_col_name (): Column holding the target node of each edge.
            edge_attr (): Columns to attach to the edges.
            filters (): Default None, example {"Organism Interactor A": 9606}.
            directed (): Whether to build a DiGraph instead of a Graph.
            relabel_nodes (): Optional dict used to rename the nodes of the network.
            **kwargs (): Forwarded to the parent constructor.
        """
        # Copy the list so that the default object shared by every call is never stored on an instance.
        if edge_attr is not None and not isinstance(edge_attr, str):
            edge_attr = list(edge_attr)

        if file_resources is None:
            file_resources = {"BIOGRID-ALL-LATEST.tab2.zip": os.path.join(path, "BIOGRID-ALL-LATEST.tab2.zip")}

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters, directed=directed,
                         relabel_nodes=relabel_nodes, **kwargs)

    def load_dataframe(self, file_resources: Dict[str, str], blocksize: int = None) -> pd.DataFrame:
        """
        Read the "BIOGRID-ALL-LATEST.tab2" table, with "-" parsed as a missing value, and store it in `self.edges`.

        Args:
            file_resources: a dict of file name and file path/object
            blocksize: if truthy, read with Dask using this block size; otherwise read with Pandas

        Returns:
            The interaction table (a Dask DataFrame if `blocksize` is truthy).
        """
        args = dict(na_values=["-"], header=0, low_memory=True,
                    dtype={'Score': 'float', 'Entrez Gene Interactor A': 'category',
                           'Entrez Gene Interactor B': 'category',
                           'BioGRID ID Interactor A': 'category', 'BioGRID ID Interactor B': 'category',
                           'Systematic Name Interactor A': 'category', 'Systematic Name Interactor B': 'category',
                           'Official Symbol Interactor A': 'category', 'Official Symbol Interactor B': 'category',
                           'Pubmed ID': 'str', 'Throughput': 'category', 'Experimental System Type': 'category',
                           'Experimental System': 'category', 'Modification': 'category', 'Source Database': 'category',
                           'Organism Interactor A': 'category', 'Organism Interactor B': 'category'})

        if blocksize:
            edges = dd.read_table(file_resources["BIOGRID-ALL-LATEST.tab2"], blocksize=blocksize, **args)
        else:
            edges = pd.read_table(file_resources["BIOGRID-ALL-LATEST.tab2"], **args)

        self.edges = edges

        return edges

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """
        Build the network from the table stored in `self.edges` by `load_dataframe()`.

        Args:
            file_resources: unused; the edges come from `self.edges`
            source_col_name (str): column name for the source of each edge
            target_col_name (str): column name for the target of each edge
            edge_attr (list): columns to attach to the edges
            directed (bool): True to return a DiGraph(), else Graph()
            filters: passed to `filter_rows()` to filter the edge table
            blocksize: unused

        Returns:
            network: a NetworkX Graph or DiGraph
        """
        df = filter_rows(self.edges, filters)

        # `from_pandas_edgelist` needs an in-memory Pandas table, so materialize a Dask table first.
        if isinstance(df, dd.DataFrame):
            df = df.compute()

        return nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                       edge_attr=edge_attr,
                                       create_using=nx.DiGraph() if directed else nx.Graph())


[docs]class MiRTarBase(Interactions):
    """Loads the miRTarBase database.

        Default path: "http://mirtarbase.mbc.nctu.edu.tw/cache/download/7.0/" .
        Default file_resources: {
            "miRTarBase_MTI.xlsx": "miRTarBase_MTI.xlsx",
        }
        """

    def __init__(self, path="http://mirtarbase.mbc.nctu.edu.tw/cache/download/7.0/", file_resources=None,
                 source_col_name="miRNA", target_col_name="Target Gene",
                 edge_attr=None,
                 filters=None,
                 directed=True,
                 relabel_nodes=None,
                 strip_mirna_name=False, **kwargs):
        """

        Args:
            path ():
            file_resources ():
            source_col_name ():
            target_col_name ():
            source_index ():
            target_index ():
            edge_attr ():
            filters (): default None, example {"Species (Target Gene)": "Homo sapiens"}
            directed ():
            relabel_nodes ():
            strip_mirna_name ():
            **kwargs ():
        """
        if edge_attr is None:
            edge_attr = ["Support Type"]
        self.strip_mirna_name = strip_mirna_name

        if file_resources is None:
            file_resources = {}
            file_resources["miRTarBase_MTI.xlsx"] = os.path.join(path, "miRTarBase_MTI.xlsx")

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters, directed=directed,
                         relabel_nodes=relabel_nodes, **kwargs)

    def load_dataframe(self, file_resources: Dict[str, str], blocksize: int = None) -> pd.DataFrame:
        df = pd.read_excel(self.file_resources["miRTarBase_MTI.xlsx"])
        self.edges = df
        return df

[docs]    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        df = self.data
        df = filter_rows(df, filters)

        df['miRNA'] = df['miRNA'].str.rstrip('*')

        if self.strip_mirna_name:
            df['miRNA'] = df['miRNA'].str.lower().str.replace("-3p.*|-5p.*", "", regex=True)

        mir_target_network = nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                                     edge_attr=edge_attr,
                                                     create_using=nx.DiGraph() if directed else nx.Graph())
        return mir_target_network


class LncBase(Interactions, Database):
    """Loads the LncBase database from http://carolina.imis.athena-innovation.gr/diana_tools/web/index.php?r=lncbasev2%2Findex .

    Default path: "https://dianalab.e-ce.uth.gr/downloads/" .
    Default file_resources: {
        "LncBasev2_download.csv": "lncbase_v2_exp_data.tar.gz",
    }
    """

    def __init__(self, path='https://dianalab.e-ce.uth.gr/downloads/', file_resources=None, strip_mirna_name=False,
                 source_col_name="mirna", target_col_name="geneId",
                 edge_attr=None,
                 filters=None,
                 directed=True,
                 relabel_nodes=None, ):
        """
        Args:
            path (str): folder or base URL; joined with "lncbase_v2_exp_data.tar.gz" when
                `file_resources` is None.
            file_resources (dict): file name -> file path. Built from `path` when None.
            strip_mirna_name (bool): if True, `load_network` lower-cases the 'mirna' column and
                removes everything from "-3p" / "-5p" onward.
            source_col_name (str): column used as the edge source.
            target_col_name (str): column used as the edge target.
            edge_attr (list): columns attached to each edge.
                Defaults to ["tissue", "positive_negative"].
            filters (): default None. Example: {"species": "Homo sapiens"}
            directed (bool): build a `nx.DiGraph` if True, otherwise a `nx.Graph`.
            relabel_nodes (dict): forwarded unchanged to the parent constructor.
        """
        self.strip_mirna_name = strip_mirna_name

        if edge_attr is None:
            edge_attr = ["tissue", "positive_negative"]
        if file_resources is None:
            file_resources = {}
            file_resources["LncBasev2_download.csv"] = os.path.join(path, "lncbase_v2_exp_data.tar.gz")

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters, directed=directed,
                         relabel_nodes=relabel_nodes)

    def get_rename_dict(self, from_index="geneId", to_index="geneName"):
        """Return a dict mapping values of column `from_index` to values of column `to_index`.

        The table is re-read from ``self.file_resources["LncBasev2_download.csv"]``.

        Args:
            from_index (str): column whose values become the dict keys.
            to_index (str): column whose values become the dict values.
        """
        lncbase_df = pd.read_table(self.file_resources["LncBasev2_download.csv"], low_memory=True)
        # Honor the requested columns; these used to be hard-coded to geneId -> geneName,
        # which silently ignored any other argument values.
        return pd.Series(lncbase_df[to_index].values, index=lncbase_df[from_index]).to_dict()

    def load_dataframe(self, file_resources: Dict[str, str], blocksize: int = None) -> pd.DataFrame:
        """Read the LncBase table and normalize the capitalization of two species names.

        `blocksize` is not used by this method.
        """
        df = pd.read_table(file_resources["LncBasev2_download.csv"], low_memory=True)
        df.replace({"species": {"Homo Sapiens": "Homo sapiens", "Mus Musculus": "Mus musculus"}}, inplace=True)
        return df

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Build the interaction graph from ``self.data`` after applying `filters`.

        When ``self.strip_mirna_name`` is set, the 'mirna' column is lower-cased and truncated
        at "-3p" / "-5p" before the graph is built.
        """
        df = self.data
        df = filter_rows(df, filters)

        if self.strip_mirna_name:
            df['mirna'] = df['mirna'].str.lower()
            df['mirna'] = df['mirna'].str.replace("-3p.*|-5p.*", "", regex=True)

        if edge_attr is None:
            edge_attr = ["tissue", "positive_negative"]
        lncBase_lncRNA_miRNA_network = nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                                               edge_attr=edge_attr,
                                                               create_using=nx.DiGraph() if directed else nx.Graph())
        return lncBase_lncRNA_miRNA_network


class TarBase(Interactions):
    """Interactions read from the ``tarbase_v8_data.tar.gz`` archive.

    Default path: "https://dianalab.e-ce.uth.gr/downloads" .
    Default file_resources: {
        "tarbase_v8_data.tar.gz": "https://dianalab.e-ce.uth.gr/downloads/tarbase_v8_data.tar.gz",
        "speclist": "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/speclist",
    }
    """

    def __init__(self, path='https://dianalab.e-ce.uth.gr/downloads', file_resources: Dict = None,
                 source_col_name: str = 'mirna', target_col_name: str = 'geneName',
                 edge_attr: List[str] = None, filters: Union[str, Dict[str, Union[str, List[str]]]] = None,
                 directed: bool = True, relabel_nodes: dict = None, blocksize=None, **kwargs):
        """
        Args:
            path (): forwarded to the parent constructor; the default file URLs below are
                absolute and do not depend on it.
            file_resources (): file name -> file path. When None, the TarBase archive and the
                UniProt 'speclist' file are used.
            source_col_name (): column used as the edge source.
            target_col_name (): column used as the edge target.
            edge_attr (): columns attached to each edge.
            filters (): row filters, applied with `filter_rows` in `load_network`.
            directed (): build a `nx.DiGraph` if True, otherwise a `nx.Graph`.
            relabel_nodes (): forwarded unchanged to the parent constructor.
            blocksize (): forwarded unchanged to the parent constructor.
            **kwargs (): forwarded unchanged to the parent constructor.
        """
        if file_resources is None:
            file_resources = {
                'tarbase_v8_data.tar.gz': 'https://dianalab.e-ce.uth.gr/downloads/tarbase_v8_data.tar.gz',
                'speclist': 'https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/speclist',
            }

        super().__init__(path, file_resources, source_col_name, target_col_name, edge_attr, filters, directed,
                         relabel_nodes, blocksize, **kwargs)

    def load_dataframe(self, file_resources: Dict[str, str], blocksize: int = None) -> pd.DataFrame:
        """Read the TarBase table and, if a 'speclist' resource is present, add a `species_id` column.

        The species name -> 'NCBI-taxon' lookup is built from the official, common and synonym
        name columns returned by `UniProt.get_species_list`. `blocksize` is not used here.
        """
        edges = pd.read_table(file_resources['tarbase_v8_data.tar.gz'], compression='tar',
                              dtype={'tissue': 'category', 'method': 'category', 'positive_negative': 'category',
                                     'species': 'category',
                                     'direct_indirect': 'category', 'up_down': 'category', 'cell_line': 'category',
                                     })

        if 'speclist' in file_resources:
            species_df = UniProt.get_species_list(file_resources['speclist'])
            # One row per (taxon, name) pair so every known name variant maps to the taxon id.
            species_df = species_df[['Official (scientific) name', 'Common name', 'Synonym']].melt(ignore_index=False)
            species_df = species_df.dropna().reset_index()
            species_name2id = species_df.set_index('value')['NCBI-taxon'].to_dict()
            edges['species_id'] = edges['species'].map(species_name2id)

        self.edges = edges
        return edges

    def load_network(self, file_resources: Dict, source_col_name: str, target_col_name: str, edge_attr: List[str],
                     directed: bool, filters: Dict[str, Any], blocksize=None):
        """Build the interaction graph from ``self.data`` after cleaning the 'geneName' column.

        The cleaning steps strip parenthesized annotations from gene names; rows whose
        'geneName' is missing are left untouched.
        """
        df = self.data
        df = filter_rows(df, filters)

        # Remove parenthesis containing 3 letter species name
        df['geneName'] = df['geneName'].str.replace(r'(\(\w{3}\)){1}$', '', regex=True)

        # `na=False` keeps the masks strictly boolean so `.loc` does not fail on missing names.
        idx = df['geneName'].str.contains(r'\(', na=False)
        # Drop "(N of M)" style counters.
        df.loc[idx, 'geneName'] = df.loc[idx, 'geneName'].str.replace(r'(\(\d of \d\))', '', regex=True).str.strip()

        # Where a parenthesized word remains, keep the word inside the parentheses.
        idx = df['geneName'].str.contains(r"\(\w*\)", regex=True, na=False)
        df.loc[idx, 'geneName'] = df.loc[idx, 'geneName'].str.extract(r'\((\w*)\)(\w*)')[0]

        # Anything still containing "(" is cut at the first "(". `.str[0]` avoids relying on an
        # expanded column 0, which may not exist when no row is selected.
        idx = df['geneName'].str.contains(r'\(', na=False)
        df.loc[idx, 'geneName'] = df.loc[idx, 'geneName'].str.split('(').str[0]

        g = nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                    edge_attr=edge_attr,
                                    create_using=nx.DiGraph() if directed else nx.Graph())
        return g


class RNAInter(Interactions):
    """Interactions read from the RNAInter ``Download_data_*`` archives.

    Default path: "http://www.rnainter.org/raidMedia/download/" .
    Default file_resources: {
        "Download_data_RR.tar.gz": "Download_data_RR.tar.gz",
        "Download_data_RP.tar.gz": "Download_data_RP.tar.gz",
    }
    """

    def __init__(self, path='http://www.rnainter.org/raidMedia/download/', file_resources: Dict = None,
                 source_col_name: str = 'Interactor1.Symbol', target_col_name: str = 'Interactor2.Symbol',
                 edge_attr: Union[str, List[str]] = 'score',
                 filters: Union[str, Dict[str, Union[str, List[str]]]] = None,
                 directed: bool = True, relabel_nodes: dict = None, blocksize=None, **kwargs):
        """
        Args:
            path (): folder or base URL, forwarded to the parent constructor.
            file_resources (): file name -> file path. When None, the RR and RP archives are used.
            source_col_name (): column used as the edge source.
            target_col_name (): column used as the edge target.
            edge_attr (): column(s) attached to each edge. Defaults to 'score'.
            filters (): row filters, applied with `filter_rows`.
            directed (): build a `nx.DiGraph` if True, otherwise a `nx.Graph`.
            relabel_nodes (): forwarded unchanged to the parent constructor.
            blocksize (): when truthy, tables are read with dask instead of pandas.
            **kwargs (): forwarded unchanged to the parent constructor.
        """
        if file_resources is None:
            file_resources = {
                'Download_data_RR.tar.gz': 'Download_data_RR.tar.gz',
                'Download_data_RP.tar.gz': 'Download_data_RP.tar.gz',
            }

        super().__init__(path, file_resources, source_col_name, target_col_name, edge_attr, filters, directed,
                         relabel_nodes, blocksize, **kwargs)

    def load_dataframe(self, file_resources: Dict, blocksize: int = None) -> pd.DataFrame:
        """Read every ``Download_data*`` resource and return their rows as a single edge table.

        Each file is read with pandas, or with dask when `blocksize` is truthy (in which case
        resources that are not path strings are skipped). The tables are concatenated, filtered
        with ``self.filters`` and stored in ``self.edges``.

        Raises:
            FileNotFoundError: if no ``Download_data*`` resource could be read.
        """
        args = dict(dtype={'Category1': 'category', 'Category2': 'category',
                           'Species1': 'category', 'Species2': 'category', 'score': 'float',
                           'predict': 'category', 'weak': 'category', 'strong': 'category'})

        # Collect one table per file; previously each file overwrote the one before it, so
        # only the last listed file ended up in the result.
        frames = []
        for fn in file_resources:
            if not fn.startswith('Download_data'):
                continue
            compression = 'tar' if fn.endswith('.tar.gz') else None
            if blocksize:
                if not isinstance(file_resources[fn], str):
                    continue
                frames.append(dd.read_table(file_resources[fn], compression=compression, **args))
            else:
                frames.append(pd.read_table(file_resources[fn], compression=compression, **args))

        if not frames:
            raise FileNotFoundError(
                "No readable 'Download_data*' file found in file_resources: {}".format(list(file_resources)))

        if len(frames) == 1:
            edges = frames[0]
        elif blocksize:
            edges = dd.concat(frames, axis=0)
        else:
            # Fresh index so row labels stay unique across the source files.
            edges = pd.concat(frames, axis=0, ignore_index=True)

        edges = filter_rows(edges, self.filters)

        self.edges = edges
        return edges

    def load_network(self, file_resources, source_col_name='Interactor1.Symbol', target_col_name='Interactor2.Symbol',
                     edge_attr='score', directed=True, filters=None, blocksize=None):
        """Build the interaction graph from ``self.data``.

        `filters` is only applied when it differs from ``self.filters``, which was already
        applied in `load_dataframe`.
        """
        edges = self.data
        if filters != self.filters:
            edges = filter_rows(edges, filters)

        g = nx.from_pandas_edgelist(edges, source=source_col_name, target=target_col_name,
                                    edge_attr=edge_attr,
                                    create_using=nx.DiGraph() if directed else nx.Graph())
        return g


class TargetScan(Interactions, Database):
    """Loads the TargetScan database from "http://www.targetscan.org/" .

    Default path: "http://www.targetscan.org/vert_72/vert_72_data_download/" .
    Default file_resources: {
        "miR_Family_Info.txt.zip": "miR_Family_Info.txt.zip",
        "Predicted_Targets_Info.default_predictions.txt": "Predicted_Targets_Info.default_predictions.txt",
    }

    NOTE(review): the processing methods read the family table under the key
    "miR_Family_Info.txt"; presumably the parent class registers the decompressed
    file under that name - verify against `Database`.
    """

    def __init__(self, path="http://www.targetscan.org/vert_72/vert_72_data_download/", file_resources=None,
                 source_col_name="MiRBase ID", target_col_name="Gene Symbol",
                 edge_attr=["tissue", "positive_negative"], directed=True, relabel_nodes=None, species_id=None,
                 strip_mirna_name=False, **kwargs):
        """
        Args:
            path (str): folder or base URL joined with the default file names when
                `file_resources` is None.
            file_resources (dict): file name -> file path. Built from `path` when None.
            source_col_name (str): column used as the edge source.
            target_col_name (str): column used as the edge target.
            edge_attr (list): columns attached to each edge; forwarded to the parent constructor.
            directed (bool): build a `nx.DiGraph` if True, otherwise a `nx.Graph`.
            relabel_nodes (dict): forwarded unchanged to the parent constructor.
            species_id: if given, only rows with this 'Species ID' are kept.
            strip_mirna_name (bool): if True, 'MiRBase ID' values are lower-cased and truncated
                at "-3p" / "-5p".
            **kwargs: forwarded unchanged to the parent constructor.
        """
        self.strip_mirna_name = strip_mirna_name
        self.species_id = species_id
        if file_resources is None:
            file_resources = {}
            file_resources["miR_Family_Info.txt.zip"] = os.path.join(path, "miR_Family_Info.txt.zip")
            file_resources["Predicted_Targets_Info.default_predictions.txt"] = os.path.join(path,
                                                                                            "Predicted_Targets_Info.default_predictions.txt")

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name,
                         directed=directed, relabel_nodes=relabel_nodes, edge_attr=edge_attr, **kwargs)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Build the miRNA -> target gene graph.

        The miR family table is stored in ``self.df`` and joined with the predicted-targets
        table. `filters` and `blocksize` are not used by this method.
        """
        self.df = self.process_miR_family_info_table(file_resources, self.species_id)
        interactions_df = self.process_interactions_table(file_resources, self.df, self.species_id)
        print(self.name(), interactions_df.columns.tolist())

        mir_target_network = nx.from_pandas_edgelist(interactions_df,
                                                     source=source_col_name, target=target_col_name,
                                                     edge_attr=edge_attr,
                                                     create_using=nx.DiGraph() if directed else nx.Graph())
        return mir_target_network

    def process_miR_family_info_table(self, file_resources, species=None):
        """Read the miR family table and return its de-duplicated annotation columns.

        Args:
            file_resources: dict providing the "miR_Family_Info.txt" table.
            species: if truthy, only rows whose 'Species ID' equals this value are kept.

        Returns:
            DataFrame restricted to the family, ID, seed, sequence, conservation and accession
            columns, with 'MiRBase ID' cast to str.
        """
        miR_Family_Info_df = pd.read_table(file_resources["miR_Family_Info.txt"], delimiter='\t')

        if species:
            # Copy so the column assignments below never write into a view of the full table.
            miR_Family_Info_df = miR_Family_Info_df[miR_Family_Info_df['Species ID'] == species].copy()

        # Standardize MiRBase ID to miRNA names obtained from RNA-seq hg19
        if self.strip_mirna_name:
            miR_Family_Info_df['MiRBase ID'] = miR_Family_Info_df['MiRBase ID'].str.lower()
            # `regex=True` is required: without it recent pandas treats the pattern as a literal.
            miR_Family_Info_df['MiRBase ID'] = miR_Family_Info_df['MiRBase ID'].str.replace(
                "-3p.*|-5p.*", "", regex=True)

        miR_Family_Info_df = miR_Family_Info_df.drop_duplicates()
        miR_Family_Info_df = miR_Family_Info_df.filter(items=['miR family', 'MiRBase ID', 'Seed+m8', 'Mature sequence',
                                                              'Family Conservation?', 'MiRBase Accession'],
                                                       axis="columns")
        miR_Family_Info_df['MiRBase ID'] = miR_Family_Info_df['MiRBase ID'].astype(str)
        return miR_Family_Info_df

    def process_interactions_table(self, file_resources, family_to_miR_df, species_id):
        """
        Joins the predicted miR Family -> target gene table with the miR Family -> MiRBase ID
        mapping, so every row carries a 'MiRBase ID' and a 'Gene Symbol'.

        Args:
            file_resources: dict providing "Predicted_Targets_Info.default_predictions.txt".
            family_to_miR_df: table with 'miR family' and 'MiRBase ID' columns, as returned by
                `process_miR_family_info_table`.
            species_id: if truthy, only interactions with this 'Species ID' are kept.

        Returns:
            DataFrame with columns 'miR Family', 'Gene Symbol' and 'MiRBase ID'. It is an outer
            join, so families present in only one table are kept with missing values.
        """
        # Load data frame from file
        family_interactions_df = pd.read_table(file_resources["Predicted_Targets_Info.default_predictions.txt"],
                                               dtype={'Species ID': 'category'},
                                               delimiter='\t', low_memory=True)

        # Select only miRNA-target pairs of certain species_id
        if species_id:
            # The column is read as a categorical of strings, so compare string forms; this way
            # both 9606 and "9606" select the same rows.
            species_mask = family_interactions_df["Species ID"].astype(str) == str(species_id)
            family_interactions_df = family_interactions_df[species_mask]

        family_interactions_df = family_interactions_df.filter(items=["miR Family", "Gene Symbol"], axis="columns")
        family_to_miR_df = family_to_miR_df.filter(items=['miR family', 'MiRBase ID'], axis="columns")
        family_to_miR_df = family_to_miR_df.rename(columns={'miR family': 'miR Family'})

        # map miRBase ID names to miR Family
        # (DataFrame has no `set_genes_index` method; a plain column merge does the join.)
        mir_interactions_df = family_interactions_df.merge(family_to_miR_df, how='outer', on="miR Family")

        # Standardize MiRBase ID to miRNA names obtained from RNA-seq hg19
        if self.strip_mirna_name:
            mir_interactions_df['MiRBase ID'] = mir_interactions_df['MiRBase ID'].str.lower()
            mir_interactions_df['MiRBase ID'] = mir_interactions_df['MiRBase ID'].str.replace(
                "-3p.*|-5p.*", "", regex=True)

        return mir_interactions_df


class LncReg(Interactions):
    """Interactions read from the LncReg ``data.xlsx`` spreadsheet.

    Default file_resources: {
        "data.xlsx": "data.xlsx",
    }
    """
    def __init__(self, path, file_resources,
                 source_col_name='A_name_in_paper', target_col_name='B_name_in_paper',
                 source_index="transcript_name", target_index="gene_name",
                 edge_attr=["relationship", "mechanism", "pmid"], filters=None, directed=True, relabel_nodes=None,
                 verbose=False):
        """
        Args:
            path (str): folder joined with "data.xlsx" when `file_resources` is None.
            file_resources (dict): file name -> file path. Built from `path` when None.
            source_col_name (str): column used as the edge source.
            target_col_name (str): column used as the edge target.
            source_index (str): accepted but not used by this class.
            target_index (str): accepted but not used by this class.
            edge_attr (list): columns attached to each edge.
            filters: forwarded unchanged to the parent constructor.
            directed (bool): forwarded to the parent constructor; `load_network` always builds
                a `nx.DiGraph`.
            relabel_nodes (dict): forwarded unchanged to the parent constructor.
            verbose (bool): forwarded unchanged to the parent constructor.
        """
        if file_resources is None:
            file_resources = {}
            file_resources["data.xlsx"] = os.path.join(path, "data.xlsx")

        super().__init__(path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters, directed=directed,
                         relabel_nodes=relabel_nodes, verbose=verbose)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Build a directed graph from the "Homo sapiens" rows of the spreadsheet.

        For rows whose 'B_category' is "miRNA", 'B_name_in_paper' is truncated at "-3p" / "-5p",
        "MIR" is replaced with "hsa-mir-" and "let-" with "hsa-let-". The `filters`, `directed`
        and `blocksize` arguments are not used by this method.
        """
        df = pd.read_excel(self.file_resources["data.xlsx"])
        print(self.name(), df.columns.tolist())

        # Copy the selection so the assignment below writes into an independent frame.
        df = df[df["species"] == "Homo sapiens"].copy()

        is_mirna = df["B_category"] == "miRNA"
        mirna_names = df.loc[is_mirna, "B_name_in_paper"]
        # `regex=True` is required: without it recent pandas treats the pattern as a literal.
        mirna_names = mirna_names.str.replace("-3p.*|-5p.*", "", regex=True)
        mirna_names = mirna_names.str.replace("MIR", "hsa-mir-", regex=False)
        mirna_names = mirna_names.str.replace("let-", "hsa-let-", regex=False)
        df.loc[is_mirna, "B_name_in_paper"] = mirna_names

        LncReg_lncRNA_RNA_network = nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                                            edge_attr=edge_attr,
                                                            create_using=nx.DiGraph())
        return LncReg_lncRNA_RNA_network


class lncRInter(Interactions):
    """Interactions read from the lncRInter ``human_interactions.txt`` table.

    Default file_resources: {
        "human_interactions.txt": "human_interactions.txt",
    }
    """

    def __init__(self, path, file_resources=None, source_col_name="lncrna",
                 target_col_name='Interacting partner',
                 edge_attr=None, filters=None,
                 directed=True, relabel_nodes=None, **kwargs):
        """
        Args:
            path (str): folder joined with "human_interactions.txt" when `file_resources` is None.
            file_resources (dict): file name -> file path. Built from `path` when None.
            source_col_name (str): column used as the edge source.
            target_col_name (str): column used as the edge target.
            edge_attr (list): columns attached to each edge. Defaults to
                ["Interaction Class", "Interaction Mode", "Tissue", "Phenotype"].
            filters: row filters, applied with `filter_rows` in `load_network`.
            directed (bool): build a `nx.DiGraph` if True, otherwise a `nx.Graph`.
            relabel_nodes (dict): forwarded unchanged to the parent constructor.
            **kwargs: forwarded unchanged to the parent constructor.
        """
        if edge_attr is None:
            edge_attr = ["Interaction Class", "Interaction Mode", "Tissue", "Phenotype"]
        if file_resources is None:
            file_resources = {}
            file_resources["human_interactions.txt"] = os.path.join(path, "human_interactions.txt")

        super().__init__(path, file_resources, source_col_name=source_col_name, target_col_name=target_col_name,
                         edge_attr=edge_attr, filters=filters, directed=directed, relabel_nodes=relabel_nodes, **kwargs)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Build the interaction graph after normalizing miRNA names in 'Interacting partner'.

        Partners containing "MIR" are lower-cased, "mirlet" / "mir" are rewritten to
        "hsa-let-" / "hsa-mir-", and names matching the numbered-suffix pattern get a "-"
        inserted before their last character. `blocksize` is not used by this method.
        """
        lncRInter_df = pd.read_table(file_resources["human_interactions.txt"])
        print(self.name(), lncRInter_df.columns.tolist())

        lncRInter_df = filter_rows(lncRInter_df, filters)

        # Data cleaning
        is_mir = lncRInter_df["Interacting partner"].str.contains("MIR")
        lncRInter_df.loc[is_mir, "Interacting partner"] = \
            lncRInter_df.loc[is_mir, "Interacting partner"].str.lower()
        lncRInter_df["Interacting partner"] = lncRInter_df["Interacting partner"].str.replace(
            "mirlet", "hsa-let-", regex=False)
        lncRInter_df["Interacting partner"] = lncRInter_df["Interacting partner"].str.replace(
            "mir", "hsa-mir-", regex=False)

        # Write through `.loc` on the frame itself: chained indexing (`df[col][mask] = ...`)
        # may assign into a temporary and leave the frame unchanged.
        needs_dash = lncRInter_df["Interacting partner"].str.contains(r"[mir|let]\-[\d]+[a-z]+[\d]+")
        lncRInter_df.loc[needs_dash, "Interacting partner"] = \
            lncRInter_df.loc[needs_dash, "Interacting partner"].apply(lambda x: x[:-1] + "-" + x[-1])

        lncRInter_network = nx.from_pandas_edgelist(lncRInter_df, source=source_col_name,
                                                    target=target_col_name,
                                                    edge_attr=edge_attr,
                                                    create_using=nx.DiGraph() if directed else nx.Graph())
        return lncRInter_network


class LncRNA2Target(Interactions):
    """Interactions read from the lncRNA2Target download files.

    Default path: "http://123.59.132.21/lncrna2target/data/" .
    Default file_resources: {
        "lncRNA_target_from_high_throughput_experiments.txt.rar": "lncrna_target.rar",
        "lncRNA_target_from_low_throughput_experiments.xlsx": "lncRNA_target_from_low_throughput_experiments.xlsx",
    }

    NOTE(review): the high-throughput loader reads the key
    "lncRNA_target_from_high_throughput_experiments.txt"; presumably the parent class
    registers the extracted file under that name - verify against `Database`.
    """

    def __init__(self, path="http://123.59.132.21/lncrna2target/data/", file_resources=None, edge_attr=None,
                 filters=None,
                 directed=True, relabel_nodes=None, version="high_throughput", **kwargs):
        """

        Args:
            filters (): default None, example {"species_id": 9606, "Species": "Homo sapiens"}.
            version (str): one of ["high_throughput", "low_throughput"].
                The high_throughput version of lncRNA2Target database is v2.0 and low_throughput is v1.0, according to the database's website.
            species_id (str, int): one of [9606, "Homo sapiens"].
                The species column in high_throughput is formatted in int (e.g. 9606) and in low_throughput is in str (e.g. "Homo sapiens")

        Raises:
            ValueError: if `version` is not one of the two supported values.
        """
        self.version = version
        if file_resources is None:
            file_resources = {}
            file_resources["lncRNA_target_from_high_throughput_experiments.txt.rar"] = \
                os.path.join(path, "lncrna_target.rar")
            file_resources["lncRNA_target_from_low_throughput_experiments.xlsx"] = \
                os.path.join(path, "lncRNA_target_from_low_throughput_experiments.xlsx")

        # The source/target columns depend on which release is loaded.
        if self.version == "high_throughput":
            source_col_name, target_col_name = "lncrna_symbol", "gene_symbol"
        elif self.version == "low_throughput":
            source_col_name, target_col_name = "GENCODE_gene_name", "Target_official_symbol"
        else:
            # Previously an unknown version silently skipped the parent constructor and left
            # the instance uninitialized.
            raise ValueError(
                "LncRNA2Target version argument must be one of 'high_throughput' or 'low_throughput', "
                "got {!r}".format(version))

        super().__init__(path, file_resources, source_col_name=source_col_name, target_col_name=target_col_name,
                         edge_attr=edge_attr, filters=filters, directed=directed, relabel_nodes=relabel_nodes,
                         **kwargs)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Dispatch to the loader matching ``self.version``.

        Returns:
            The graph built by the selected loader, or None (after logging a warning) when
            ``self.version`` is not a supported value. `blocksize` is not used.
        """
        network = None
        if self.version == "high_throughput":
            network = self.load_network_high_throughput(file_resources, source_col_name, target_col_name, edge_attr,
                                                        directed, filters=filters)
        elif self.version == "low_throughput":
            network = self.load_network_low_throughput(file_resources, source_col_name, target_col_name, edge_attr,
                                                       directed, filters=filters)
        else:
            logger.warning("LncRNA2Target version argument must be one of 'high_throughput' or 'low_throughput'")

        return network

    def load_network_high_throughput(self, file_resources, source_col_name="lncrna_symbol",
                                     target_col_name="gene_symbol",
                                     edge_attr=None, directed=True, filters=None):
        """Build the graph from the high-throughput table.

        Rows are selected with `filter_rows`; 'lncrna_symbol' is upper-cased with "LINC"
        removed and 'gene_symbol' is upper-cased. The cleaned table is stored in ``self.data``
        and ``self.edges``.
        """
        edges = pd.read_table(file_resources["lncRNA_target_from_high_throughput_experiments.txt"], sep="\t")
        edges = filter_rows(edges, filters)

        edges["lncrna_symbol"] = edges["lncrna_symbol"].str.upper()
        edges["lncrna_symbol"] = edges["lncrna_symbol"].str.replace("LINC", "", regex=False)
        edges["gene_symbol"] = edges["gene_symbol"].str.upper()

        self.data = self.edges = edges
        lncrna2target_high_throughput_network = nx.from_pandas_edgelist(edges,
                                                                        source=source_col_name,
                                                                        target=target_col_name,
                                                                        edge_attr=edge_attr,
                                                                        create_using=nx.DiGraph() if directed else nx.Graph())
        return lncrna2target_high_throughput_network

    def load_network_low_throughput(self, file_resources, source_col_name="GENCODE_gene_name",
                                    target_col_name="Target_official_symbol",
                                    edge_attr=None, directed=True, filters=None):
        """Build the graph from the low-throughput spreadsheet.

        Rows are selected with `filter_rows`. In 'Target_official_symbol', any-case "mir" is
        rewritten to "hsa-mir-", doubled dashes are collapsed, and the symbol is lower-cased if
        it contains "mir" or upper-cased otherwise. 'GENCODE_gene_name' is upper-cased. The
        cleaned table is stored in ``self.data`` and ``self.edges``.
        """
        edges = pd.read_excel(file_resources["lncRNA_target_from_low_throughput_experiments.xlsx"])
        edges = filter_rows(edges, filters)

        edges["Target_official_symbol"] = edges["Target_official_symbol"].str.replace("(?i)(mir)", "hsa-mir-",
                                                                                      regex=True)
        edges["Target_official_symbol"] = edges["Target_official_symbol"].str.replace("--", "-", regex=False)
        # Keep the result: `apply` returns a new Series and was previously discarded.
        edges["Target_official_symbol"] = edges["Target_official_symbol"].apply(
            lambda x: x.lower() if "mir" in x.lower() else x.upper())
        edges["GENCODE_gene_name"] = edges["GENCODE_gene_name"].str.upper()

        self.data = self.edges = edges
        lncrna2target_low_throughput_network = nx.from_pandas_edgelist(edges,
                                                                       source=source_col_name,
                                                                       target=target_col_name,
                                                                       edge_attr=edge_attr,
                                                                       create_using=nx.DiGraph() if directed else nx.Graph())
        return lncrna2target_low_throughput_network


class lncRNome(Interactions, Database):
    """Loads lncRNome miRNA binding sites as interactions and general gene information as data.

    Default file_resources: {
        "miRNA_binding_sites.txt": "miRNA_binding_sites.txt",
        "general_information.txt": "general_information.txt",
    }
    """

    def __init__(self, path, file_resources, source_col_name='Gene Name', target_col_name='Binding miRNAs',
                 edge_attr=["miRNA Interaction Site", "Transcript ID"], directed=True, relabel_nodes=None,
                 **kwargs):
        """
        Args:
            path (str): folder joined with the default file names when `file_resources` is None.
            file_resources (dict): file name -> file path. Built from `path` when None.
            source_col_name (str): column used as the edge source.
            target_col_name (str): column used as the edge target.
            edge_attr (list): columns attached to each edge.
            directed (bool): forwarded to the parent constructor; `load_network` always builds
                a `nx.DiGraph`.
            relabel_nodes (dict): forwarded unchanged to the parent constructor.
            **kwargs: forwarded unchanged to the parent constructor.
        """
        if file_resources is None:
            file_resources = {
                fname: os.path.join(path, fname)
                for fname in ("miRNA_binding_sites.txt", "general_information.txt")
            }

        super().__init__(path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name,
                         directed=directed, relabel_nodes=relabel_nodes, edge_attr=edge_attr, **kwargs)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Build a directed graph from the miRNA binding sites table.

        'Binding miRNAs' is lower-cased and truncated at "-3p" / "-5p". The `file_resources`,
        `directed`, `filters` and `blocksize` arguments are not used by this method.
        """
        binding_sites = pd.read_table(self.file_resources["miRNA_binding_sites.txt"], header=0)
        print(self.name(), binding_sites.columns.tolist())

        lowered = binding_sites['Binding miRNAs'].str.lower()
        binding_sites['Binding miRNAs'] = lowered
        binding_sites['Binding miRNAs'] = lowered.str.replace("-3p.*|-5p.*", "", regex=True)

        return nx.from_pandas_edgelist(binding_sites, source=source_col_name,
                                       target=target_col_name,
                                       edge_attr=edge_attr,
                                       create_using=nx.DiGraph())

    def load_dataframe(self, file_resources, blocksize=None):
        """Read the selected columns of the general information table.

        The path is taken from ``self.file_resources``; the arguments are not used.
        """
        wanted_columns = ["Gene Name", "Transcript Name", "Transcript Type", "Location", "Strand"]
        return pd.read_table(self.file_resources["general_information.txt"], header=0,
                             usecols=wanted_columns)


class NPInter(Interactions):
    """Loads the NPInter database from http://bigdata.ibp.ac.cn/npinter4/ .

    Default path: "http://bigdata.ibp.ac.cn/npinter4/download/" .
    Default file_resources: {
        "interaction_NPInterv4.expr.txt.gz": "file/interaction_NPInterv4.expr.txt.gz",
    }

    NOTE(review): `load_dataframe` reads the key "interaction_NPInterv4.expr.txt";
    presumably the parent class registers the decompressed file under that name -
    verify against `Database`.
    """
    def __init__(self, path="http://bigdata.ibp.ac.cn/npinter4/download/", file_resources=None,
                 source_col_name='ncName', target_col_name='tarName',
                 edge_attr=["tarType", "tissueOrCell", "tag", 'class', "level"],
                 filters=None,
                 directed=True, relabel_nodes=None, verbose=False):
        """
        Args:
            path (str): base URL or folder joined with the default file name when
                `file_resources` is None.
            file_resources (dict): file name -> file path. Built from `path` when None.
            source_col_name (str): column used as the edge source.
            target_col_name (str): column used as the edge target.
            edge_attr (list): columns attached to each edge.
            filters: row filters, applied with `filter_rows` in `load_network`.
            directed (bool): build a `nx.DiGraph` if True, otherwise a `nx.Graph`.
            relabel_nodes (dict): forwarded unchanged to the parent constructor.
            verbose (bool): forwarded unchanged to the parent constructor.
        """
        if file_resources is None:
            file_resources = {}
            file_resources["interaction_NPInterv4.expr.txt.gz"] = \
                os.path.join(path, "file/interaction_NPInterv4.expr.txt.gz")

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters, directed=directed,
                         relabel_nodes=relabel_nodes, verbose=verbose)

    def load_dataframe(self, file_resources: Dict[str, str], blocksize: int = None) -> pd.DataFrame:
        """Read the NPInter table ("-" is treated as missing) and normalize the name columns.

        'ncName' is upper-cased, a leading "LNCRNA-" is removed, "MALAT-1" becomes "MALAT1",
        and a leading "MIR-" or "MICRORNA-" becomes "hsa-mir-". 'tarName' is upper-cased.
        `blocksize` is not used by this method.
        """
        df = pd.read_table(file_resources["interaction_NPInterv4.expr.txt"], header=0, na_values=["-"])
        print(self.name(), df.columns.tolist())
        df["ncName"] = df["ncName"].str.upper()
        # Remove the prefix only. `str.strip("LNCRNA-")` strips any of those characters from
        # both ends and mangles names such as "CCAT1" or "NEAT1".
        df["ncName"] = df["ncName"].str.replace("^LNCRNA-", "", regex=True)
        df["ncName"] = df["ncName"].str.replace("MALAT-1", "MALAT1", regex=False)
        df["ncName"] = df["ncName"].str.replace("^MIR-", "hsa-mir-", regex=True)
        df["ncName"] = df["ncName"].str.replace("^MICRORNA-", "hsa-mir-", regex=True)

        df["tarName"] = df["tarName"].str.upper()

        return df

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Build the interaction graph from ``self.data`` after applying `filters`."""
        df = self.data
        df = filter_rows(df, filters)

        npinter_network = nx.from_pandas_edgelist(df, source=source_col_name,
                                                  target=target_col_name,
                                                  edge_attr=edge_attr,
                                                  create_using=nx.DiGraph() if directed else nx.Graph())

        return npinter_network


class StarBase(Interactions):
    """Interactions read from the ``starbase_3.0_lncrna_rna_interactions.csv`` table.

    Default file_resources: {
        "starbase_3.0_lncrna_rna_interactions.csv": "starbase_3.0_lncrna_rna_interactions.csv",
    }
    """

    def __init__(self, path, file_resources, source_col_name="geneName", target_col_name="pairGeneName",
                 min_interactionNum=1, min_expNum=1,
                 edge_attr=None, directed=True, relabel_nodes=None, **kwargs):
        """
        Args:
            path (str): folder joined with the default file name when `file_resources` is None.
            file_resources (dict): file name -> file path. Built from `path` when None.
            source_col_name (str): column used as the edge source.
            target_col_name (str): column used as the edge target.
            min_interactionNum (int): rows with a smaller 'interactionNum' are dropped.
            min_expNum (int): rows with a smaller 'expNum' are dropped.
            edge_attr: forwarded to the parent constructor; `load_network` always attaches
                ["interactionNum"].
            directed (bool): forwarded to the parent constructor; `load_network` always builds
                a `nx.DiGraph`.
            relabel_nodes (dict): forwarded unchanged to the parent constructor.
            **kwargs: forwarded unchanged to the parent constructor.
        """
        if file_resources is None:
            file_resources = {}
            file_resources["starbase_3.0_lncrna_rna_interactions.csv"] = \
                os.path.join(path, "starbase_3.0_lncrna_rna_interactions.csv")
        self.min_interactionNum = min_interactionNum
        self.min_expNum = min_expNum
        super().__init__(path, file_resources, source_col_name=source_col_name, target_col_name=target_col_name,
                         directed=directed, relabel_nodes=relabel_nodes, edge_attr=edge_attr, **kwargs)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Build a directed graph from the StarBase table.

        For rows whose 'pairGeneType' is "miRNA", 'pairGeneName' is lower-cased and truncated
        at "-3p" / "-5p". Rows below the `min_interactionNum` / `min_expNum` thresholds are
        dropped. The graph is also stored in ``self.starBase_RNA_RNA_network``. The `edge_attr`,
        `directed`, `filters` and `blocksize` arguments are not used by this method.
        """
        df = pd.read_csv(self.file_resources["starbase_3.0_lncrna_rna_interactions.csv"], header=0)

        is_mirna = df["pairGeneType"] == "miRNA"
        mirna_names = df.loc[is_mirna, "pairGeneName"].str.lower()
        # `regex=True` is required: without it recent pandas treats the pattern as a literal.
        df.loc[is_mirna, "pairGeneName"] = mirna_names.str.replace("-3p.*|-5p.*", "", regex=True)

        df = df[df["interactionNum"] >= self.min_interactionNum]
        df = df[df["expNum"] >= self.min_expNum]

        self.starBase_RNA_RNA_network = nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                                                edge_attr=["interactionNum"],
                                                                create_using=nx.DiGraph())
        return self.starBase_RNA_RNA_network