Source code for openomics.database.interaction

import copy
import os
from abc import abstractmethod
from collections.abc import Iterable
from typing import List, Dict, Any, Union, Optional

import dask.dataframe as dd
import networkx as nx
import pandas as pd
import scipy.sparse as ssp
from Bio import SeqIO
from logzero import logger
from pandas.core.dtypes.common import is_numeric_dtype

from openomics.database.base import Database
from openomics.database.sequence import SequenceDatabase, UniProt
from openomics.transforms.df import filter_rows

__all__ = ['STRING', 'GeneMania', 'IntAct', 'BioGRID', 'MiRTarBase', 'LncBase', 'TargetScan', 'TarBase',
           'LncReg', 'LncRNA2Target', 'lncRNome', 'NPInter', 'RNAInter', 'StarBase']

class Interactions(Database):
    edges: Optional[Union[pd.DataFrame, dd.DataFrame]]

    def __init__(self, path, file_resources: Dict, source_col_name: str = None, target_col_name: str = None,
                 edge_attr: List[str] = None, filters: Union[str, Dict[str, Union[str, List[str]]]] = None,
                 directed: bool = True, relabel_nodes: dict = None, blocksize=None, **kwargs):
        """An abstract class used to instantiate a database given a folder containing various file resources.

        When creating a Database class, the load_data function is called, where the file resources are loaded
        as DataFrames and the necessary processing is performed. This class provides an interface for RNA
        classes to annotate genomic annotations, functional annotations, sequences, and disease associations.

        Args:
            path (str): The folder path containing the data files.
            file_resources (dict): Default None, used to list required files for load_network of the dataset.
                A dictionary where keys are required filenames and values are file paths. If None, the class
                constructor should automatically build the required file resources dict.
            source_col_name (str): Column name of the DataFrame to be used as the source node names.
            target_col_name (str): Column name of the DataFrame to be used as the target node names.
            edge_attr (list): A list of column names to be included as attributes for each edge
                (source-target pair).
            filters (dict): Optional. A dict with keys matching the data table (from load_network()) columns
                and values to filter on those columns.
            directed (bool): default True. Whether to create a directed or an undirected network.
            relabel_nodes (dict): default None. A dictionary to rename nodes in the network, where nodes named
                <dict[key]> will be renamed to <dict[value]>.
            blocksize (int): If provided, load data files with Dask using this block size.
        """
        self.filters = filters
        self.source_col_name = source_col_name
        self.target_col_name = target_col_name
        self.directed = directed
        self.edge_attr = edge_attr
        super().__init__(path=path, file_resources=file_resources, blocksize=blocksize, **kwargs)

        self.network = self.load_network(file_resources=self.file_resources, source_col_name=source_col_name,
                                         target_col_name=target_col_name, edge_attr=edge_attr,
                                         directed=directed, filters=filters, blocksize=blocksize)
        if relabel_nodes is not None:
            self.network = nx.relabel_nodes(self.network, mapping=relabel_nodes)

        self.close()
    @classmethod
    def name(cls):
        return cls.__name__
    @abstractmethod
    def load_network(self, file_resources: Dict, source_col_name: str, target_col_name: str,
                     edge_attr: Union[str, List[str]], directed: bool, filters: Dict[str, Any],
                     blocksize=None) -> nx.Graph:
        """Process the data in `file_resources` into a Pandas DataFrame containing edge list data, then
        construct and return a NetworkX graph.

        Args:
            file_resources: a dict of file names and file paths/objects
            source_col_name (str): column name of the dataframe for the source in each edge
            target_col_name (str): column name of the dataframe for the target in each edge
            edge_attr (list): list of column names whose data to include in each edge
            directed (bool): True to return a DiGraph(), else Graph()
            filters: A dict of {column name: column values} to filter the dataframe
            blocksize (int): If provided, load data files with Dask using this block size.

        Returns:
            network: a NetworkX Graph or DiGraph
        """
        raise NotImplementedError
    def get_interactions(self, nodelist=None, data=False, inclusive=True, relabel_nodes: Dict[str, str] = None):
        """
        Args:
            nodelist (list): A list of nodes to fetch edges from.
            data (bool): default False. Whether to include edge attributes.
            inclusive (bool): default True. Whether to only retrieve edges between nodes in `nodelist`,
                rather than all edges incident to them.
            relabel_nodes (dict): Optional. A mapping to rename nodes before fetching edges.

        Returns:
            edges (OutEdgeView): a NetworkX edgelist
        """
        if not hasattr(self, "network"):
            raise Exception(
                "{} does not have network interaction data yet. Must run load_network() and assign the "
                "self.network field first.".format(self.name()))

        g = self.network
        if relabel_nodes:
            g = nx.relabel_nodes(g, relabel_nodes, copy=False)

        if nodelist is None:
            return g.edges(data=data)

        if inclusive:
            return g.subgraph(nodelist).edges(data=data)
        else:
            return g.edges(nbunch=nodelist, data=data)
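
# Illustrative sketch (not part of the library): a concrete subclass of `Interactions` only needs to
# implement `load_network()` to parse an edge list into a NetworkX graph. The file name "edges.tsv"
# and its columns are hypothetical.
class _ExampleEdgeDB(Interactions):
    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed,
                     filters, blocksize=None):
        # Read a hypothetical tab-separated edge list whose columns match source/target_col_name
        df = pd.read_table(file_resources["edges.tsv"])
        df = filter_rows(df, filters)
        return nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                       edge_attr=edge_attr,
                                       create_using=nx.DiGraph() if directed else nx.Graph())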
class STRING(Interactions, SequenceDatabase):
    """Loads the STRING database from https://string-db.org/ .

    Default path: "https://stringdb-static.org/download/" .
    Default file_resources: {
        "{species_id}.protein.info.txt.gz": f"protein.info.{version}/{species_id}.protein.info.{version}.txt.gz",
        "{species_id}.protein.aliases.txt.gz": f"protein.aliases.{version}/{species_id}.protein.aliases.{version}.txt.gz",
        "{species_id}.protein.links.txt.gz": f"protein.links.{version}/{species_id}.protein.links.{version}.txt.gz",
        "{species_id}.protein.sequences.fa.gz": f"protein.sequences.{version}/{species_id}.protein.sequences.{version}.fa.gz",
    }

    Edge attributes for protein.actions.txt include ["mode", "action", "is_directional", "a_is_acting", "score"].
    Edge attributes for protein.links.txt include ["combined_score"].
    """
    COLUMNS_RENAME_DICT = {
        "#string_protein_id": "string_protein_id",
        "protein_external_id": "protein_id",
        "preferred_name": "gene_name",
        '#ncbi_taxid': 'species_id',
        'string_protein_id_2': 'homologous_protein_id',
    }

    def __init__(self, path="https://stringdb-static.org/download/", file_resources=None,
                 species_id: Union[str, List[str]] = "9606", version="v11.0",
                 source_col_name="protein1", target_col_name="protein2",
                 edge_attr: Union[str, List[str]] = 'combined_score', directed=False, relabel_nodes=None,
                 index_col='#string_protein_id', keys=None,
                 alias_types={'Ensembl_UniProt', 'Ensembl_UniProt_AC'}, blocksize=None, **kwargs):
        """
        Args:
            path (str): The remote or local directory containing the STRING download files.
            file_resources (dict): Default None, in which case the file resources are built from
                `species_id` and `version`.
            species_id (str, list): A species id string, or a list of species id's to download the
                species-specific STRING datasets for and integrate them. If None, download the
                full-dataset version of STRING, which is very time-consuming.
            version (str): The STRING database version, e.g. "v11.0".
            source_col_name (str): Column name of the edge list for source nodes.
            target_col_name (str): Column name of the edge list for target nodes.
            edge_attr (str): Column name to use as the edge weight, e.g. 'combined_score'.
            directed (bool): Whether to create a directed network.
            relabel_nodes (dict): Optional mapping to rename nodes in the network.
            index_col (str): The column to index the protein info table by.
            keys (list): Optional list of index values to select.
            alias_types (set): The `source` values in the protein.aliases table to aggregate aliases for.
            blocksize (int): If provided, load data files with Dask using this block size.
        """
        self.version = version
        self.species_id = copy.copy(species_id)
        self.alias_types = alias_types
        assert isinstance(edge_attr, str)

        if file_resources is None:
            file_resources = {}
            if isinstance(species_id, (Iterable, str)) and len(species_id):
                species_list = [species_id] if isinstance(species_id, str) else species_id
                for species in species_list:
                    file_resources[f"{species}.protein.info.txt.gz"] = \
                        os.path.join(path, f"protein.info.{version}/{species}.protein.info.{version}.txt.gz")
                    file_resources[f"{species}.protein.links.txt.gz"] = \
                        os.path.join(path, f"protein.links.{version}/{species}.protein.links.{version}.txt.gz")
                    file_resources[f"{species}.protein.links.detailed.txt.gz"] = \
                        os.path.join(path, f"protein.links.detailed.{version}/"
                                           f"{species}.protein.links.detailed.{version}.txt.gz")
                    file_resources[f"{species}.protein.homology.txt.gz"] = \
                        os.path.join(path, f"protein.homology.{version}/{species}.protein.homology.{version}.txt.gz")
                    file_resources[f"{species}.clusters.proteins.txt.gz"] = \
                        os.path.join(path, f"clusters.proteins.{version}/{species}.clusters.proteins.{version}.txt.gz")
                    file_resources[f"{species}.protein.aliases.txt.gz"] = \
                        os.path.join(path, f"protein.aliases.{version}/{species}.protein.aliases.{version}.txt.gz")
                    file_resources[f"{species}.enrichment.terms.txt.gz"] = \
                        os.path.join(path, f"enrichment.terms.{version}/{species}.enrichment.terms.{version}.txt.gz")
                    file_resources[f"{species}.protein.sequences.fa.gz"] = \
                        os.path.join(path, f"protein.sequences.{version}/{species}.protein.sequences.{version}.fa.gz")
            else:
                file_resources["protein.info.txt.gz"] = os.path.join(path, f"protein.info.{version}.txt.gz")
                file_resources["protein.links.txt.gz"] = os.path.join(path, f"protein.links.{version}.txt.gz")
                file_resources["protein.sequences.fa.gz"] = os.path.join(path, f"protein.sequences.{version}.fa.gz")
        else:
            if isinstance(self.species_id, Iterable):
                file_resources = {fn: fp for fn, fp in file_resources.items()
                                  if any(fn.startswith(species) for species in self.species_id)}

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, directed=directed,
                         relabel_nodes=relabel_nodes, blocksize=blocksize, index_col=index_col, keys=keys,
                         col_rename=STRING.COLUMNS_RENAME_DICT, **kwargs)

    def load_dataframe(self, file_resources: Dict[str, str], blocksize: int = None) -> pd.DataFrame:
        # Load nodes
        dfs = []
        if blocksize:
            for filename in [fn for fn, path in file_resources.items()
                             if 'info.txt' in fn and isinstance(path, str)]:
                compression = 'gzip' if filename.endswith(".gz") else None
                info_df = dd.read_table(file_resources[filename], na_values=['annotation not available'],
                                        low_memory=True, compression=compression,
                                        dtype={'protein_size': 'int8'},
                                        blocksize=None if isinstance(blocksize, bool) else blocksize)
                if self.keys is not None:
                    info_df = info_df.loc[info_df[self.index_col].isin(self.keys)]
                if self.index_col:
                    info_df = info_df.set_index(self.index_col, sorted=True)

                # Join other attributes to node_info
                species_id = filename.split(".")[0]
                attrs = self.load_accessory_data(file_resources, species_id=species_id,
                                                 alias_types=self.alias_types, blocksize=False)
                if attrs is not None:
                    new_cols = attrs.columns.difference(info_df.columns)
                    info_df = info_df.join(attrs[new_cols], on=self.index_col)

                dfs.append(info_df)
        else:
            for filename in file_resources:
                if filename.endswith("protein.info.txt"):
                    info_df = pd.read_table(file_resources[filename], na_values=['annotation not available'],
                                            dtype={'protein_size': 'int8'}, index_col=self.index_col,
                                            low_memory=True)
                    # When index_col consumes '#string_protein_id', split the ids from the index instead
                    # (the original indexed a column that read_table had already moved to the index)
                    ids = info_df['#string_protein_id'] if '#string_protein_id' in info_df.columns \
                        else info_df.index.to_series()
                    index_split = ids.str.split(".", expand=True, n=1)
                    info_df = info_df.assign(species_id=index_split[0], protein_embl_id=index_split[1])

                    # Join other attributes to node_info
                    species_id = filename.split(".")[0]
                    attrs = self.load_accessory_data(file_resources, species_id=species_id,
                                                     alias_types=self.alias_types, blocksize=blocksize)
                    if attrs is not None:
                        new_cols = attrs.columns.difference(info_df.columns)
                        info_df = info_df.join(attrs[new_cols], on=self.index_col)

                    dfs.append(info_df)

        if not len(dfs):
            raise Exception("Must provide at least one 'protein.info.txt' file.")

        if blocksize:
            protein_info: dd.DataFrame = dd.concat(dfs, axis=0, interleave_partitions=True)
        else:
            protein_info = pd.concat(dfs, axis=0)

        return protein_info

    def load_accessory_data(self, file_resources: Dict[str, str], species_id: str,
                            accessory_files=['protein.aliases', 'protein.homology', 'protein.enrichment',
                                             'clusters.proteins'],
                            alias_types={'Ensembl_UniProt', 'Ensembl_UniProt_AC'},
                            blocksize=False) -> Union[pd.DataFrame, dd.DataFrame]:
        """Stack the annotation files for the provided `species_id`, such that rows in the annotations are
        filtered by `keys` (if not null), indexed by "#string_protein_id", and with attributes transformed
        into dataframe columns.

        Args:
            file_resources (dict): a dict of file names and file paths
            species_id (str): the species id string, used to select only files with the same prefix.
            accessory_files (List[str]): A list of strings that specify which types of annotation files to
                integrate, i.e., only files having a substring matching one of these are selected.
                Default ['protein.aliases', 'protein.homology', 'protein.enrichment', 'clusters.proteins'].
            alias_types (set): default {'Ensembl_UniProt', 'Ensembl_UniProt_AC'}.
                A set of `source` values in the `protein.aliases` annotation to aggregate `alias` values for.
                Must be a subset of {'Ensembl_Source', 'Ensembl_gene', 'Ensembl_transcript',
                'Ensembl_UniGene', 'Ensembl_RefSeq_short', 'Ensembl_RefSeq', 'Ensembl_OTTG', 'Ensembl_OTTP',
                'Ensembl_UCSC', 'Ensembl_UniProt', 'Ensembl_UniProt_AC', 'Ensembl_EntrezGene',
                'Ensembl_EMBL', 'Ensembl_protein_id'}.
            blocksize (bool): It is recommended to use Pandas (i.e. False) to avoid unnecessary overhead.

        Returns:
            dd.DataFrame or pd.DataFrame
        """
        allowed_prefixes = {'protein.aliases', 'protein.homology', 'protein.enrichment', 'clusters.proteins'}
        if not set(accessory_files).issubset(allowed_prefixes):
            logger.warn(f'{set(accessory_files).difference(allowed_prefixes)} files are not supported')

        select_files = []
        for fn, path in file_resources.items():
            if fn.startswith(species_id) and any(ftype in fn for ftype in accessory_files):
                select_files.append(fn)

        dfs = []
        for filename in select_files:
            args = dict(low_memory=True,
                        dtype={'cluster_id': 'category', '#ncbi_taxid': 'category',
                               'category': 'category', 'source': 'category'})
            compression = 'gzip' if filename.endswith(".gz") else None
            if blocksize:
                if not isinstance(file_resources[filename], str):
                    continue
                df = dd.read_table(file_resources[filename], compression=compression, **args)
            else:
                df = pd.read_table(file_resources[filename], **args)

            # Set the index for df
            for col in ['#string_protein_id', 'protein_id', '#string_protein_1']:
                if col in df.columns:
                    df = df.set_index(col, sorted=True) if blocksize else df.set_index(col)
                    break

            if df.index.name is None:
                continue
            elif self.index_col and df.index.name != self.index_col:
                df.index = df.index.rename(self.index_col)
            if blocksize:
                assert df.known_divisions

            # Filter rows
            if self.keys is not None:
                df = df.loc[df.index.isin(self.keys)]

            # Group by the index and perform the appropriate transform depending on the annotation type
            if 'protein.homology' in filename:
                df = df.loc[df.index != df['string_protein_id_2']]
                df = df.groupby(self.index_col)['string_protein_id_2'].unique().to_frame()
                # TODO: ignored column of size of homologous regions
            elif 'clusters.protein' in filename:
                df = df.groupby(self.index_col)[['cluster_id', '#ncbi_taxid']].unique()
            elif 'protein.enrichment' in filename:
                df = df.groupby(self.index_col)['term'].unique().to_frame()
            elif 'protein.aliases' in filename:
                df = df.loc[df['source'].isin(alias_types)]
                df['source'] = df['source'].cat.set_categories(alias_types)
                if blocksize:
                    # Set alias values to lists so pivot_table(..., aggfunc='sum') will concatenate them
                    df = df.assign(alias=df['alias'].map(lambda x: [x], meta=pd.Series([[""]])))
                    df = dd.pivot_table(df.reset_index(), index='#string_protein_id', columns='source',
                                        values='alias', aggfunc='sum')
                else:
                    df = df.reset_index().groupby([self.index_col, 'source'])['alias'].unique().unstack(level=1)

            if blocksize and not df.known_divisions:
                df.divisions = df.compute_current_divisions()

            if not len(df.index):
                continue
            dfs.append(df)

        if dfs:
            attrs = dd.concat(dfs, axis=1) if blocksize else pd.concat(dfs, axis=1)
        else:
            attrs = None

        return attrs
    def load_network(self, file_resources, source_col_name='protein1', target_col_name='protein2',
                     edge_attr: Union[str, List[str]] = 'combined_score', directed=False, filters=None,
                     blocksize=None):
        keys = self.data.index.compute() if isinstance(self.data, dd.DataFrame) else self.data.index
        select_files = [fn for fn, path in file_resources.items() if "links" in fn]

        # Load edges. STRING scores are 3-digit integers in [0, 1000], so 'uint16' is required
        # (the original 'uint8' dtype would overflow at 255).
        edges_dfs = []
        for filename in select_files:
            args = dict(sep=" ", low_memory=True,
                        dtype={'protein1': 'category', 'protein2': 'category',
                               'neighborhood': 'uint16', 'fusion': 'uint16', 'cooccurence': 'uint16',
                               'coexpression': 'uint16', 'experimental': 'uint16', 'database': 'uint16',
                               'textmining': 'uint16', 'combined_score': 'uint16'})
            if blocksize:
                if not isinstance(file_resources[filename], str):
                    continue
                compression = 'gzip' if filename.endswith(".gz") else None
                df: dd.DataFrame = dd.read_table(file_resources[filename], compression=compression, **args,
                                                 blocksize=None if isinstance(blocksize, bool) else blocksize)
                if compression:
                    logger.info(f"Repartitioning {filename} from {df.npartitions} "
                                f"partitions to {blocksize}-size partitions")
                    df = df.repartition(partition_size=blocksize)
            else:
                df = pd.read_table(file_resources[filename], **args)

            df = df.loc[df[source_col_name].isin(keys) & df[target_col_name].isin(keys)]
            edges_dfs.append(df)

        if len(edges_dfs) == 0:
            return

        # Concatenate multiple edge lists into one dataframe
        edges_df = dd.concat(edges_dfs, axis=0) if blocksize else pd.concat(edges_dfs, axis=0)
        edges_df = edges_df.rename(columns=self.COLUMNS_RENAME_DICT)
        logger.info(f"{self.name()}-{self.species_id}: {edges_df.columns.tolist()}, {edges_df.shape}")

        # Convert edge_attr (edge weights) from 3-digit integers to floats in [0, 1]
        assignfunc = {}
        for col in (edge_attr if isinstance(edge_attr, list) else [edge_attr]):
            if col in edges_df.columns and is_numeric_dtype(edges_df[col]):
                assignfunc[col] = edges_df[col].astype('float16') / 1000
        if assignfunc:
            edges_df = edges_df.assign(**assignfunc)

        edges_df = filter_rows(edges_df, filters=filters)
        self.edges = edges_df

        # Set the ordering for rows and columns
        node2idx = {node: i for i, node in enumerate(keys)}

        if isinstance(edges_df, dd.DataFrame):
            def edgelist2adj(df: pd.DataFrame) -> ssp.coo_matrix:
                # Skip Dask's metadata placeholder partition
                if df.shape[0] == 1 and df.iloc[0, 0] == 'foo':
                    return None
                df = df.assign(row=df[source_col_name].map(node2idx).astype('int'),
                               col=df[target_col_name].map(node2idx).astype('int'))
                df = df.dropna(subset=['row', 'col'])
                if df.shape[0] == 0:
                    return None
                coo_adj = ssp.coo_matrix((df[edge_attr], (df['row'], df['col'])),
                                         shape=(len(keys), len(keys)))
                coo_adj.eliminate_zeros()
                return coo_adj

            # Create a sparse adjacency matrix for each partition, then add them to combine
            adj = edges_df.reduction(chunk=edgelist2adj,
                                     aggregate=lambda x: x.dropna().sum() if not x.isna().all() else None,
                                     meta=pd.Series([ssp.coo_matrix])).compute()
            assert len(adj) == 1, f"len(adj) = {len(adj)}"

            G = nx.from_scipy_sparse_matrix(adj[0], create_using=nx.DiGraph() if directed else nx.Graph(),
                                            edge_attribute='weight')
            idx2node = {i: node for i, node in enumerate(keys)}
            G = nx.relabel_nodes(G, mapping=idx2node, copy=True)
            del adj
        else:
            # Determine which edge attrs to add
            if isinstance(edge_attr, (list, tuple)):
                cols = edges_df.columns.intersection(edge_attr + [source_col_name, target_col_name])
                edges_df = edges_df[cols]
                use_attrs = True
            elif isinstance(edge_attr, str):
                cols = edges_df.columns.intersection([source_col_name, target_col_name, edge_attr])
                edges_df = edges_df[cols]
                use_attrs = edge_attr
            else:
                use_attrs = False

            G = nx.from_pandas_edgelist(edges_df, source=source_col_name, target=target_col_name,
                                        edge_attr=use_attrs,
                                        create_using=nx.DiGraph() if directed else nx.Graph())

        return G
    def get_sequences(self, index="protein_id", omic=None, agg=None):
        if hasattr(self, "seq_dict"):
            return self.seq_dict

        self.seq_dict = {}
        collisions = 0
        for record in SeqIO.parse(self.file_resources["protein.sequences.fa"], "fasta"):
            gene_id = str(record.name)
            sequence_str = str(record.seq)

            if index == "protein_name":
                key = self.protein_id2name[gene_id]
            elif index == "protein_id":
                key = gene_id
            else:
                # Guard against an unbound `key` (the original left it undefined for other index values)
                raise ValueError("`index` must be one of 'protein_name' or 'protein_id'")

            if key in self.seq_dict:
                collisions += 1
            self.seq_dict[key] = sequence_str

        logger.warn("Seq {} collisions: {}".format(index, collisions))
        return self.seq_dict
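
# Usage sketch (illustrative, not part of the original module): build a human STRING network with the
# default arguments, then read its weighted edge list. Downloading the v11.0 files can take a while.
def _example_string_usage():
    string_db = STRING(species_id="9606", version="v11.0", edge_attr='combined_score')
    # Edge weights are the 'combined_score' values rescaled from 3-digit integers to floats
    return string_db.get_interactions(data=True)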
class GeneMania(Interactions):
    """Loads the GeneMania database from https://genemania.org .

    Default path: local_directory .
    Default file_resources: {
        "COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt": "COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt",
        "identifier_mappings.txt": "identifier_mappings.txt",
    }
    """

    def __init__(self, path, file_resources=None, source_col_name="Gene_A", target_col_name="Gene_B",
                 edge_attr=None, filters=None, directed=True, relabel_nodes=None, **kwargs):
        if edge_attr is None:
            edge_attr = ["Weight"]
        if file_resources is None:
            file_resources = {}
            file_resources["COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt"] = \
                os.path.join(path, "COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt")
            file_resources["identifier_mappings.txt"] = os.path.join(path, "identifier_mappings.txt")

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters,
                         directed=directed, relabel_nodes=relabel_nodes, **kwargs)
    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        interactions = pd.read_table(file_resources["COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt"],
                                     low_memory=True)
        identifier = pd.read_table(file_resources["identifier_mappings.txt"])

        # Rename Ensembl gene IDs to gene names
        identifier = identifier[identifier["Source"] == "Gene Name"]
        id_mapping = pd.Series(identifier["Name"].values, index=identifier["Preferred_Name"]).to_dict()
        interactions.replace(id_mapping, inplace=True)

        genemania_RNA_RNA_network = nx.from_pandas_edgelist(interactions, source=source_col_name,
                                                            target=target_col_name, edge_attr=edge_attr,
                                                            create_using=nx.DiGraph())
        return genemania_RNA_RNA_network
class IntAct(Interactions):
    def __init__(self, path, file_resources: Dict, source_col_name: str = None, target_col_name: str = None,
                 source_index: str = None, target_index: str = None, edge_attr: List[str] = None,
                 filters: dict = None, directed: bool = True, relabel_nodes: dict = None, blocksize=None,
                 **kwargs):
        super().__init__(path, file_resources, source_col_name, target_col_name, edge_attr, filters,
                         directed, relabel_nodes, blocksize, **kwargs)
class BioGRID(Interactions):
    """Loads the BioGRID database from https://thebiogrid.org .

    Default path: "https://downloads.thebiogrid.org/Download/BioGRID/Latest-Release/" .
    Default file_resources: {
        "BIOGRID-ALL-LATEST.tab2.zip": "BIOGRID-ALL-LATEST.tab2.zip",
    }
    """

    def __init__(self, path="https://downloads.thebiogrid.org/Download/BioGRID/Latest-Release/",
                 file_resources=None, source_col_name="Official Symbol Interactor A",
                 target_col_name="Official Symbol Interactor B",
                 edge_attr=['Score', 'Throughput', 'Experimental System', 'Experimental System Type'],
                 filters=None, directed=False, relabel_nodes=None, **kwargs):
        """
        Args:
            path (str): The remote or local directory containing the BioGRID download files.
            file_resources (dict): Default None, in which case the latest tab2 release is used.
            source_col_name (str): Column name for source nodes.
            target_col_name (str): Column name for target nodes.
            edge_attr (list): Column names to include as edge attributes.
            filters (dict): Default None. Example: {"Organism Interactor A": 9606}.
            directed (bool): Whether to create a directed network.
            relabel_nodes (dict): Optional mapping to rename nodes in the network.
            **kwargs: Passed to `Interactions.__init__`.
        """
        if file_resources is None:
            file_resources = {}
            file_resources["BIOGRID-ALL-LATEST.tab2.zip"] = os.path.join(path, "BIOGRID-ALL-LATEST.tab2.zip")

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters,
                         directed=directed, relabel_nodes=relabel_nodes, **kwargs)

    def load_dataframe(self, file_resources: Dict[str, str], blocksize: int = None) -> pd.DataFrame:
        args = dict(na_values=["-"], header=0, low_memory=True,
                    # usecols=['Official Symbol Interactor A', 'Official Symbol Interactor B',
                    #          'Organism Interactor A', 'Score', 'Throughput', 'Qualifications',
                    #          'Modification', 'Phenotypes', 'Source Database'],
                    dtype={'Score': 'float',
                           'Entrez Gene Interactor A': 'category', 'Entrez Gene Interactor B': 'category',
                           'BioGRID ID Interactor A': 'category', 'BioGRID ID Interactor B': 'category',
                           'Systematic Name Interactor A': 'category',
                           'Systematic Name Interactor B': 'category',
                           'Official Symbol Interactor A': 'category',
                           'Official Symbol Interactor B': 'category',
                           'Pubmed ID': 'str', 'Throughput': 'category',
                           'Experimental System Type': 'category', 'Experimental System': 'category',
                           'Modification': 'category', 'Source Database': 'category',
                           'Organism Interactor A': 'category', 'Organism Interactor B': 'category'})
        if blocksize:
            edges = dd.read_table(file_resources["BIOGRID-ALL-LATEST.tab2"], blocksize=blocksize, **args)
        else:
            edges = pd.read_table(file_resources["BIOGRID-ALL-LATEST.tab2"], **args)

        self.edges = edges
        return edges
    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        df = self.edges
        df = filter_rows(df, filters)

        network = nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                          edge_attr=edge_attr,
                                          create_using=nx.DiGraph() if directed else nx.Graph())
        return network
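
# Usage sketch (illustrative): restrict BioGRID to human-human interactions via `filters`. Whether the
# organism values compare as ints or strings depends on the tab2 file's dtypes (categories of strings
# here), so the "9606" values are an assumption.
def _example_biogrid_usage():
    biogrid = BioGRID(filters={"Organism Interactor A": "9606", "Organism Interactor B": "9606"})
    return biogrid.get_interactions(data=True)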
class MiRTarBase(Interactions):
    """Loads the miRTarBase database from http://mirtarbase.mbc.nctu.edu.tw .

    Default path: "http://mirtarbase.mbc.nctu.edu.tw/cache/download/7.0/" .
    Default file_resources: {
        "miRTarBase_MTI.xlsx": "miRTarBase_MTI.xlsx",
    }
    """

    def __init__(self, path="http://mirtarbase.mbc.nctu.edu.tw/cache/download/7.0/", file_resources=None,
                 source_col_name="miRNA", target_col_name="Target Gene", edge_attr=None, filters=None,
                 directed=True, relabel_nodes=None, strip_mirna_name=False, **kwargs):
        """
        Args:
            path (str): The remote or local directory containing the miRTarBase download files.
            file_resources (dict): Default None, in which case "miRTarBase_MTI.xlsx" is used.
            source_col_name (str): Column name for source nodes (miRNAs).
            target_col_name (str): Column name for target nodes (target genes).
            edge_attr (list): Default ["Support Type"].
            filters (dict): default None. Example: {"Species (Target Gene)": "Homo sapiens"}.
            directed (bool): Whether to create a directed network.
            relabel_nodes (dict): Optional mapping to rename nodes in the network.
            strip_mirna_name (bool): Whether to normalize miRNA names by lower-casing and stripping
                -3p/-5p suffixes.
            **kwargs: Passed to `Interactions.__init__`.
        """
        if edge_attr is None:
            edge_attr = ["Support Type"]
        self.strip_mirna_name = strip_mirna_name

        if file_resources is None:
            file_resources = {}
            file_resources["miRTarBase_MTI.xlsx"] = os.path.join(path, "miRTarBase_MTI.xlsx")

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters,
                         directed=directed, relabel_nodes=relabel_nodes, **kwargs)

    def load_dataframe(self, file_resources: Dict[str, str], blocksize: int = None) -> pd.DataFrame:
        df = pd.read_excel(self.file_resources["miRTarBase_MTI.xlsx"])
        self.edges = df
        return df
    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        df = self.data
        df = filter_rows(df, filters)
        df['miRNA'] = df['miRNA'].str.rstrip('*')
        if self.strip_mirna_name:
            df['miRNA'] = df['miRNA'].str.lower().str.replace("-3p.*|-5p.*", "", regex=True)

        mir_target_network = nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                                     edge_attr=edge_attr,
                                                     create_using=nx.DiGraph() if directed else nx.Graph())
        return mir_target_network
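
# Usage sketch (illustrative): load human miRNA-target interactions with normalized miRNA names.
def _example_mirtarbase_usage():
    mirtarbase = MiRTarBase(filters={"Species (Target Gene)": "Homo sapiens"}, strip_mirna_name=True)
    return mirtarbase.get_interactions(data=True)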
class LncBase(Interactions, Database):
    """Loads the LncBase database from http://carolina.imis.athena-innovation.gr/diana_tools/web/index.php?r=lncbasev2%2Findex .

    Default path: local_directory .
    Default file_resources: {
        "LncBasev2_download.csv": "LncBasev2_download.csv",
    }
    """

    def __init__(self, path='https://dianalab.e-ce.uth.gr/downloads/', file_resources=None,
                 strip_mirna_name=False, source_col_name="mirna", target_col_name="geneId",
                 edge_attr=None, filters=None, directed=True, relabel_nodes=None):
        """
        Args:
            path (str): The remote or local directory containing the LncBase download files.
            file_resources (dict): Default None, in which case "lncbase_v2_exp_data.tar.gz" is used.
            strip_mirna_name (bool): Whether to normalize miRNA names by lower-casing and stripping
                -3p/-5p suffixes.
            source_col_name (str): Column name for source nodes (miRNAs).
            target_col_name (str): Column name for target nodes (gene ids).
            edge_attr (list): Default ["tissue", "positive_negative"].
            filters (dict): default None. Example: {"species": "Homo sapiens"}.
            directed (bool): Whether to create a directed network.
            relabel_nodes (dict): Optional mapping to rename nodes in the network.
        """
        self.strip_mirna_name = strip_mirna_name
        if edge_attr is None:
            edge_attr = ["tissue", "positive_negative"]
        if file_resources is None:
            file_resources = {}
            file_resources["LncBasev2_download.csv"] = os.path.join(path, "lncbase_v2_exp_data.tar.gz")

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters,
                         directed=directed, relabel_nodes=relabel_nodes)
    def get_rename_dict(self, from_index="geneId", to_index="geneName"):
        lncbase_df = pd.read_table(self.file_resources["LncBasev2_download.csv"], low_memory=True)
        gene_id_to_gene_name_dict = pd.Series(lncbase_df["geneName"].values,
                                              index=lncbase_df["geneId"]).to_dict()
        return gene_id_to_gene_name_dict
    def load_dataframe(self, file_resources: Dict[str, str], blocksize: int = None) -> pd.DataFrame:
        df = pd.read_table(file_resources["LncBasev2_download.csv"], low_memory=True)
        df.replace({"species": {"Homo Sapiens": "Homo sapiens", "Mus Musculus": "Mus musculus"}},
                   inplace=True)
        return df
    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        df = self.data
        df = filter_rows(df, filters)

        if self.strip_mirna_name:
            df['mirna'] = df['mirna'].str.lower()
            df['mirna'] = df['mirna'].str.replace("-3p.*|-5p.*", "", regex=True)
        if edge_attr is None:
            edge_attr = ["tissue", "positive_negative"]

        lncBase_lncRNA_miRNA_network = nx.from_pandas_edgelist(
            df, source=source_col_name, target=target_col_name, edge_attr=edge_attr,
            create_using=nx.DiGraph() if directed else nx.Graph())
        return lncBase_lncRNA_miRNA_network
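
# Usage sketch (illustrative): relabel LncBase gene ids to gene names when fetching edges.
def _example_lncbase_usage():
    lncbase = LncBase(filters={"species": "Homo sapiens"})
    id2name = lncbase.get_rename_dict(from_index="geneId", to_index="geneName")
    return lncbase.get_interactions(data=True, relabel_nodes=id2name)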
class TarBase(Interactions):
    """Loads the TarBase v8 database from https://dianalab.e-ce.uth.gr/downloads .

    Default file_resources: {
        'tarbase_v8_data.tar.gz': 'https://dianalab.e-ce.uth.gr/downloads/tarbase_v8_data.tar.gz',
        'speclist': 'https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/speclist',
    }
    """

    def __init__(self, path='https://dianalab.e-ce.uth.gr/downloads', file_resources: Dict = None,
                 source_col_name: str = 'mirna', target_col_name: str = 'geneName',
                 edge_attr: List[str] = None, filters: Union[str, Dict[str, Union[str, List[str]]]] = None,
                 directed: bool = True, relabel_nodes: dict = None, blocksize=None, **kwargs):
        """
        Args:
            path (str): The remote or local directory containing the TarBase download files.
            file_resources (dict): Default None, in which case the v8 data and the UniProt species list
                are used.
            source_col_name (str): Column name for source nodes (miRNAs).
            target_col_name (str): Column name for target nodes (gene names).
            edge_attr (list): Column names to include as edge attributes.
            filters (dict): Optional filters on the data table columns.
            directed (bool): Whether to create a directed network.
            relabel_nodes (dict): Optional mapping to rename nodes in the network.
            blocksize (int): If provided, load data files with Dask using this block size.
            **kwargs: Passed to `Interactions.__init__`.
        """
        if file_resources is None:
            file_resources = {
                'tarbase_v8_data.tar.gz': 'https://dianalab.e-ce.uth.gr/downloads/tarbase_v8_data.tar.gz',
                'speclist': 'https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/speclist',
            }

        super().__init__(path, file_resources, source_col_name, target_col_name, edge_attr, filters,
                         directed, relabel_nodes, blocksize, **kwargs)
    def load_dataframe(self, file_resources: Dict[str, str], blocksize: int = None) -> pd.DataFrame:
        edges = pd.read_table(file_resources['tarbase_v8_data.tar.gz'], compression='tar',
                              dtype={'tissue': 'category', 'method': 'category',
                                     'positive_negative': 'category', 'species': 'category',
                                     'direct_indirect': 'category', 'up_down': 'category',
                                     'cell_line': 'category'})

        if 'speclist' in file_resources:
            # Map species scientific/common names and synonyms to NCBI taxonomy ids
            species_df = UniProt.get_species_list(file_resources['speclist'])
            species_df = species_df[['Official (scientific) name', 'Common name', 'Synonym']] \
                .melt(ignore_index=False)
            species_df = species_df.dropna().reset_index()
            species_name2id = species_df.set_index('value')['NCBI-taxon'].to_dict()
            edges['species_id'] = edges['species'].map(species_name2id)

        self.edges = edges
        return edges
    def load_network(self, file_resources: Dict, source_col_name: str, target_col_name: str,
                     edge_attr: List[str], directed: bool, filters: Dict[str, Any], blocksize=None):
        df = self.data
        df = filter_rows(df, filters)

        # Remove a trailing parenthesized 3-letter species code, e.g. "(hsa)"
        df['geneName'] = df['geneName'].str.replace(r'(\(\w{3}\)){1}$', '', regex=True)
        # Remove "(1 of 2)"-style qualifiers
        idx = df['geneName'].str.contains(r'\(')
        df.loc[idx, 'geneName'] = df.loc[idx, 'geneName'] \
            .str.replace(r'(\(\d of \d\))', '', regex=True).str.strip()
        # Keep the symbol inside parentheses when the name is of the form "(SYMBOL)rest"
        idx = df['geneName'].str.contains(r"\(\w*\)", regex=True)
        df.loc[idx, 'geneName'] = df.loc[idx, 'geneName'].str.extract(r'\((\w*)\)(\w*)')[0]
        # Drop any remaining unclosed parenthesized suffix
        idx = df['geneName'].str.contains(r'\(')
        df.loc[idx, 'geneName'] = df.loc[idx, 'geneName'].str.split('(', expand=True)[0]

        g = nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name, edge_attr=edge_attr,
                                    create_using=nx.DiGraph() if directed else nx.Graph())
        return g
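
# Worked example (illustrative): what the geneName cleanup above does to a few hypothetical values.
def _example_tarbase_genename_cleanup():
    names = pd.Series(['TP53(hsa)', 'SOX9 (1 of 2)', '(MYC)alt', 'BRCA1(extra'])
    names = names.str.replace(r'(\(\w{3}\)){1}$', '', regex=True)             # 'TP53(hsa)'   -> 'TP53'
    names = names.str.replace(r'(\(\d of \d\))', '', regex=True).str.strip()  # 'SOX9 (1 of 2)' -> 'SOX9'
    idx = names.str.contains(r'\(\w*\)', regex=True)
    names.loc[idx] = names.loc[idx].str.extract(r'\((\w*)\)(\w*)')[0]         # '(MYC)alt'    -> 'MYC'
    idx = names.str.contains(r'\(')
    names.loc[idx] = names.loc[idx].str.split('(', expand=True)[0]            # 'BRCA1(extra' -> 'BRCA1'
    return names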
class RNAInter(Interactions):
    """Loads the RNAInter database from http://www.rnainter.org .

    Default path: "http://www.rnainter.org/raidMedia/download/" .
    Default file_resources: {
        'Download_data_RR.tar.gz': 'Download_data_RR.tar.gz',
        'Download_data_RP.tar.gz': 'Download_data_RP.tar.gz',
    }
    """

    def __init__(self, path='http://www.rnainter.org/raidMedia/download/', file_resources: Dict = None,
                 source_col_name: str = 'Interactor1.Symbol', target_col_name: str = 'Interactor2.Symbol',
                 edge_attr: Union[str, List[str]] = 'score',
                 filters: Union[str, Dict[str, Union[str, List[str]]]] = None, directed: bool = True,
                 relabel_nodes: dict = None, blocksize=None, **kwargs):
        """
        Args:
            path (str): The remote or local directory containing the RNAInter download files.
            file_resources (dict): Default None, in which case the RNA-RNA and RNA-protein edge files
                are used.
            source_col_name (str): Column name for source nodes.
            target_col_name (str): Column name for target nodes.
            edge_attr (str): Default 'score'.
            filters (dict): Optional filters on the data table columns.
            directed (bool): Whether to create a directed network.
            relabel_nodes (dict): Optional mapping to rename nodes in the network.
            blocksize (int): If provided, load data files with Dask using this block size.
            **kwargs: Passed to `Interactions.__init__`.
        """
        if file_resources is None:
            file_resources = {
                'Download_data_RR.tar.gz': 'Download_data_RR.tar.gz',
                'Download_data_RP.tar.gz': 'Download_data_RP.tar.gz',
            }

        super().__init__(path, file_resources, source_col_name, target_col_name, edge_attr, filters,
                         directed, relabel_nodes, blocksize, **kwargs)
    def load_dataframe(self, file_resources: Dict, blocksize: int = None) -> pd.DataFrame:
        args = dict(dtype={'Category1': 'category', 'Category2': 'category', 'Species1': 'category',
                           'Species2': 'category', 'score': 'float', 'predict': 'category',
                           'weak': 'category', 'strong': 'category'})
        edge_files = (fn for fn in file_resources if fn.startswith('Download_data'))

        # Collect all edge files, then concatenate (the original loop kept only the last file read)
        dfs = []
        for fn in edge_files:
            if blocksize:
                if not isinstance(file_resources[fn], str):
                    continue
                dfs.append(dd.read_table(file_resources[fn],
                                         compression='tar' if fn.endswith('.tar.gz') else None, **args))
            else:
                dfs.append(pd.read_table(file_resources[fn],
                                         compression='tar' if fn.endswith('.tar.gz') else None, **args))

        edges = dd.concat(dfs, axis=0) if blocksize else pd.concat(dfs, axis=0)
        edges = filter_rows(edges, self.filters)

        self.edges = edges
        return edges
    def load_network(self, file_resources, source_col_name='Interactor1.Symbol',
                     target_col_name='Interactor2.Symbol', edge_attr='score', directed=True, filters=None,
                     blocksize=None):
        edges = self.data
        if filters != self.filters:
            edges = filter_rows(edges, filters)

        g = nx.from_pandas_edgelist(edges, source=source_col_name, target=target_col_name,
                                    edge_attr=edge_attr,
                                    create_using=nx.DiGraph() if directed else nx.Graph())
        return g
class TargetScan(Interactions, Database):
    """Loads the TargetScan database from http://www.targetscan.org/ .

    Default path: "http://www.targetscan.org/vert_72/vert_72_data_download/" .
    Default file_resources: {
        "miR_Family_Info.txt.zip": "miR_Family_Info.txt.zip",
        "Predicted_Targets_Info.default_predictions.txt": "Predicted_Targets_Info.default_predictions.txt",
    }
    """

    def __init__(self, path="http://www.targetscan.org/vert_72/vert_72_data_download/", file_resources=None,
                 source_col_name="MiRBase ID", target_col_name="Gene Symbol",
                 edge_attr=["tissue", "positive_negative"], directed=True, relabel_nodes=None,
                 species_id=None, strip_mirna_name=False, **kwargs):
        self.strip_mirna_name = strip_mirna_name
        self.species_id = species_id
        if file_resources is None:
            file_resources = {}
            file_resources["miR_Family_Info.txt.zip"] = os.path.join(path, "miR_Family_Info.txt.zip")
            file_resources["Predicted_Targets_Info.default_predictions.txt"] = \
                os.path.join(path, "Predicted_Targets_Info.default_predictions.txt")

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, directed=directed, relabel_nodes=relabel_nodes,
                         edge_attr=edge_attr, **kwargs)
    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        self.df = self.process_miR_family_info_table(file_resources, self.species_id)
        interactions_df = self.process_interactions_table(file_resources, self.df, self.species_id)
        print(self.name(), interactions_df.columns.tolist())

        mir_target_network = nx.from_pandas_edgelist(interactions_df, source=source_col_name,
                                                     target=target_col_name, edge_attr=edge_attr,
                                                     create_using=nx.DiGraph() if directed else nx.Graph())
        return mir_target_network
    def process_miR_family_info_table(self, file_resources, species=None):
        miR_Family_Info_df = pd.read_table(file_resources["miR_Family_Info.txt"], delimiter='\t')

        if species:
            miR_Family_Info_df = miR_Family_Info_df[miR_Family_Info_df['Species ID'] == species]

        # Standardize MiRBase ID to miRNA names obtained from RNA-seq hg19
        if self.strip_mirna_name:
            miR_Family_Info_df['MiRBase ID'] = miR_Family_Info_df['MiRBase ID'].str.lower()
            miR_Family_Info_df['MiRBase ID'] = miR_Family_Info_df['MiRBase ID'] \
                .str.replace("-3p.*|-5p.*", "", regex=True)

        miR_Family_Info_df.drop_duplicates(inplace=True)
        miR_Family_Info_df = miR_Family_Info_df.filter(
            items=['miR family', 'MiRBase ID', 'Seed+m8', 'Mature sequence', 'Family Conservation?',
                   'MiRBase Accession'],
            axis="columns")
        miR_Family_Info_df['MiRBase ID'] = miR_Family_Info_df['MiRBase ID'].astype(str)
        return miR_Family_Info_df
    def process_interactions_table(self, file_resources, family_to_miR_df, species_id):
        """Join the interactions table between miR families and their target genes with the miR family
        info table, mapping each miR family to its MiRBase IDs.

        Args:
            file_resources: a dict of file names and file paths/objects
            family_to_miR_df: the output of `process_miR_family_info_table`
            species_id: if provided, select only miRNA-target pairs of this species

        Returns:
            mir_interactions_df: a DataFrame of (MiRBase ID, Gene Symbol) interactions
        """
        # Load the data frame from file
        family_interactions_df = pd.read_table(file_resources["Predicted_Targets_Info.default_predictions.txt"],
                                               dtype={'Species ID': 'category'}, delimiter='\t',
                                               low_memory=True)
        # Select only miRNA-target pairs of a certain species_id
        if species_id:
            family_interactions_df = family_interactions_df[family_interactions_df["Species ID"] == species_id]

        family_interactions_df = family_interactions_df.filter(items=["miR Family", "Gene Symbol"],
                                                               axis="columns")
        family_to_miR_df = family_to_miR_df.filter(items=['miR family', 'MiRBase ID'], axis="columns")
        family_to_miR_df = family_to_miR_df.rename(columns={'miR family': 'miR Family'})

        # Map MiRBase ID names to miR Family
        # family_interactions_df = pd.merge(family_interactions_df, family_to_miR_df, how='outer',
        #                                   on="miR Family")
        # Note: the original called `set_genes_index`, which is not a DataFrame method; `set_index`
        # is what the join below requires.
        family_to_miR_df = family_to_miR_df.set_index("miR Family")
        family_interactions_df = family_interactions_df.set_index("miR Family")
        mir_interactions_df = family_interactions_df.join(family_to_miR_df, how='outer',
                                                          on="miR Family").reset_index()

        # Standardize MiRBase ID to miRNA names obtained from RNA-seq hg19
        if self.strip_mirna_name:
            mir_interactions_df['MiRBase ID'] = mir_interactions_df['MiRBase ID'].str.lower()
            mir_interactions_df['MiRBase ID'] = mir_interactions_df['MiRBase ID'] \
                .str.replace("-3p.*|-5p.*", "", regex=True)

        return mir_interactions_df
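
# Usage sketch (illustrative): a human predicted miRNA-target network with normalized miRNA names.
# Whether `species_id` should be an int or a str depends on the dtype of the 'Species ID' column
# in the downloaded file, so 9606 here is an assumption.
def _example_targetscan_usage():
    targetscan = TargetScan(species_id=9606, strip_mirna_name=True)
    return targetscan.get_interactions(data=True)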
class LncReg(Interactions):
    """Loads the LncReg database.

    Default path: local_directory .
    Default file_resources: {
        "data.xlsx": "data.xlsx",
    }
    """

    def __init__(self, path, file_resources, source_col_name='A_name_in_paper',
                 target_col_name='B_name_in_paper', source_index="transcript_name",
                 target_index="gene_name", edge_attr=["relationship", "mechanism", "pmid"], filters=None,
                 directed=True, relabel_nodes=None, verbose=False):
        if file_resources is None:
            file_resources = {}
            file_resources["data.xlsx"] = os.path.join(path, "data.xlsx")

        super().__init__(path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters,
                         directed=directed, relabel_nodes=relabel_nodes, verbose=verbose)
    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        df = pd.read_excel(self.file_resources["data.xlsx"])
        print(self.name(), df.columns.tolist())

        df = df[df["species"] == "Homo sapiens"]
        df.loc[df["B_category"] == "miRNA", "B_name_in_paper"] = \
            df[df["B_category"] == "miRNA"]["B_name_in_paper"].str.replace("-3p.*|-5p.*", "", regex=True)
        df.loc[df["B_category"] == "miRNA", "B_name_in_paper"] = \
            df[df["B_category"] == "miRNA"]["B_name_in_paper"].str.replace("MIR", "hsa-mir-")
        df.loc[df["B_category"] == "miRNA", "B_name_in_paper"] = \
            df[df["B_category"] == "miRNA"]["B_name_in_paper"].str.replace("let-", "hsa-let-")

        LncReg_lncRNA_RNA_network = nx.from_pandas_edgelist(df, source=source_col_name,
                                                            target=target_col_name, edge_attr=edge_attr,
                                                            create_using=nx.DiGraph())
        return LncReg_lncRNA_RNA_network
class lncRInter(Interactions):
    """Loads the lncRInter database.

    Default path: local_directory .
    Default file_resources: {
        "human_interactions.txt": "human_interactions.txt",
    }
    """

    def __init__(self, path, file_resources=None, source_col_name="lncrna",
                 target_col_name='Interacting partner', edge_attr=None, filters=None, directed=True,
                 relabel_nodes=None, **kwargs):
        if edge_attr is None:
            edge_attr = ["Interaction Class", "Interaction Mode", "Tissue", "Phenotype"]
        if file_resources is None:
            file_resources = {}
            file_resources["human_interactions.txt"] = os.path.join(path, "human_interactions.txt")

        super().__init__(path, file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters,
                         directed=directed, relabel_nodes=relabel_nodes, **kwargs)
    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        lncRInter_df = pd.read_table(file_resources["human_interactions.txt"])
        print(self.name(), lncRInter_df.columns.tolist())

        lncRInter_df = filter_rows(lncRInter_df, filters)

        # Data cleaning
        mir_mask = lncRInter_df["Interacting partner"].str.contains("MIR")
        lncRInter_df.loc[mir_mask, "Interacting partner"] = \
            lncRInter_df.loc[mir_mask, "Interacting partner"].str.lower()
        lncRInter_df["Interacting partner"] = \
            lncRInter_df["Interacting partner"].str.replace("mirlet", "hsa-let-")
        lncRInter_df["Interacting partner"] = \
            lncRInter_df["Interacting partner"].str.replace("mir", "hsa-mir-")
        # Insert a dash before the final letter variant, e.g. "hsa-mir-125a1" -> "hsa-mir-125a-1".
        # Note: the original pattern "[mir|let]\-[\d]+[a-z]+[\d]+" misused a character class;
        # a non-capturing group is intended here.
        variant_mask = lncRInter_df["Interacting partner"].str.contains(r"(?:mir|let)-\d+[a-z]+\d+")
        lncRInter_df.loc[variant_mask, "Interacting partner"] = \
            lncRInter_df.loc[variant_mask, "Interacting partner"].apply(lambda x: x[:-1] + "-" + x[-1])

        lncRInter_network = nx.from_pandas_edgelist(lncRInter_df, source=source_col_name,
                                                    target=target_col_name, edge_attr=edge_attr,
                                                    create_using=nx.DiGraph() if directed else nx.Graph())
        return lncRInter_network
class LncRNA2Target(Interactions):
    """Loads the lncRNA2Target database from http://123.59.132.21/lncrna2target .

    Default path: "http://123.59.132.21/lncrna2target/data/" .
    Default file_resources: {
        "lncRNA_target_from_high_throughput_experiments.txt.rar": "lncrna_target.rar",
        "lncRNA_target_from_low_throughput_experiments.xlsx": "lncRNA_target_from_low_throughput_experiments.xlsx",
    }
    """

    def __init__(self, path="http://123.59.132.21/lncrna2target/data/", file_resources=None, edge_attr=None,
                 filters=None, directed=True, relabel_nodes=None, version="high_throughput", **kwargs):
        """
        Args:
            filters (dict): default None, example {"species_id": 9606, "Species": "Homo sapiens"}.
            version (str): one of ["high_throughput", "low_throughput"].
                The high_throughput version of the lncRNA2Target database is v2.0 and low_throughput is
                v1.0, according to the database's website.
            species_id (str, int): one of [9606, "Homo sapiens"].
                The species column in high_throughput is formatted as an int (e.g. 9606) and in
                low_throughput as a str (e.g. "Homo sapiens").
        """
        self.version = version
        if file_resources is None:
            file_resources = {}
            file_resources["lncRNA_target_from_high_throughput_experiments.txt.rar"] = \
                os.path.join(path, "lncrna_target.rar")
            file_resources["lncRNA_target_from_low_throughput_experiments.xlsx"] = \
                os.path.join(path, "lncRNA_target_from_low_throughput_experiments.xlsx")

        if self.version == "high_throughput":
            super().__init__(path, file_resources, source_col_name="lncrna_symbol",
                             target_col_name="gene_symbol", edge_attr=edge_attr, filters=filters,
                             directed=directed, relabel_nodes=relabel_nodes, **kwargs)
        if self.version == "low_throughput":
            super().__init__(path, file_resources, source_col_name="GENCODE_gene_name",
                             target_col_name="Target_official_symbol", edge_attr=edge_attr, filters=filters,
                             directed=directed, relabel_nodes=relabel_nodes, **kwargs)
    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        network = None
        if self.version == "high_throughput":
            network = self.load_network_high_throughput(file_resources, source_col_name, target_col_name,
                                                        edge_attr, directed)
        elif self.version == "low_throughput":
            network = self.load_network_low_throughput(file_resources, source_col_name, target_col_name,
                                                       edge_attr, directed)
        else:
            logger.warn("LncRNA2Target version argument must be one of 'high_throughput' or 'low_throughput'")

        return network
    def load_network_high_throughput(self, file_resources, source_col_name="lncrna_symbol",
                                     target_col_name="gene_symbol", edge_attr=None, directed=True,
                                     filters=None):
        edges = pd.read_table(file_resources["lncRNA_target_from_high_throughput_experiments.txt"], sep="\t")
        edges = filter_rows(edges, filters)

        edges["lncrna_symbol"] = edges["lncrna_symbol"].str.upper()
        edges["lncrna_symbol"] = edges["lncrna_symbol"].str.replace("LINC", "")
        edges["gene_symbol"] = edges["gene_symbol"].str.upper()

        self.data = self.edges = edges
        lncrna2target_high_throughput_network = nx.from_pandas_edgelist(
            edges, source=source_col_name, target=target_col_name, edge_attr=edge_attr,
            create_using=nx.DiGraph() if directed else nx.Graph())
        return lncrna2target_high_throughput_network
    def load_network_low_throughput(self, file_resources, source_col_name="GENCODE_gene_name",
                                    target_col_name="Target_official_symbol", edge_attr=None, directed=True,
                                    filters=None):
        edges = pd.read_excel(file_resources["lncRNA_target_from_low_throughput_experiments.xlsx"])
        edges = filter_rows(edges, filters)

        edges["Target_official_symbol"] = edges["Target_official_symbol"] \
            .str.replace("(?i)(mir)", "hsa-mir-", regex=True)
        edges["Target_official_symbol"] = edges["Target_official_symbol"].str.replace("--", "-")
        # Assign the result (the original call discarded the .apply() output)
        edges["Target_official_symbol"] = edges["Target_official_symbol"] \
            .apply(lambda x: x.lower() if "mir" in x.lower() else x.upper())
        edges["GENCODE_gene_name"] = edges["GENCODE_gene_name"].str.upper()

        self.data = self.edges = edges
        lncrna2target_low_throughput_network = nx.from_pandas_edgelist(
            edges, source=source_col_name, target=target_col_name, edge_attr=edge_attr,
            create_using=nx.DiGraph() if directed else nx.Graph())
        return lncrna2target_low_throughput_network
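
# Usage sketch (illustrative): the `version` argument selects which file, columns, and species filter
# format are used.
def _example_lncrna2target_usage():
    high = LncRNA2Target(version="high_throughput", filters={"species_id": 9606})
    low = LncRNA2Target(version="low_throughput", filters={"Species": "Homo sapiens"})
    return high.get_interactions(), low.get_interactions()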
class lncRNome(Interactions, Database):
    """Loads the lncRNome database.

    Default path: local_directory .
    Default file_resources: {
        "miRNA_binding_sites.txt": "miRNA_binding_sites.txt",
        "general_information.txt": "general_information.txt",
    }
    """

    def __init__(self, path, file_resources, source_col_name='Gene Name', target_col_name='Binding miRNAs',
                 edge_attr=["miRNA Interaction Site", "Transcript ID"], directed=True, relabel_nodes=None,
                 **kwargs):
        if file_resources is None:
            file_resources = {}
            file_resources["miRNA_binding_sites.txt"] = os.path.join(path, "miRNA_binding_sites.txt")
            file_resources["general_information.txt"] = os.path.join(path, "general_information.txt")

        super().__init__(path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, directed=directed, relabel_nodes=relabel_nodes,
                         edge_attr=edge_attr, **kwargs)
    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        df = pd.read_table(self.file_resources["miRNA_binding_sites.txt"], header=0)
        print(self.name(), df.columns.tolist())

        df['Binding miRNAs'] = df['Binding miRNAs'].str.lower()
        df['Binding miRNAs'] = df['Binding miRNAs'].str.replace("-3p.*|-5p.*", "", regex=True)

        lncRNome_miRNA_binding_sites_network = nx.from_pandas_edgelist(df, source=source_col_name,
                                                                       target=target_col_name,
                                                                       edge_attr=edge_attr,
                                                                       create_using=nx.DiGraph())
        return lncRNome_miRNA_binding_sites_network
    def load_dataframe(self, file_resources, blocksize=None):
        return pd.read_table(self.file_resources["general_information.txt"], header=0,
                             usecols=["Gene Name", "Transcript Name", "Transcript Type", "Location",
                                      "Strand"])
class NPInter(Interactions):
    """Loads the NPInter database from http://bigdata.ibp.ac.cn/npinter4/ .

    Default path: "http://bigdata.ibp.ac.cn/npinter4/download/" .
    Default file_resources: {
        "interaction_NPInterv4.expr.txt.gz": "file/interaction_NPInterv4.expr.txt.gz",
    }
    """

    def __init__(self, path="http://bigdata.ibp.ac.cn/npinter4/download/", file_resources=None,
                 source_col_name='ncName', target_col_name='tarName',
                 edge_attr=["tarType", "tissueOrCell", "tag", 'class', "level"], filters=None, directed=True,
                 relabel_nodes=None, verbose=False):
        if file_resources is None:
            file_resources = {}
            file_resources["interaction_NPInterv4.expr.txt.gz"] = \
                os.path.join(path, "file/interaction_NPInterv4.expr.txt.gz")

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters,
                         directed=directed, relabel_nodes=relabel_nodes, verbose=verbose)

    def load_dataframe(self, file_resources: Dict[str, str], blocksize: int = None) -> pd.DataFrame:
        df = pd.read_table(file_resources["interaction_NPInterv4.expr.txt"], header=0, na_values=["-"])
        print(self.name(), df.columns.tolist())

        df["ncName"] = df["ncName"].str.upper()
        # Remove the "LNCRNA-" prefix (the original used str.strip("LNCRNA-"), which strips those
        # characters from both ends rather than removing the prefix)
        df["ncName"] = df["ncName"].str.replace("^LNCRNA-", "", regex=True)
        df["ncName"] = df["ncName"].str.replace("MALAT-1", "MALAT1")
        df["ncName"] = df["ncName"].str.replace("^MIR-", "hsa-mir-", regex=True)
        df["ncName"] = df["ncName"].str.replace("^MICRORNA-", "hsa-mir-", regex=True)

        df["tarName"] = df["tarName"].str.upper()

        return df
    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        df = self.data
        df = filter_rows(df, filters)

        npinter_network = nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                                  edge_attr=edge_attr,
                                                  create_using=nx.DiGraph() if directed else nx.Graph())
        return npinter_network
class StarBase(Interactions):
    """Loads the StarBase database.

    Default path: local_directory .
    Default file_resources: {
        "starbase_3.0_lncrna_rna_interactions.csv": "starbase_3.0_lncrna_rna_interactions.csv",
    }
    """

    def __init__(self, path, file_resources, source_col_name="geneName", target_col_name="pairGeneName",
                 min_interactionNum=1, min_expNum=1, edge_attr=None, directed=True, relabel_nodes=None,
                 **kwargs):
        if file_resources is None:
            file_resources = {}
            file_resources["starbase_3.0_lncrna_rna_interactions.csv"] = \
                os.path.join(path, "starbase_3.0_lncrna_rna_interactions.csv")

        self.min_interactionNum = min_interactionNum
        self.min_expNum = min_expNum

        super().__init__(path, file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, directed=directed, relabel_nodes=relabel_nodes,
                         edge_attr=edge_attr, **kwargs)
    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        df = pd.read_csv(self.file_resources["starbase_3.0_lncrna_rna_interactions.csv"], header=0)

        df.loc[df["pairGeneType"] == "miRNA", "pairGeneName"] = \
            df[df["pairGeneType"] == "miRNA"]["pairGeneName"].str.lower()
        df.loc[df["pairGeneType"] == "miRNA", "pairGeneName"] = \
            df[df["pairGeneType"] == "miRNA"]["pairGeneName"].str.replace("-3p.*|-5p.*", "", regex=True)
        df = df[df["interactionNum"] >= self.min_interactionNum]
        df = df[df["expNum"] >= self.min_expNum]

        self.starBase_RNA_RNA_network = nx.from_pandas_edgelist(df, source=source_col_name,
                                                                target=target_col_name,
                                                                edge_attr=["interactionNum"],
                                                                create_using=nx.DiGraph())
        return self.starBase_RNA_RNA_network
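
# Usage sketch (illustrative): keep only well-supported StarBase edges via the minimum interaction and
# experiment counts. The local path is hypothetical.
def _example_starbase_usage():
    starbase = StarBase(path="data/starbase/", file_resources=None, min_interactionNum=2, min_expNum=2)
    return starbase.get_interactions(data=True)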