Source code for openomics.database.interaction

import logging
from abc import abstractmethod
from typing import List, Dict

import networkx as nx
from Bio import SeqIO

from openomics.database.annotation import *
from openomics.database.base import Database
from openomics.database.sequence import SequenceDatabase


class Interactions(Database):
    """Abstract base for interaction databases backed by a NetworkX graph.

    Subclasses implement `load_network()`, which reads the entries of
    `file_resources` and returns a NetworkX graph; the constructor stores that
    graph in `self.network`.
    """

    def __init__(self, path, file_resources: Dict, source_col_name: str = None, target_col_name: str = None,
                 source_index: str = None, target_index: str = None, edge_attr: List[str] = None,
                 filters: dict = None, directed: bool = True, relabel_nodes: dict = None,
                 verbose: bool = False):
        """
        Validate the file resources, build the network with `load_network()`,
        name it after the class and optionally relabel its nodes.

        Args:
            path (str): The folder path containing the data files.
            file_resources (dict): A dictionary where keys are required
                filenames and values are file paths. Subclasses build a default
                dict when None is given.
            source_col_name (str): Column name of DataFrame to be used as the
                source node names.
            target_col_name (str): Column name of DataFrame to be used as the
                target node names.
            source_index (str): One of {"gene_name", "gene_id",
                "transcript_name", "transcript_id", "protein_name",
                "protein_id"}.
            target_index (str): One of {"gene_name", "gene_id",
                "transcript_name", "transcript_id", "protein_name",
                "protein_id"}.
            edge_attr (list): A list of column names to be included as
                attributes for each edge (source-target pairs).
            filters (dict): Optional. A dict with keys matching the data table
                columns and values used for filtering on that column.
            directed (bool): default True. Whether to create a directed or an
                undirected network.
            relabel_nodes (dict): default None. A dictionary to rename nodes in
                the network, where node <key> is renamed to <value>.
            verbose (bool): default False. Print a summary of the network once
                it is loaded.

        Raises:
            Exception: if `load_network()` returned None.
            AssertionError: if `load_network()` returned something that is not
                a NetworkX graph.
        """
        # This class should NOT call super's __init__()
        self.validate_file_resources(path, file_resources, verbose=verbose)

        self.data_path = path
        self.file_resources = file_resources
        self.source_index = source_index
        self.target_index = target_index
        self.network = self.load_network(file_resources=file_resources, source_col_name=source_col_name,
                                         target_col_name=target_col_name, edge_attr=edge_attr,
                                         directed=directed, filters=filters)

        # The None check must come before any use of the network, otherwise it
        # can never be reached.
        if self.network is None:
            raise Exception(
                "Make sure load_network() returns a Networkx Graph and is called with super().__init__() in the constructor.")
        assert isinstance(self.network, nx.Graph)
        self.network.name = self.name()

        if relabel_nodes is not None:
            self.network = nx.relabel_nodes(self.network, mapping=relabel_nodes)

        self.verbose = verbose
        if verbose:
            self.info()

    def info(self):
        """Print a short summary of `self.network`."""
        # nx.info() was removed in NetworkX 3.0; fall back to str(graph) there.
        describe = getattr(nx, "info", None)
        summary = describe(self.network) if describe is not None else str(self.network)
        print("{}".format(summary))

    @classmethod
    def name(cls):
        """Return the class name, used as the database and network name."""
        return cls.__name__

    @abstractmethod
    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters):
        """
        Handles data processing from `file_resources` to a Pandas DataFrame
        which contains edgelist data, then constructs and returns a NetworkX
        Graph.

        Args:
            file_resources: a dict of file name and file path/object
            source_col_name (str): column name of the dataframe for source in the edge
            target_col_name (str): column name of the dataframe for target in the edge
            edge_attr (list): list of str for column data to include in each edge
            directed (bool): True to return a DiGraph(), else Graph()
            filters: A dict of {column name: column values} to filter the dataframe

        Returns:
            network: a NetworkX Graph or DiGraph
        """
        raise NotImplementedError

    def get_interactions(self, nodelist=None, data=False, inclusive=True):
        """
        Fetch edges from the loaded network.

        Args:
            nodelist (list): A list of nodes to fetch edges from. If None, all
                edges of the network are returned.
            data (bool): default False. Whether to include edge attributes.
            inclusive (bool): default True. If True, only return edges whose
                both endpoints are in `nodelist` (edges of the induced
                subgraph); otherwise return every edge incident to a node in
                `nodelist`.

        Returns:
            edges: a NetworkX edge view

        Raises:
            Exception: if `self.network` has not been assigned yet.
        """
        if not hasattr(self, "network"):
            raise Exception(
                "{} does not have network interaction data yet. Must run load_network() and assign self.network field first.".format(
                    self.name()))

        if nodelist is None:
            return self.network.edges(data=data)

        if inclusive:
            return self.network.subgraph(nodelist).edges(data=data)
        return self.network.edges(nbunch=nodelist, data=data)

    def filter_values(self, df: pd.DataFrame, filters: dict, case=False):
        """
        Keep only the rows of `df` that satisfy every entry of `filters`.

        Args:
            df (pd.DataFrame): the table to filter.
            filters (dict): {column name: value}. A list value keeps rows whose
                column is in the list, a str value keeps rows whose column
                contains the string, and any other value keeps rows equal to
                it. Keys that are not columns of `df` are skipped. If None,
                `df` is returned unchanged.
            case (bool): default False. Whether string matching is
                case-sensitive.

        Returns:
            pd.DataFrame: the filtered table.

        Raises:
            AssertionError: if no rows are left after filtering.
        """
        if filters is None:
            return df

        for key, values in filters.items():
            if key not in df.columns:
                logging.info("Filter key `%s` must be in one of %s", key, df.columns.tolist())
                continue

            n_rows = df.shape[0]

            if isinstance(values, list):
                # Upper-casing only makes sense when every filter value is a string.
                if case is False and all(isinstance(val, str) for val in values):
                    df = df[df[key].str.upper().isin([val.upper() for val in values])]
                else:
                    df = df[df[key].isin(values)]
            elif isinstance(values, str):
                df = df[df[key].str.contains(values, case=case)]
            else:
                df = df[df[key] == values]

            logging.info("INFO: Removed %s rows with `%s` != %s", n_rows - df.shape[0], key, values)

        assert df.shape[0] > 0, "ERROR: Dataframe is empty because of filter: {}".format(filters)
        return df
class GeneMania(Interactions):
    """Interaction network built from the GeneMANIA combined network file."""

    def __init__(self, path, file_resources=None, source_col_name="Gene_A", target_col_name="Gene_B",
                 source_index="gene_name", target_index="gene_name",
                 edge_attr=None, filters=None, directed=True, relabel_nodes=None):
        """
        Args:
            path (str): folder (or base URL) that holds the data files.
            file_resources (dict): optional {file name: path}; when None the
                two required files are looked up under `path`.
            source_col_name (str): column holding the source node names.
            target_col_name (str): column holding the target node names.
            source_index (str): identifier type of the source nodes.
            target_index (str): identifier type of the target nodes.
            edge_attr (list): edge attribute columns; defaults to ["Weight"].
            filters (dict): optional {column: value} row filters.
            directed (bool): build a DiGraph when True, else a Graph.
            relabel_nodes (dict): optional node renaming map.
        """
        if edge_attr is None:
            edge_attr = ["Weight"]
        if file_resources is None:
            file_resources = {}
            file_resources["COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt"] = os.path.join(
                path, "COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt")
            file_resources["identifier_mappings.txt"] = os.path.join(path, "identifier_mappings.txt")

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, source_index=source_index,
                         target_index=target_index, edge_attr=edge_attr, filters=filters,
                         directed=directed, relabel_nodes=relabel_nodes)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters):
        """
        Read the combined network table, map identifiers to gene names using
        the identifier mapping file, and build the graph.

        Returns:
            network: a DiGraph when `directed` is True, otherwise a Graph.
        """
        interactions = pd.read_table(file_resources["COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt"],
                                     low_memory=True)
        identifier = pd.read_table(file_resources["identifier_mappings.txt"])

        # Rename ENSG ID's to gene names
        identifier = identifier[identifier["Source"] == "Gene Name"]
        id_mapping = pd.Series(identifier["Name"].values, index=identifier["Preferred_Name"]).to_dict()
        interactions.replace(id_mapping, inplace=True)

        # filter_values() returns the table unchanged when filters is None.
        interactions = self.filter_values(interactions, filters)

        genemania_RNA_RNA_network = nx.from_pandas_edgelist(interactions, source=source_col_name,
                                                            target=target_col_name, edge_attr=edge_attr,
                                                            create_using=nx.DiGraph() if directed else nx.Graph())
        return genemania_RNA_RNA_network
class BioGRID(Interactions):
    """Interaction network built from the BioGRID "tab2" release archive."""

    def __init__(self, path="https://downloads.thebiogrid.org/Download/BioGRID/Latest-Release/",
                 file_resources=None, source_col_name="Official Symbol Interactor A",
                 target_col_name="Official Symbol Interactor B",
                 source_index="gene_name", target_index="gene_name",
                 edge_attr=['Score', 'Throughput', 'Experimental System', 'Experimental System Type'],
                 filters={"Organism Interactor A": 9606},
                 directed=False, relabel_nodes=None):
        """
        Args:
            path (str): folder or base URL that holds the release archive.
            file_resources (dict): optional {file name: path}; when None the
                archive is looked up under `path`.
            source_col_name (str): column holding the source node names.
            target_col_name (str): column holding the target node names.
            source_index (str): identifier type of the source nodes.
            target_index (str): identifier type of the target nodes.
            edge_attr (list): columns attached to each edge.
            filters (dict): {column: value} row filters.
            directed (bool): build a DiGraph when True, else a Graph.
            relabel_nodes (dict): optional node renaming map.
        """
        archive = "BIOGRID-ALL-LATEST.tab2.zip"
        if file_resources is None:
            file_resources = {archive: os.path.join(path, archive)}

        super().__init__(
            path=path,
            file_resources=file_resources,
            source_col_name=source_col_name,
            target_col_name=target_col_name,
            source_index=source_index,
            target_index=target_index,
            edge_attr=edge_attr,
            directed=directed,
            relabel_nodes=relabel_nodes,
            filters=filters,
        )

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters):
        """
        Read the BioGRID table (treating "-" as missing), apply `filters` and
        build the graph.

        Returns:
            network: a DiGraph when `directed` is True, otherwise a Graph.
        """
        table = pd.read_table(file_resources["BIOGRID-ALL-LATEST.tab2.zip"], na_values=["-"],
                              low_memory=True)
        column_names = table.columns.tolist()
        logging.info("{}: {}".format(self.name(), column_names))

        table = self.filter_values(table, filters)

        if directed:
            graph_type = nx.DiGraph()
        else:
            graph_type = nx.Graph()
        return nx.from_pandas_edgelist(table, source=source_col_name, target=target_col_name,
                                       edge_attr=edge_attr, create_using=graph_type)
class STRING(Interactions, SequenceDatabase):
    """Protein interaction network and protein sequences from STRING v11.0."""

    COLUMNS_RENAME_DICT = {
        "protein_external_id": "protein_id",
        "preferred_name": "protein_name",
    }

    def __init__(self, path="https://stringdb-static.org/download/", file_resources=None,
                 species_id="9606",
                 source_col_name="item_id_a", target_col_name="item_id_b",
                 source_index="protein_name", target_index="protein_name",
                 edge_attr=["score"], directed=False,
                 relabel_nodes=None, verbose=False):
        """
        Args:
            path (str): folder or base URL that holds the STRING downloads.
            file_resources (dict): optional {file name: path}; when None the
                four v11.0 files for `species_id` are looked up under `path`.
            species_id (str): Required. Must provide species id number to
                download the correct STRING dataset.
            source_col_name (str): column holding the source node ids.
            target_col_name (str): column holding the target node ids.
            source_index (str): identifier type of the source nodes.
            target_index (str): identifier type of the target nodes.
            edge_attr (list): columns attached to each edge.
            directed (bool): build a DiGraph when True, else a Graph.
            relabel_nodes (dict): optional node renaming map.
            verbose (bool): print a network summary after loading.
        """
        if file_resources is None:
            file_resources = {}
            file_resources["protein.actions.txt"] = os.path.join(
                path, "protein.actions.v11.0/{}.protein.actions.v11.0.txt.gz".format(species_id))
            file_resources["protein.links.txt"] = os.path.join(
                path, "protein.links.v11.0/{}.protein.links.v11.0.txt.gz".format(species_id))
            file_resources["protein.info.txt"] = os.path.join(
                path, "protein.info.v11.0/{}.protein.info.v11.0.txt.gz".format(species_id))
            file_resources["protein.sequences.fa"] = os.path.join(
                path, "protein.sequences.v11.0/{}.protein.sequences.v11.0.fa.gz".format(species_id))

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, source_index=source_index,
                         target_index=target_index, edge_attr=edge_attr, directed=directed,
                         relabel_nodes=relabel_nodes, verbose=verbose)

        # load_network() already read this resource once, so rewind before
        # reading it again.
        # NOTE(review): presumably validate_file_resources() replaced the path
        # with a file handle - confirm.
        self.file_resources["protein.info.txt"].seek(0)
        self.data = pd.read_table(file_resources["protein.info.txt"])
        self.data = self.data.reset_index()
        self.data = self.data.rename(columns=self.COLUMNS_RENAME_DICT)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters):
        """
        Build the graph from the protein actions table and relabel its nodes
        from protein external ids to preferred names. Also stores the
        id -> name mapping in `self.protein_id2name`. `filters` is not used.

        Returns:
            network: a DiGraph when `directed` is True, otherwise a Graph.
        """
        protein_interactions = pd.read_table(file_resources["protein.actions.txt"], sep="\t",
                                             low_memory=True)
        logging.info("{}: {}".format(self.name(), protein_interactions.columns.tolist()))

        protein_info = pd.read_table(file_resources["protein.info.txt"])
        self.protein_id2name = protein_info.set_index("protein_external_id")["preferred_name"].to_dict()

        network = nx.from_pandas_edgelist(protein_interactions, source=source_col_name,
                                          target=target_col_name, edge_attr=edge_attr,
                                          create_using=nx.DiGraph() if directed else nx.Graph())
        network = nx.relabel_nodes(network, self.protein_id2name)
        return network

    def get_sequences(self, index="protein_name", omic=None, agg_sequences=None):
        """
        Return a dict of protein sequences keyed by `index`.

        The FASTA resource is parsed once; dicts for both supported indexes are
        cached so later calls return the dict matching the requested `index`.
        When several records share a key, the last one wins.

        Args:
            index (str): "protein_name" or "protein_id".
            omic: unused.
            agg_sequences: unused.

        Returns:
            dict: {key: sequence string}; also stored in `self.seq_dict`.

        Raises:
            ValueError: if `index` is not a supported value.
        """
        if index not in ("protein_name", "protein_id"):
            raise ValueError(
                "index must be one of 'protein_name' or 'protein_id', got {!r}".format(index))

        cache = getattr(self, "_seq_dicts", None)
        if cache is None:
            cache = {"protein_name": {}, "protein_id": {}}
            collisions = {"protein_name": 0, "protein_id": 0}

            for record in SeqIO.parse(self.file_resources["protein.sequences.fa"], "fasta"):
                protein_id = str(record.name)
                protein_name = self.protein_id2name[protein_id]
                sequence_str = str(record.seq)

                for kind, key in (("protein_name", protein_name), ("protein_id", protein_id)):
                    if key in cache[kind]:
                        collisions[kind] += 1
                    cache[kind][key] = sequence_str

            self._seq_dicts = cache
            for kind, count in collisions.items():
                logging.info("Seq {} collisions: {}".format(kind, count))

        self.seq_dict = cache[index]
        return self.seq_dict
class LncBase(Interactions, Database):
    """miRNA-gene interaction network built from the LncBase v2 download."""

    def __init__(self, path, file_resources=None, strip_mirna_name=False,
                 source_col_name="mirna", target_col_name="geneId",
                 source_index="transcript_name", target_index="gene_id",
                 edge_attr=None,
                 filters={"species": "Homo sapiens"},
                 directed=True,
                 relabel_nodes=None, ):
        """
        Args:
            path (str): folder that holds the data file.
            file_resources (dict): optional {file name: path}; when None the
                file is looked up under `path`.
            strip_mirna_name (bool): lower-case miRNA names and drop their
                "-3p"/"-5p" suffix (and anything after it).
            source_col_name (str): column holding the source node names.
            target_col_name (str): column holding the target node names.
            source_index (str): identifier type of the source nodes.
            target_index (str): identifier type of the target nodes.
            edge_attr (list): edge attribute columns; defaults to
                ["tissue", "positive_negative"].
            filters (dict): {column: value} row filters.
            directed (bool): build a DiGraph when True, else a Graph.
            relabel_nodes (dict): optional node renaming map.
        """
        # Must be set before super().__init__() because load_network() reads it.
        self.strip_mirna_name = strip_mirna_name

        if edge_attr is None:
            edge_attr = ["tissue", "positive_negative"]
        if file_resources is None:
            file_resources = {}
            file_resources["LncBasev2_download.csv"] = os.path.join(path, "LncBasev2_download.csv")

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, source_index=source_index,
                         target_index=target_index, edge_attr=edge_attr, directed=directed,
                         relabel_nodes=relabel_nodes, filters=filters)

    def get_rename_dict(self, from_index="geneId", to_index="geneName"):
        """
        Build a mapping between two columns of the LncBase table.

        Args:
            from_index (str): column whose values become the dict keys.
            to_index (str): column whose values become the dict values.

        Returns:
            dict: {from_index value: to_index value}
        """
        lncbase_df = pd.read_table(self.file_resources["LncBasev2_download.csv"], low_memory=True)
        return pd.Series(lncbase_df[to_index].values, index=lncbase_df[from_index]).to_dict()

    def load_network(self, file_resources, source_col_name="mirna", target_col_name="gene_id",
                     edge_attr=None, directed=True, filters=None):
        """
        Read the LncBase table, normalize species names, apply `filters`,
        optionally strip miRNA names and build the graph.

        NOTE(review): the default `target_col_name` here ("gene_id") differs
        from the constructor's default ("geneId") - confirm which column the
        file provides.

        Returns:
            network: a DiGraph when `directed` is True, otherwise a Graph.
        """
        if edge_attr is None:
            edge_attr = ["tissue", "positive_negative"]

        df = pd.read_table(file_resources["LncBasev2_download.csv"], low_memory=True)
        logging.info("{}: {}".format(self.name(), df.columns.tolist()))

        df.replace({"species": {"Homo Sapiens": "Homo sapiens", "Mus Musculus": "Mus musculus"}},
                   inplace=True)
        df = self.filter_values(df, filters)

        if self.strip_mirna_name:
            # Copy first: filter_values() may have returned a slice of the table.
            df = df.copy()
            df['mirna'] = df['mirna'].str.lower()
            # The pattern is a regex; pandas >= 2.0 treats it literally unless
            # regex=True is given.
            df['mirna'] = df['mirna'].str.replace("-3p.*|-5p.*", "", regex=True)

        lncBase_lncRNA_miRNA_network = nx.from_pandas_edgelist(df, source=source_col_name,
                                                               target=target_col_name,
                                                               edge_attr=edge_attr,
                                                               create_using=nx.DiGraph() if directed else nx.Graph())
        return lncBase_lncRNA_miRNA_network
class LncReg(Interactions):
    """lncRNA regulatory interaction network built from the LncReg spreadsheet."""

    def __init__(self, path, file_resources,
                 source_col_name='A_name_in_paper', target_col_name='B_name_in_paper',
                 source_index="transcript_name", target_index="gene_name",
                 edge_attr=["relationship", "mechanism", "pmid"],
                 filters=None,
                 directed=True, relabel_nodes=None, verbose=False):
        """
        Args:
            path (str): folder that holds the data file.
            file_resources (dict): {file name: path}; when None, "data.xlsx" is
                looked up under `path`.
            source_col_name (str): column holding the source node names.
            target_col_name (str): column holding the target node names.
            source_index (str): identifier type of the source nodes.
            target_index (str): identifier type of the target nodes.
            edge_attr (list): columns attached to each edge.
            filters (dict): optional {column: value} row filters.
            directed (bool): build a DiGraph when True, else a Graph.
            relabel_nodes (dict): optional node renaming map.
            verbose (bool): print a network summary after loading.
        """
        if file_resources is None:
            file_resources = {}
            file_resources["data.xlsx"] = os.path.join(path, "data.xlsx")

        super().__init__(path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, source_index=source_index,
                         target_index=target_index, edge_attr=edge_attr, filters=filters,
                         directed=directed, relabel_nodes=relabel_nodes, verbose=verbose)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters):
        """
        Read the spreadsheet, keep the "Homo sapiens" rows, rewrite miRNA
        target names and build the graph.

        Returns:
            network: a DiGraph when `directed` is True, otherwise a Graph.
        """
        df = pd.read_excel(self.file_resources["data.xlsx"])
        logging.info("{}: {}".format(self.name(), df.columns.tolist()))

        # Copy so the assignments below write to this table rather than to a
        # slice of the one read from disk.
        df = df[df["species"] == "Homo sapiens"].copy()

        is_mirna = df["B_category"] == "miRNA"
        mirna_names = df.loc[is_mirna, "B_name_in_paper"]
        # Only the first pattern is a regex; pandas >= 2.0 needs regex=True for it.
        mirna_names = mirna_names.str.replace("-3p.*|-5p.*", "", regex=True)
        mirna_names = mirna_names.str.replace("MIR", "hsa-mir-", regex=False)
        mirna_names = mirna_names.str.replace("let-", "hsa-let-", regex=False)
        df.loc[is_mirna, "B_name_in_paper"] = mirna_names

        # filter_values() returns the table unchanged when filters is None.
        df = self.filter_values(df, filters)

        LncReg_lncRNA_RNA_network = nx.from_pandas_edgelist(df, source=source_col_name,
                                                            target=target_col_name, edge_attr=edge_attr,
                                                            create_using=nx.DiGraph() if directed else nx.Graph())
        return LncReg_lncRNA_RNA_network
class lncRInter(Interactions):
    """lncRNA interaction network built from the lncRInter human table."""

    def __init__(self, path, file_resources=None, source_col_name="lncrna",
                 target_col_name='Interacting partner',
                 source_index="gene_name", target_index="gene_name",
                 edge_attr=None,
                 filters=None,
                 directed=True, relabel_nodes=None):
        """
        Args:
            path (str): folder that holds the data file.
            file_resources (dict): optional {file name: path}; when None the
                file is looked up under `path`.
            source_col_name (str): column holding the source node names.
            target_col_name (str): column holding the target node names.
            source_index (str): identifier type of the source nodes.
            target_index (str): identifier type of the target nodes.
            edge_attr (list): edge attribute columns; defaults to
                ["Interaction Class", "Interaction Mode", "Tissue", "Phenotype"].
            filters (dict): optional {column: value} row filters.
            directed (bool): build a DiGraph when True, else a Graph.
            relabel_nodes (dict): optional node renaming map.
        """
        if edge_attr is None:
            edge_attr = ["Interaction Class", "Interaction Mode", "Tissue", "Phenotype"]
        if file_resources is None:
            file_resources = {}
            file_resources["human_interactions.txt"] = os.path.join(path, "human_interactions.txt")

        super().__init__(path, file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, source_index=source_index,
                         target_index=target_index, edge_attr=edge_attr, directed=directed,
                         relabel_nodes=relabel_nodes, filters=filters)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters):
        """
        Read the interaction table, apply `filters`, rewrite miRNA partner
        names into "hsa-mir-"/"hsa-let-" form and build the graph.

        Returns:
            network: a DiGraph when `directed` is True, otherwise a Graph.
        """
        lncRInter_df = pd.read_table(file_resources["human_interactions.txt"])
        logging.info("{}: {}".format(self.name(), lncRInter_df.columns.tolist()))

        # Copy: filter_values() may return a slice, and the cleaning below
        # assigns into the table.
        lncRInter_df = self.filter_values(lncRInter_df, filters).copy()

        # Data cleaning
        partner = "Interacting partner"
        has_mir = lncRInter_df[partner].str.contains("MIR", na=False)
        lncRInter_df.loc[has_mir, partner] = lncRInter_df.loc[has_mir, partner].str.lower()

        lncRInter_df[partner] = lncRInter_df[partner].str.replace("mirlet", "hsa-let-", regex=False)
        lncRInter_df[partner] = lncRInter_df[partner].str.replace("mir", "hsa-mir-", regex=False)

        # Names such as "...mir-125b1" get a dash before their last character.
        # The alternation is written as a group; a character class
        # "[mir|let]" would match any single one of those letters.
        needs_dash = lncRInter_df[partner].str.contains(r"(?:mir|let)-\d+[a-z]+\d+", na=False)
        lncRInter_df.loc[needs_dash, partner] = lncRInter_df.loc[needs_dash, partner].apply(
            lambda x: x[:-1] + "-" + x[-1])

        lncRInter_network = nx.from_pandas_edgelist(lncRInter_df, source=source_col_name,
                                                    target=target_col_name, edge_attr=edge_attr,
                                                    create_using=nx.DiGraph() if directed else nx.Graph())
        return lncRInter_network
class LncRNA2Target(Interactions):
    """lncRNA-target interaction network built from the LncRNA2Target tables."""

    def __init__(self, path="http://123.59.132.21/lncrna2target/data/", file_resources=None,
                 source_index="gene_name", target_index="gene_name", edge_attr=None,
                 filters={"species_id": 9606, "Species": "Homo sapiens"},
                 directed=True, relabel_nodes=None, version="high_throughput", ):
        """
        Args:
            path (str): folder or base URL that holds the data files.
            file_resources (dict): optional {file name: path}; when None both
                files are looked up under `path`.
            source_index (str): identifier type of the source nodes.
            target_index (str): identifier type of the target nodes.
            edge_attr (list): columns attached to each edge.
            filters (dict): {column: value} row filters. The species column in
                high_throughput is formatted in int (e.g. 9606) and in
                low_throughput is in str (e.g. "Homo sapiens"); keys missing
                from the loaded table are skipped by `filter_values()`.
            directed (bool): build a DiGraph when True, else a Graph.
            relabel_nodes (dict): optional node renaming map.
            version (str): one of ["high_throughput", "low_throughput"]. The
                high_throughput version of lncRNA2Target database is v2.0 and
                low_throughput is v1.0, according to the database's website.

        Raises:
            ValueError: if `version` is not one of the two supported values.
        """
        self.version = version

        if version == "high_throughput":
            source_col_name, target_col_name = "lncrna_symbol", "gene_symbol"
        elif version == "low_throughput":
            source_col_name, target_col_name = "GENCODE_gene_name", "Target_official_symbol"
        else:
            # Fail early instead of returning an instance without a network.
            raise ValueError(
                "LncRNA2Target version argument must be one of 'high_throughput' or 'low_throughput'")

        if file_resources is None:
            file_resources = {}
            file_resources["lncRNA_target_from_high_throughput_experiments.txt"] = os.path.join(
                path, "lncrna_target.rar")
            file_resources["lncRNA_target_from_low_throughput_experiments.xlsx"] = os.path.join(
                path, "lncRNA_target_from_low_throughput_experiments.xlsx")

        super().__init__(path, file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, source_index=source_index,
                         target_index=target_index, edge_attr=edge_attr, filters=filters,
                         directed=directed, relabel_nodes=relabel_nodes)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters):
        """
        Dispatch to the loader that matches `self.version`.

        Raises:
            Exception: if `self.version` is not a supported value.
        """
        if self.version == "high_throughput":
            return self.load_network_high_throughput(file_resources, source_col_name, target_col_name,
                                                     edge_attr, directed, filters)
        elif self.version == "low_throughput":
            return self.load_network_low_throughput(file_resources, source_col_name, target_col_name,
                                                    edge_attr, directed, filters)
        else:
            raise Exception("LncRNA2Target version argument must be one of 'high_throughput' or 'low_throughput'")

    def load_network_high_throughput(self, file_resources, source_col_name="lncrna_symbol",
                                     target_col_name="gene_symbol",
                                     edge_attr=None, directed=True, filters=None):
        """
        Build the graph from the high-throughput table: apply `filters`,
        upper-case both symbol columns and remove "LINC" from lncRNA symbols.

        Returns:
            network: a DiGraph when `directed` is True, otherwise a Graph.
        """
        table = pd.read_table(file_resources["lncRNA_target_from_high_throughput_experiments.txt"], sep="\t")
        # Copy: filter_values() may return a slice, and columns are assigned below.
        table = self.filter_values(table, filters).copy()
        logging.info("{}: {}".format(self.name(), table.columns.tolist()))

        table["lncrna_symbol"] = table["lncrna_symbol"].str.upper()
        table["lncrna_symbol"] = table["lncrna_symbol"].str.replace("LINC", "", regex=False)
        table["gene_symbol"] = table["gene_symbol"].str.upper()

        lncrna2target_high_throughput_network = nx.from_pandas_edgelist(
            table, source=source_col_name, target=target_col_name, edge_attr=edge_attr,
            create_using=nx.DiGraph() if directed else nx.Graph())
        return lncrna2target_high_throughput_network

    def load_network_low_throughput(self, file_resources, source_col_name="GENCODE_gene_name",
                                    target_col_name="Target_official_symbol",
                                    edge_attr=None, directed=True, filters=None):
        """
        Build the graph from the low-throughput spreadsheet: apply `filters`,
        rewrite "mir" targets to "hsa-mir-" form, then lower-case miRNA targets
        and upper-case every other symbol.

        Returns:
            network: a DiGraph when `directed` is True, otherwise a Graph.
        """
        table = pd.read_excel(file_resources["lncRNA_target_from_low_throughput_experiments.xlsx"])
        # Copy: filter_values() may return a slice, and columns are assigned below.
        table = self.filter_values(table, filters).copy()
        logging.info("{}: {}".format(self.name(), table.columns.tolist()))

        def _normalize_case(symbol):
            # Leave missing (non-string) values untouched.
            if not isinstance(symbol, str):
                return symbol
            return symbol.lower() if "mir" in symbol.lower() else symbol.upper()

        targets = table["Target_official_symbol"]
        targets = targets.str.replace("(?i)(mir)", "hsa-mir-", regex=True)
        targets = targets.str.replace("--", "-", regex=False)
        # The result must be assigned back; apply() does not modify in place.
        table["Target_official_symbol"] = targets.apply(_normalize_case)
        table["GENCODE_gene_name"] = table["GENCODE_gene_name"].str.upper()

        lncrna2target_low_throughput_network = nx.from_pandas_edgelist(
            table, source=source_col_name, target=target_col_name, edge_attr=edge_attr,
            create_using=nx.DiGraph() if directed else nx.Graph())
        return lncrna2target_low_throughput_network
class lncRNome(Interactions, Database):
    """lncRNA-miRNA binding site network built from the lncRNome tables."""

    def __init__(self, path, file_resources,
                 source_col_name='Gene Name', target_col_name='Binding miRNAs',
                 source_index="gene_name", target_index="gene_name",
                 edge_attr=["miRNA Interaction Site", "Transcript ID"], directed=True, relabel_nodes=None,
                 npartitions=0):
        """
        Args:
            path (str): folder that holds the data files.
            file_resources (dict): {file name: path}; when None both files are
                looked up under `path`.
            source_col_name (str): column holding the source node names.
            target_col_name (str): column holding the target node names.
            source_index (str): identifier type of the source nodes.
            target_index (str): identifier type of the target nodes.
            edge_attr (list): columns attached to each edge.
            directed (bool): build a DiGraph when True, else a Graph.
            relabel_nodes (dict): optional node renaming map.
            npartitions (int): accepted for backward compatibility; not used
                when building the network.
        """
        if file_resources is None:
            file_resources = {}
            file_resources["miRNA_binding_sites.txt"] = os.path.join(path, "miRNA_binding_sites.txt")
            file_resources["general_information.txt"] = os.path.join(path, "general_information.txt")

        # `npartitions` is deliberately not forwarded: Interactions.__init__()
        # has no such parameter and would raise TypeError.
        super().__init__(path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, source_index=source_index,
                         target_index=target_index, edge_attr=edge_attr, directed=directed,
                         relabel_nodes=relabel_nodes)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters):
        """
        Read the miRNA binding site table, lower-case the miRNA names, drop
        their "-3p"/"-5p" suffix (and anything after it) and build the graph.
        `filters` is not used.

        Returns:
            network: a DiGraph when `directed` is True, otherwise a Graph.
        """
        df = pd.read_table(self.file_resources["miRNA_binding_sites.txt"], header=0)
        logging.info("{}: {}".format(self.name(), df.columns.tolist()))

        df['Binding miRNAs'] = df['Binding miRNAs'].str.lower()
        # The pattern is a regex; pandas >= 2.0 treats it literally unless
        # regex=True is given.
        df['Binding miRNAs'] = df['Binding miRNAs'].str.replace("-3p.*|-5p.*", "", regex=True)

        lncRNome_miRNA_binding_sites_network = nx.from_pandas_edgelist(
            df, source=source_col_name, target=target_col_name, edge_attr=edge_attr,
            create_using=nx.DiGraph() if directed else nx.Graph())
        return lncRNome_miRNA_binding_sites_network

    def load_dataframe(self, file_resources, npartitions=None):
        """
        Read selected columns of the general information table.

        Args:
            file_resources: unused; `self.file_resources` is read instead.
            npartitions: unused.

        Returns:
            pd.DataFrame: the "Gene Name", "Transcript Name", "Transcript Type",
            "Location" and "Strand" columns.
        """
        return pd.read_table(self.file_resources["general_information.txt"], header=0,
                             usecols=["Gene Name", "Transcript Name", "Transcript Type", "Location", "Strand"])
class NPInter(Interactions):
    """ncRNA interaction network built from the NPInter v4 interaction table."""

    def __init__(self, path="http://bigdata.ibp.ac.cn/npinter4/download/", file_resources=None,
                 source_col_name='ncName', target_col_name='tarName',
                 source_index="gene_name", target_index="gene_name",
                 edge_attr=["tarType", "tissueOrCell", "tag", "level"],
                 filters=None,
                 directed=True, relabel_nodes=None, verbose=False):
        """
        Args:
            path (str): folder or base URL that holds the data file.
            file_resources (dict): optional {file name: path}; when None the
                file is looked up under `path`.
            source_col_name (str): column holding the source node names.
            target_col_name (str): column holding the target node names.
            source_index (str): identifier type of the source nodes.
            target_index (str): identifier type of the target nodes.
            edge_attr (list): columns attached to each edge.
            filters (dict): optional {column: value} row filters.
            directed (bool): build a DiGraph when True, else a Graph.
            relabel_nodes (dict): optional node renaming map.
            verbose (bool): print a network summary after loading.
        """
        if file_resources is None:
            file_resources = {}
            file_resources["interaction_NPInterv4.expr.txt"] = os.path.join(
                path, "file/interaction_NPInterv4.expr.txt.gz")

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, source_index=source_index,
                         target_index=target_index, edge_attr=edge_attr, filters=filters,
                         directed=directed, relabel_nodes=relabel_nodes, verbose=verbose)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters):
        """
        Read the interaction table (treating "-" as missing), normalize the
        ncRNA and target names, apply `filters` and build the graph.

        Returns:
            network: a DiGraph when `directed` is True, otherwise a Graph.
        """
        df = pd.read_table(file_resources["interaction_NPInterv4.expr.txt"], header=0, na_values=["-"])
        logging.info("{}: {}".format(self.name(), df.columns.tolist()))

        nc_names = df["ncName"].str.upper()
        # Remove the "LNCRNA-" prefix only. str.strip("LNCRNA-") would strip
        # any of those characters from both ends and mangle names that start
        # or end with them (e.g. "NEAT1").
        nc_names = nc_names.str.replace("^LNCRNA-", "", regex=True)
        nc_names = nc_names.str.replace("MALAT-1", "MALAT1", regex=False)
        nc_names = nc_names.str.replace("^MIR-", "hsa-mir-", regex=True)
        nc_names = nc_names.str.replace("^MICRORNA-", "hsa-mir-", regex=True)
        df["ncName"] = nc_names

        df["tarName"] = df["tarName"].str.upper()

        df = self.filter_values(df, filters)

        lncRNome_miRNA_binding_sites_network = nx.from_pandas_edgelist(
            df, source=source_col_name, target=target_col_name, edge_attr=edge_attr,
            create_using=nx.DiGraph() if directed else nx.Graph())
        return lncRNome_miRNA_binding_sites_network
class StarBase(Interactions):
    """lncRNA-RNA interaction network built from the starBase 3.0 table."""

    def __init__(self, path, file_resources, source_col_name="geneName", target_col_name="pairGeneName",
                 source_index="gene_name", target_index="gene_name",
                 min_interactionNum=1, min_expNum=1,
                 edge_attr=None, directed=True, relabel_nodes=None, npartitions=0):
        """
        Args:
            path (str): folder that holds the data file.
            file_resources (dict): {file name: path}; when None the file is
                looked up under `path`.
            source_col_name (str): column holding the source node names.
            target_col_name (str): column holding the target node names.
            source_index (str): identifier type of the source nodes.
            target_index (str): identifier type of the target nodes.
            min_interactionNum (int): keep rows whose "interactionNum" is at
                least this value.
            min_expNum (int): keep rows whose "expNum" is at least this value.
            edge_attr (list): edge attribute columns; defaults to
                ["interactionNum"] when None.
            directed (bool): build a DiGraph when True, else a Graph.
            relabel_nodes (dict): optional node renaming map.
            npartitions (int): accepted for backward compatibility; not used
                when building the network.
        """
        if file_resources is None:
            file_resources = {}
            file_resources["starbase_3.0_lncrna_rna_interactions.csv"] = \
                os.path.join(path, "starbase_3.0_lncrna_rna_interactions.csv")

        # Must be set before super().__init__() because load_network() reads them.
        self.min_interactionNum = min_interactionNum
        self.min_expNum = min_expNum

        # Keyword arguments are required here: Interactions.__init__() takes
        # `filters` between `edge_attr` and `directed`, so a positional call
        # shifts every later argument by one. `npartitions` is not forwarded
        # because the base constructor has no such parameter.
        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, source_index=source_index,
                         target_index=target_index, edge_attr=edge_attr, directed=directed,
                         relabel_nodes=relabel_nodes)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters):
        """
        Read the interaction table, normalize miRNA partner names, drop rows
        below the configured interaction/experiment thresholds and build the
        graph. The graph is also stored in `self.starBase_RNA_RNA_network`.
        `filters` is not used.

        Returns:
            network: a DiGraph when `directed` is True, otherwise a Graph.
        """
        df = pd.read_csv(self.file_resources["starbase_3.0_lncrna_rna_interactions.csv"], header=0)

        is_mirna = df["pairGeneType"] == "miRNA"
        mirna_names = df.loc[is_mirna, "pairGeneName"].str.lower()
        # The pattern is a regex; pandas >= 2.0 treats it literally unless
        # regex=True is given.
        mirna_names = mirna_names.str.replace("-3p.*|-5p.*", "", regex=True)
        df.loc[is_mirna, "pairGeneName"] = mirna_names

        df = df[df["interactionNum"] >= self.min_interactionNum]
        df = df[df["expNum"] >= self.min_expNum]

        if edge_attr is None:
            edge_attr = ["interactionNum"]

        self.starBase_RNA_RNA_network = nx.from_pandas_edgelist(
            df, source=source_col_name, target=target_col_name, edge_attr=edge_attr,
            create_using=nx.DiGraph() if directed else nx.Graph())
        return self.starBase_RNA_RNA_network
class MiRTarBase(Interactions):
    """miRNA-target interaction network built from the miRTarBase MTI spreadsheet."""

    def __init__(self, path="http://mirtarbase.mbc.nctu.edu.tw/cache/download/7.0/", file_resources=None,
                 source_col_name="miRNA", target_col_name="Target Gene",
                 source_index="transcript_name", target_index="gene_name",
                 edge_attr=None,
                 filters={"Species (Target Gene)": "Homo sapiens"},
                 directed=True, relabel_nodes=None, strip_mirna_name=False):
        """
        Args:
            path (str): folder or base URL that holds the data file.
            file_resources (dict): optional {file name: path}; when None the
                file is looked up under `path`.
            source_col_name (str): column holding the source node names.
            target_col_name (str): column holding the target node names.
            source_index (str): identifier type of the source nodes.
            target_index (str): identifier type of the target nodes.
            edge_attr (list): edge attribute columns; defaults to
                ["Support Type"].
            filters (dict): {column: value} row filters.
            directed (bool): build a DiGraph when True, else a Graph.
            relabel_nodes (dict): optional node renaming map.
            strip_mirna_name (bool): lower-case miRNA names and drop their
                "-3p"/"-5p" suffix (and anything after it).
        """
        if edge_attr is None:
            edge_attr = ["Support Type"]

        # Must be set before super().__init__() because load_network() reads it.
        self.strip_mirna_name = strip_mirna_name

        if file_resources is None:
            file_resources = {}
            file_resources["miRTarBase_MTI.xlsx"] = os.path.join(path, "miRTarBase_MTI.xlsx")

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, source_index=source_index,
                         target_index=target_index, edge_attr=edge_attr, filters=filters,
                         directed=directed, relabel_nodes=relabel_nodes)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters):
        """
        Read the MTI spreadsheet, apply `filters`, optionally strip miRNA names
        and build the graph.

        Returns:
            network: a DiGraph when `directed` is True, otherwise a Graph.
        """
        df = pd.read_excel(self.file_resources["miRTarBase_MTI.xlsx"])
        logging.info("{}: {}".format(self.name(), df.columns.tolist()))

        df = self.filter_values(df, filters)

        if self.strip_mirna_name:
            # Copy first: filter_values() may have returned a slice of the table.
            df = df.copy()
            df['miRNA'] = df['miRNA'].str.lower()
            # The pattern is a regex; pandas >= 2.0 treats it literally unless
            # regex=True is given.
            df['miRNA'] = df['miRNA'].str.replace("-3p.*|-5p.*", "", regex=True)

        mir_target_network = nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                                     edge_attr=edge_attr,
                                                     create_using=nx.DiGraph() if directed else nx.Graph())
        return mir_target_network
class TargetScan(Interactions, Database):
    """miRNA-target network built from the TargetScan family and prediction tables."""

    def __init__(self, path, file_resources=None, source_col_name="MiRBase ID", target_col_name="Gene Symbol",
                 source_index="transcript_name", target_index="transcript_name",
                 edge_attr=["tissue", "positive_negative"], directed=True, relabel_nodes=None, species=9606,
                 strip_mirna_name=False):
        """
        Args:
            path (str): folder that holds the data files.
            file_resources (dict): optional {file name: path}; when None both
                files are looked up under `path`.
            source_col_name (str): column holding the source node names.
            target_col_name (str): column holding the target node names.
            source_index (str): identifier type of the source nodes.
            target_index (str): identifier type of the target nodes.
            edge_attr (list): columns attached to each edge. Names that are not
                columns of the joined interaction table are dropped when the
                graph is built.
            directed (bool): build a DiGraph when True, else a Graph.
            relabel_nodes (dict): optional node renaming map.
            species (int): value matched against the "Species ID" column; a
                falsy value disables the species filter.
            strip_mirna_name (bool): lower-case miRNA names and drop their
                "-3p"/"-5p" suffix (and anything after it).
        """
        if edge_attr is None:
            edge_attr = ["tissue", "positive_negative"]

        # Must be set before super().__init__() because load_network() reads them.
        self.strip_mirna_name = strip_mirna_name
        self.species = species

        if file_resources is None:
            file_resources = {}
            file_resources["miR_Family_Info.txt"] = os.path.join(path, "miR_Family_Info.txt")
            file_resources["Predicted_Targets_Info.default_predictions.txt"] = os.path.join(
                path, "Predicted_Targets_Info.default_predictions.txt")

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, source_index=source_index,
                         target_index=target_index, edge_attr=edge_attr, directed=directed,
                         relabel_nodes=relabel_nodes)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters):
        """
        Build the miRNA-target graph from the family table joined with the
        predicted targets table. The family table is stored in `self.df`.
        `filters` is not used.

        Returns:
            network: a DiGraph when `directed` is True, otherwise a Graph.
        """
        self.df = self.process_miR_family_info_table(file_resources, self.species)
        interactions_df = self.process_interactions_table(file_resources, self.df, self.species)
        logging.info("{}: {}".format(self.name(), interactions_df.columns.tolist()))

        # The joined table only has the family, gene symbol and miRBase id
        # columns, so requested attributes that are absent are dropped rather
        # than letting graph construction fail.
        if isinstance(edge_attr, (list, tuple)):
            missing = [col for col in edge_attr if col not in interactions_df.columns]
            if missing:
                logging.warning("%s: ignoring edge_attr columns not in the interactions table: %s",
                                self.name(), missing)
                edge_attr = [col for col in edge_attr if col in interactions_df.columns] or None

        mir_target_network = nx.from_pandas_edgelist(interactions_df, source=source_col_name,
                                                     target=target_col_name, edge_attr=edge_attr,
                                                     create_using=nx.DiGraph() if directed else nx.Graph())
        return mir_target_network

    def process_miR_family_info_table(self, file_resources, species=None):
        """
        Read the miR family table and reduce it to the family/miRNA columns.

        Args:
            file_resources (dict): must contain "miR_Family_Info.txt".
            species: when truthy, keep only rows whose "Species ID" equals it.

        Returns:
            pd.DataFrame: de-duplicated table with "MiRBase ID" cast to str.
        """
        miR_Family_Info_df = pd.read_table(file_resources["miR_Family_Info.txt"], delimiter='\t')

        if species:
            miR_Family_Info_df = miR_Family_Info_df[miR_Family_Info_df['Species ID'] == species]

        # Standardize MiRBase ID to miRNA names obtained from RNA-seq hg19
        if self.strip_mirna_name:
            # Copy so the column assignments do not target a filtered slice.
            miR_Family_Info_df = miR_Family_Info_df.copy()
            miR_Family_Info_df['MiRBase ID'] = miR_Family_Info_df['MiRBase ID'].str.lower()
            miR_Family_Info_df['MiRBase ID'] = miR_Family_Info_df['MiRBase ID'].str.replace(
                "-3p.*|-5p.*", "", regex=True)

        miR_Family_Info_df = miR_Family_Info_df.drop_duplicates()
        miR_Family_Info_df = miR_Family_Info_df.filter(
            items=['miR family', 'MiRBase ID', 'Seed+m8', 'Mature sequence', 'Family Conservation?',
                   'MiRBase Accession'],
            axis="columns").copy()
        miR_Family_Info_df['MiRBase ID'] = miR_Family_Info_df['MiRBase ID'].astype(str)
        return miR_Family_Info_df

    def process_interactions_table(self, file_resources, family_to_miR_df, species):
        """
        Join the predicted targets table (miR family -> gene symbol) with the
        family table (miR family -> miRBase id).

        Args:
            file_resources (dict): must contain
                "Predicted_Targets_Info.default_predictions.txt".
            family_to_miR_df (pd.DataFrame): table with "miR family" and
                "MiRBase ID" columns.
            species: when truthy, keep only rows whose "Species ID" equals it.

        Returns:
            pd.DataFrame: outer join on "miR Family" with the "miR Family",
            "Gene Symbol" and "MiRBase ID" columns.
        """
        # Load data frame from file
        family_interactions_df = pd.read_table(file_resources["Predicted_Targets_Info.default_predictions.txt"],
                                               delimiter='\t', low_memory=True)

        # Select only the requested species' miRNA-target pairs
        if species:
            family_interactions_df = family_interactions_df[family_interactions_df["Species ID"] == species]

        family_interactions_df = family_interactions_df.filter(items=["miR Family", "Gene Symbol"],
                                                               axis="columns")
        family_to_miR_df = family_to_miR_df.filter(items=['miR family', 'MiRBase ID'], axis="columns")
        family_to_miR_df = family_to_miR_df.rename(columns={'miR family': 'miR Family'})

        # Map miRBase ID names to miR Family with a plain column merge.
        mir_interactions_df = family_interactions_df.merge(family_to_miR_df, how='outer', on="miR Family")

        # Standardize MiRBase ID to miRNA names obtained from RNA-seq hg19
        if self.strip_mirna_name:
            mir_interactions_df['MiRBase ID'] = mir_interactions_df['MiRBase ID'].str.lower()
            mir_interactions_df['MiRBase ID'] = mir_interactions_df['MiRBase ID'].str.replace(
                "-3p.*|-5p.*", "", regex=True)

        return mir_interactions_df