Source code for openomics.database.annotation

import os
from io import StringIO
from os.path import expanduser

import dask.dataframe as dd
import pandas as pd
from bioservices import BioMart
from pandas.errors import ParserError

from .base import Database

DEFAULT_CACHE_PATH = os.path.join(expanduser("~"), ".openomics")
DEFAULT_LIBRARY_PATH = os.path.join(expanduser("~"), ".openomics", "databases")

__all__ = ['ProteinAtlas', 'GTEx', 'NONCODE', 'EnsemblGenes', 'EnsemblGeneSequences', 'EnsemblTranscriptSequences',
           'EnsemblSNP', 'EnsemblSomaticVariation', 'TANRIC']

[docs]class ProteinAtlas(Database): """Loads the database from . Default path: . Default file_resources: { "": "", "": "", "": "", } """ COLUMNS_RENAME_DICT = { "Gene": "protein_name", "Ensembl": "gene_id", } def __init__(self, path="https://www.proteinatlas.org/download/", file_resources=None, col_rename=COLUMNS_RENAME_DICT, blocksize=0, verbose=False): """ Args: path: file_resources: col_rename: blocksize: verbose: """ if file_resources is None: file_resources = {} file_resources["proteinatlas.tsv.zip"] = "proteinatlas.tsv.zip" super().__init__(path, file_resources, col_rename=col_rename, blocksize=blocksize, verbose=verbose)
[docs] def load_dataframe(self, file_resources, blocksize=None): """ Args: file_resources: blocksize: """ if blocksize: df = dd.read_table(file_resources["proteinatlas.tsv"], blocksize=None if isinstance(blocksize, bool) else blocksize) else: df = pd.read_table(file_resources["proteinatlas.tsv"]) return df
[docs] def get_expressions(self, index="gene_name", type="Tissue RNA"): """Returns (NX) expressions from the proteinatlas.tsv table. :param index: a column name to index by. If column contain multiple values, then aggregate by median values. :param type: one of {"Tissue RNA", "Cell RNA", "Blood RNA", "Brain RNA", "RNA - "}. If "RNA - ", then select all types of expressions. Args: index: type: Returns: expressions (pd.DataFrame): """ columns = "|".join([type, index]) expressions = self.data.filter(regex=columns).groupby( index).median() return expressions
[docs]class GTEx(Database): """Loads the database from https://www.gtexportal.org/home/ . Default path: "https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/" . Default file_resources: { "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct": "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz", "GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt": "https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt", "GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct": "GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz", } """ COLUMNS_RENAME_DICT = { "Name": "gene_id", "Description": "gene_name" } def __init__(self, path="https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/", file_resources=None, col_rename=None, blocksize=0, verbose=False): """ Args: path: file_resources: col_rename: blocksize: verbose: """ if file_resources is None: file_resources = {} file_resources["GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"] = \ "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz" file_resources["GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"] = \ "https://storage.googleapis.com/gtex_analysis_v8/annotations/" \ "GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt" file_resources["GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz"] = \ "GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz" super().__init__(path, file_resources, col_rename=None, blocksize=blocksize, verbose=verbose)
[docs] def load_dataframe(self, file_resources, blocksize=None) -> pd.DataFrame: """ Args: file_resources: blocksize: """ gene_exp_medians = pd.read_csv( self.file_resources["GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct"], sep='\t', header=1, skiprows=1) gene_exp_medians["Name"] = gene_exp_medians["Name"].str.replace("[.]\d*", "", regex=True) gene_exp_medians = gene_exp_medians.rename(columns=self.COLUMNS_RENAME_DICT) # Must be done here gene_exp_medians.set_index(["gene_id", "gene_name"], inplace=True) # # Sample attributes (needed to get tissue type) # SampleAttributes = pd.read_table( # self.file_resources["GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"], # ) # SampleAttributes.set_index("SAMPID", inplace=True) # # # Transcript expression for all samples # transcript_exp = pd.read_csv( # self.file_resources["GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct"], # sep='\t', header=1, skiprows=1) # print("transcript_exp", transcript_exp.columns) # transcript_exp["gene_id"] = transcript_exp["gene_id"].str.replace("[.]\d*", "") # transcript_exp["transcript_id"] = transcript_exp["transcript_id"].str.replace("[.]\d*", "") # transcript_exp.set_index(["gene_id", "transcript_id"], inplace=True) # # # Join by sample with tissue type, group expressions by tissue type, and compute medians for each # transcript_exp_medians = transcript_exp.T \ # .join(SampleAttributes["SMTSD"], how="left") \ # .groupby("SMTSD") \ # .median() # # # Reset multilevel index # transcript_exp_medians.index.rename(name=None, inplace=True) # transcript_exp_medians = transcript_exp_medians.T.set_index( # pd.MultiIndex.from_tuples(tuples=transcript_exp_medians.T.index, names=["gene_id", "transcript_id"])) # # gene_transcript_exp_medians = pd.concat([gene_exp_medians, transcript_exp_medians], join="inner", copy=True) # print("gene_transcript_exp_medians \n", gene_transcript_exp_medians) return gene_exp_medians
[docs]class NONCODE(Database): """Loads the NONCODE database from http://noncode.org . Default path: "http://www.noncode.org/datadownload" . Default file_resources: { "NONCODEv6_human.fa": "NONCODEv6_human.fa.gz", "": "", "": "", } """ def __init__(self, path="http://www.noncode.org/datadownload", file_resources=None, col_rename=None, verbose=False, blocksize=None): """ Args: path: file_resources: col_rename: verbose: blocksize: """ if file_resources is None: file_resources = {} file_resources["NONCODEv5_source"] = os.path.join(path, "NONCODEv5_source") file_resources["NONCODEv5_Transcript2Gene"] = os.path.join(path, "NONCODEv5_Transcript2Gene") file_resources["NONCODEv5_human.func"] = os.path.join(path, "NONCODEv5_human.func") super().__init__(path, file_resources, col_rename=col_rename, blocksize=blocksize, verbose=verbose)
[docs] def load_dataframe(self, file_resources, blocksize=None): """ Args: file_resources: blocksize: """ source_df = pd.read_table(file_resources["NONCODEv5_source"], header=None) source_df.columns = ["NONCODE Transcript ID", "name type", "Gene ID"] transcript2gene_df = pd.read_table(file_resources["NONCODEv5_Transcript2Gene"], header=None) transcript2gene_df.columns = ["NONCODE Transcript ID", "NONCODE Gene ID"] if blocksize: self.noncode_func_df = dd.read_table(file_resources["NONCODEv5_human.func"], header=None, blocksize=None if isinstance(blocksize, bool) else blocksize) else: self.noncode_func_df = pd.read_table(file_resources["NONCODEv5_human.func"], header=None) self.noncode_func_df.columns = ["NONCODE Gene ID", "GO terms"] self.noncode_func_df.set_index("NONCODE Gene ID", inplace=True) # Convert to NONCODE transcript ID for the functional annotation data self.noncode_func_df["NONCODE Transcript ID"] = self.noncode_func_df.index.map( pd.Series(transcript2gene_df['NONCODE Transcript ID'].values, index=transcript2gene_df['NONCODE Gene ID']).to_dict()) # Convert NONCODE transcript ID to gene names source_gene_names_df = source_df[source_df["name type"] == "NAME"].copy() self.noncode_func_df["Gene Name"] = self.noncode_func_df["NONCODE Transcript ID"].map( pd.Series(source_gene_names_df['Gene ID'].values, index=source_gene_names_df['NONCODE Transcript ID']).to_dict())
[docs]class BioMartManager: """ A base class with functions to query Ensembl Biomarts "https://www.ensembl.org/biomart". """ DTYPES = { 'entrezgene_id': 'str', 'gene_biotype': 'category', 'transcript_biotype': 'category', 'chromosome_name': 'category', 'transcript_start': 'int', 'transcript_end': 'int', 'transcript_length': 'int', 'mirbase_id': 'str'} def __init__(self, dataset, attributes, host, filename): """ Args: dataset: attributes: host: filename: """ pass # Does not instantiate
[docs] def retrieve_dataset(self, host, dataset, attributes, filename, blocksize=None): """ Args: host: dataset: attributes: filename: blocksize: """ filename = os.path.join(DEFAULT_CACHE_PATH, f"{filename}.tsv") args = dict( sep="\t", low_memory=True, dtype=self.DTYPES, ) if os.path.exists(filename): if blocksize: df = dd.read_csv(filename, blocksize=None if isinstance(blocksize, bool) else blocksize, **args) else: df = pd.read_csv(filename, **args) else: df = self.query_biomart(host=host, dataset=dataset, attributes=attributes, cache=True, save_filename=filename) return df
[docs] def cache_dataset(self, dataset, dataframe, save_filename): """ Args: dataset: dataframe: save_filename: """ if not os.path.exists(DEFAULT_CACHE_PATH): os.makedirs(DEFAULT_CACHE_PATH, exist_ok=True) if save_filename is None: save_filename = os.path.join(DEFAULT_CACHE_PATH, "{}.tsv".format(dataset)) dataframe.to_csv(save_filename, sep="\t", index=False) return save_filename
[docs] def query_biomart(self, dataset, attributes, host="www.ensembl.org", cache=True, save_filename=None, blocksize=None): """ Args: dataset: attributes: host: cache: save_filename: blocksize: """ bm = BioMart(host=host) bm.new_query() bm.add_dataset_to_xml(dataset) for at in attributes: bm.add_attribute_to_xml(at) xml_query = bm.get_xml() print("Querying {} from {} with attributes {}...".format(dataset, host, attributes)) results = bm.query(xml_query) try: if blocksize: df = dd.read_csv(StringIO(results), header=None, names=attributes, sep="\t", low_memory=True, dtype=self.DTYPES, blocksize=None if isinstance(blocksize, bool) else blocksize) else: df = pd.read_csv(StringIO(results), header=None, names=attributes, sep="\t", low_memory=True, dtype=self.DTYPES) except Exception as e: raise ParserError(f'BioMart Query Result: {results}') if cache: self.cache_dataset(dataset, df, save_filename) return df
[docs]class EnsemblGenes(BioMartManager, Database): COLUMNS_RENAME_DICT = {'ensembl_gene_id': 'gene_id', 'external_gene_name': 'gene_name', 'ensembl_transcript_id': 'transcript_id', 'external_transcript_name': 'transcript_name', 'rfam': 'Rfams'} def __init__(self, biomart="hsapiens_gene_ensembl", attributes=None, host="www.ensembl.org", blocksize=None): # Do not call super().__init__() """ Args: biomart: attributes: host: blocksize: """ if attributes is None: attributes = ['ensembl_gene_id', 'external_gene_name', 'ensembl_transcript_id', 'external_transcript_name', 'chromosome_name', 'transcript_start', 'transcript_end', 'transcript_length', 'gene_biotype', 'transcript_biotype', ] self.filename = "{}.{}".format(biomart, self.__class__.__name__) self.biomart = biomart self.host = host self.data = self.load_data(dataset=biomart, attributes=attributes, host=self.host, filename=self.filename, blocksize=blocksize) self.data = self.data.rename(columns=self.COLUMNS_RENAME_DICT) def name(self): return f"{super().name()} {self.biomart}"
[docs] def load_data(self, dataset, attributes, host, filename=None, blocksize=None): """ Args: dataset: attributes: host: filename: blocksize: """ df = self.retrieve_dataset(host, dataset, attributes, filename, blocksize=blocksize) return df
[docs]class EnsemblGeneSequences(EnsemblGenes): def __init__(self, biomart="hsapiens_gene_ensembl", attributes=None, host="www.ensembl.org", blocksize=None): """ Args: biomart: attributes: host: blocksize: """ if attributes is None: attributes = ['ensembl_gene_id', 'gene_exon_intron', 'gene_flank', 'coding_gene_flank', 'gene_exon', 'coding'] self.filename = "{}.{}".format(biomart, self.__class__.__name__) self.biomart = biomart self.host = host self.df = self.load_data(dataset=biomart, attributes=attributes, host=self.host, filename=self.filename, blocksize=blocksize) self.data = self.data.rename(columns=self.COLUMNS_RENAME_DICT)
[docs]class EnsemblTranscriptSequences(EnsemblGenes): def __init__(self, biomart="hsapiens_gene_ensembl", attributes=None, host="www.ensembl.org", blocksize=None): """ Args: biomart: attributes: host: blocksize: """ if attributes is None: attributes = ['ensembl_transcript_id', 'transcript_exon_intron', 'transcript_flank', 'coding_transcript_flank', '5utr', '3utr'] self.filename = "{}.{}".format(biomart, self.__class__.__name__) self.biomart = biomart self.host = host self.df = self.load_data(dataset=biomart, attributes=attributes, host=self.host, filename=self.filename, blocksize=blocksize) self.data = self.data.rename(columns=self.COLUMNS_RENAME_DICT)
[docs]class EnsemblSNP(EnsemblGenes): def __init__(self, biomart="hsapiens_snp", attributes=None, host="www.ensembl.org", blocksize=None): """ Args: biomart: attributes: host: blocksize: """ if attributes is None: attributes = ['synonym_name', 'variation_names', 'minor_allele', 'associated_variant_risk_allele', 'ensembl_gene_stable_id', 'ensembl_transcript_stable_id', 'phenotype_name', 'chr_name', 'chrom_start', 'chrom_end'] self.filename = "{}.{}".format(biomart, self.__class__.__name__) self.biomart = biomart self.host = host self.data = self.data.rename(columns=self.COLUMNS_RENAME_DICT)
[docs]class EnsemblSomaticVariation(EnsemblGenes): def __init__(self, biomart="hsapiens_snp_som", attributes=None, host="www.ensembl.org", blocksize=None): """ Args: biomart: attributes: host: blocksize: """ if attributes is None: attributes = ['somatic_variation_name', 'somatic_source_name', 'somatic_allele', 'somatic_minor_allele', 'somatic_clinical_significance', 'somatic_validated', 'somatic_transcript_location', 'somatic_mapweight', 'somatic_chromosome_start', 'somatic_chromosome_end'] self.filename = "{}.{}".format(biomart, self.__class__.__name__) self.biomart = biomart self.host = host self.data = self.data.rename(columns=self.COLUMNS_RENAME_DICT)
[docs]class TANRIC(Database): def __init__(self, path, file_resources=None, col_rename=None, blocksize=0, verbose=False): """ Args: path: file_resources: col_rename: blocksize: verbose: """ super().__init__(path, file_resources, col_rename=col_rename, blocksize=blocksize, verbose=verbose)
[docs] def load_dataframe(self, file_resources, blocksize=None): """ Args: file_resources: blocksize: """ pass
[docs] def get_expressions(self, genes_index): """Preprocess LNCRNA expression file obtained from TANRIC MDAnderson, and replace ENSEMBL gene ID to HUGO gene names (HGNC). This function overwrites the GenomicData.process_expression_table() function which processes TCGA-Assembler data. TANRIC LNCRNA expression values are log2 transformed Args: genes_index: """ df = pd.read_table(self.file_resources["TCGA-LUAD-rnaexpr.tsv"]) df[genes_index] = df[genes_index].str.replace("[.]\d*", "") # Removing .# ENGS gene version number at the end df = df[~df[genes_index].duplicated(keep='first')] # Remove duplicate genes # Drop NA gene rows df.dropna(axis=0, inplace=True) # Transpose matrix to patients rows and genes columns df.index = df[genes_index] df = df.T.iloc[1:, :] # Change index string to bcr_sample_barcode standard def change_patient_barcode(s): if "Normal" in s: return s[s.find('TCGA'):] + "-11A" elif "Tumor" in s: return s[s.find('TCGA'):] + "-01A" else: return s df.index = df.index.map(change_patient_barcode) df.index.name = "gene_id" return df