import logging
from typing import List, Dict, Union
import pandas as pd
import openomics
from .clinical import (
ClinicalData,
HISTOLOGIC_SUBTYPE_COL,
PATHOLOGIC_STAGE_COL,
TUMOR_NORMAL_COL,
PREDICTED_SUBTYPE_COL,
)
from .genomics import SomaticMutation, CopyNumberVariation, DNAMethylation
from .imageomics import WholeSlideImage
from .proteomics import Protein
from .transcriptomics import MessengerRNA, MicroRNA, LncRNA, Expression
class MultiOmics:
    """A data object which holds multiple -omics data for a single clinical cohort.

    Every omic registered through :meth:`add_omic` is stored as an instance
    attribute named after ``omic.name()``, and its expression table is exposed
    through the ``data`` dictionary under the same key. Clinical tables, once
    attached with :meth:`add_clinical_data`, are exposed under the
    ``"PATIENTS"``, ``"BIOSPECIMENS"``, ``"DRUGS"`` and ``"SAMPLES"`` keys.
    """

    def __init__(self, cohort_name, omics_data=None):
        """
        Args:
            cohort_name (str): the clinical cohort name
            omics_data (iterable, optional): omic objects to register right
                away; each one is passed to :meth:`add_omic`.
        """
        self._cohort_name = cohort_name
        self._omics = []

        # This is a data dictionary accessor to retrieve individual -omic data
        self.data = {}

        if omics_data:
            for omics in omics_data:
                self.add_omic(omics)

    def add_omic(self,
                 omic_data: "Expression",
                 initialize_annotations: bool = True):
        """Adds an omic object to the Multiomics such that the samples in omic
        matches the samples existing in the other omics.

        Adding an omic whose name is already registered replaces the stored
        object and expression table without duplicating the name in the list
        returned by :meth:`get_omics_list`.

        Args:
            omic_data (Expression): The omic to add, e.g., MessengerRNA,
                MicroRNA, LncRNA, etc.
            initialize_annotations (bool): default True. If true, initializes
                the annotation dataframe in the omic object
        """
        name = omic_data.name()
        setattr(self, name, omic_data)

        # Compare the *name string* (not the bound method) so that re-adding
        # the same omic does not register it twice.
        if name not in self._omics:
            self._omics.append(name)

        # dictionary as data accessor to the expression data
        self.data[name] = omic_data.expressions

        # Initialize annotation
        if initialize_annotations:
            omic_data.initialize_annotations(index=omic_data.gene_index,
                                             gene_list=None)

        expressions = self.data[name]
        # The annotations table may not exist when initialization was skipped.
        annotations = getattr(omic_data, "annotations", None)
        # logging uses %-style lazy formatting: the first argument must be the
        # format string, the remaining ones its arguments.
        logging.info(
            "%s %s, indexed by: %s",
            name,
            expressions.shape if hasattr(expressions, "shape") else ": None",
            annotations.index.name if annotations is not None else None,
        )

    def add_clinical_data(self, clinical: "ClinicalData", **kwargs):
        """Add a ClinicalData instance to the MultiOmics instance.

        Registers the patient table (and the biospecimen / drugs tables when
        the clinical object has them) in ``data`` and rebuilds the samples.

        Args:
            clinical (openomics.clinical.ClinicalData): the clinical data to
                attach.
            **kwargs: forwarded to :meth:`build_samples` (e.g. ``agg_by``).

        Raises:
            Exception: if ``clinical`` is not a ClinicalData instance.
        """
        if not isinstance(clinical, ClinicalData):
            raise Exception("Must pass a ClinicalData in, not a file path.")

        self.clinical = clinical

        self.data["PATIENTS"] = self.clinical.patient
        if hasattr(self.clinical, "biospecimen"):
            self.data["BIOSPECIMENS"] = self.clinical.biospecimen
        if hasattr(self.clinical, "drugs"):
            self.data["DRUGS"] = self.clinical.drugs

        self.build_samples(**kwargs)

    def get_omics_list(self):
        """Return the list of names of the omics registered so far."""
        return self._omics

    def __getitem__(self, item: str):
        """This function allows the MultiOmicData class objects to access
        individual omics by a dictionary lookup, e.g. openomics["MicroRNA"]

        The lookup is case-insensitive. Any omic registered through
        :meth:`add_omic` can be retrieved by its name; the clinical tables
        are available as "patients", "samples", "drugs" and "biospecimens".

        Args:
            item (str): a string of the class name

        Raises:
            AttributeError: if ``item`` names a known omic type that has not
                been added to this object.
            Exception: if ``item`` is not a recognized accessor.
        """
        key = item.lower()

        # Registered omics first, so custom Expression subclasses work too.
        for name in self._omics:
            if name.lower() == key:
                return getattr(self, name)

        if key == "patients":
            return self.clinical.patient
        if key == "samples":
            return self.clinical.samples
        if key == "drugs":
            return self.clinical.drugs
        if key == "biospecimens":
            return self.clinical.biospecimen

        # Known omic types that were never added: the attribute lookup raises
        # AttributeError, which callers of the previous implementation saw.
        known_omics = (
            MessengerRNA,
            MicroRNA,
            LncRNA,
            WholeSlideImage,
            SomaticMutation,
            CopyNumberVariation,
            DNAMethylation,
            Protein,
        )
        for omic_class in known_omics:
            if key == omic_class.name().lower():
                return getattr(self, omic_class.name())

        raise Exception(
            'String accessor must be one of {"MessengerRNA", "MicroRNA", "LncRNA", "Protein", etc.}'
        )

    def remove_duplicate_genes(self):
        """Removes duplicate genes between any omics such that the gene index
        across all omics has no duplicates.

        For every ordered pair of distinct omics (A, B), the genes present in
        both are dropped from A.
        """
        for omic_A in self._omics:
            for omic_B in self._omics:
                if omic_A == omic_B:
                    continue
                # Gene lists are re-read on every pair because drop_genes()
                # may have changed them on a previous iteration.
                shared_genes = (set(getattr(self, omic_A).get_genes_list())
                                & set(getattr(self, omic_B).get_genes_list()))
                getattr(self, omic_A).drop_genes(shared_genes)

    def build_samples(self, agg_by="union"):
        """Running this function will build a dataframe for all samples across
        the different omics (either by a union or intersection).

        The result is stored in ``data["SAMPLES"]``. When clinical data is
        attached, the clinical samples are rebuilt from the aggregated sample
        index and ``data["SAMPLES"]`` is the index of the clinical samples.
        Does nothing when no omic has been added yet.

        Args:
            agg_by (str): ["union", "intersection"]

        Raises:
            ValueError: if ``agg_by`` is not "union" or "intersection".
        """
        # make sure at least one ExpressionData present
        if len(self._omics) < 1:
            logging.debug(
                "build_samples() does nothing. Must add at least one omic to this MultiOmics object."
            )
            return

        sample_indexes = [self.data[omic].index for omic in self._omics]

        if agg_by == "union":
            all_samples = pd.Index([])
            for sample_index in sample_indexes:
                all_samples = all_samples.union(sample_index)
        elif agg_by == "intersection":
            # Seed with the first omic: intersecting with an empty index would
            # always give an empty result. Copy so the omic's own index object
            # is never shared with downstream consumers.
            all_samples = sample_indexes[0].copy()
            for sample_index in sample_indexes[1:]:
                all_samples = all_samples.intersection(sample_index)
        else:
            raise ValueError(
                'agg_by must be "union" or "intersection", got {!r}'.format(
                    agg_by))

        if hasattr(self, "clinical"):
            self.clinical.build_clinical_samples(all_samples)
            self.data["SAMPLES"] = self.clinical.samples.index
        else:
            self.data["SAMPLES"] = all_samples

    def __dir__(self):
        """List the keys of the ``data`` dictionary."""
        return list(self.data.keys())

    def match_samples(self, omics) -> pd.Index:
        """Return the index of bcr_sample_barcodes of the intersection of
        samples from all modalities

        Args:
            omics: A non-empty array of modalities (keys of ``data``)

        Returns:
            matched_samples: A pandas Index list
        """
        # TODO check that for single modalities, this fetch all patients
        matched_samples = self.data[omics[0]].index.copy()

        for omic in omics:
            matched_samples = matched_samples.join(self.data[omic].index,
                                                   how="inner")

        return matched_samples

    def load_data(
        self,
        omics,
        target=None,
        pathologic_stages=None,
        histological_subtypes=None,
        predicted_subtypes=None,
        tumor_normal=None,
        samples_barcode=None,
    ):
        # type: (Union[List[str], str], List[str], List[str], List[str], List[str], List[str], List[str]) -> (Dict[str, pd.DataFrame], pd.DataFrame)
        """Load the expression matrices of the selected modalities together
        with the matching clinical targets.

        Args:
            omics (list): A list of the data modalities to load. Pass "all"
                (or None) to select all modalities. A single modality name
                given as a string is also accepted.
            target (list): The clinical data fields to include in y.
                Defaults to ["pathologic_stage"].
            pathologic_stages (list): Only fetch samples having certain stages
                in their corresponding patient's clinical data. For instance,
                ["Stage I", "Stage II"] will only fetch samples from Stage I and
                Stage II patients. Default is None which fetches all pathologic
                stages.
            histological_subtypes: A list specifying the histological subtypes
                to fetch. Default is None which fetches all histological
                subtypes.
            predicted_subtypes: A list specifying the predicted subtypes to
                fetch. Default is None which fetches all predicted subtypes.
            tumor_normal: ["Tumor", "Normal"]. Default is None, which fetches
                all tumor or normal sample types.
            samples_barcode: A list of sample's barcode. If not None, only fetch
                data with matching samples provided in this list.

        Returns:
            (X, y): X is a dictionary mapping each modality name to its
                expression matrix restricted to the selected samples. y is the
                table of target fields for those samples (rows with missing
                values dropped), or None when no clinical data is attached.
        """
        if target is None:
            target = ["pathologic_stage"]

        if omics == "all" or omics is None:
            omics = self._omics
        elif isinstance(omics, str):
            # A bare modality name would otherwise be iterated per character.
            omics = [omics]

        matched_samples = self.match_samples(omics)

        if samples_barcode is not None:
            matched_samples = samples_barcode

        if hasattr(self, "clinical") and isinstance(self.clinical,
                                                    ClinicalData):
            # Build targets clinical data
            y = self.get_sample_attributes(matched_samples)

            # Select only samples with certain cancer stage or subtype
            if pathologic_stages:
                y = y[y[PATHOLOGIC_STAGE_COL].isin(pathologic_stages)]
            if histological_subtypes:
                y = y[y[HISTOLOGIC_SUBTYPE_COL].isin(histological_subtypes)]
            if predicted_subtypes:
                y = y[y[PREDICTED_SUBTYPE_COL].isin(predicted_subtypes)]
            if tumor_normal:
                y = y[y[TUMOR_NORMAL_COL].isin(tumor_normal)]

            # Filter y target column labels, then drop incomplete rows
            y = y.filter(target).dropna(axis=0)

            matched_samples = y.index
        else:
            y = None

        # Build expression matrix for each omic, indexed by matched_samples
        X_multiomics = {}
        for omic in omics:
            X_multiomics[omic] = self.data[omic].loc[
                matched_samples, self[omic].get_genes_list()]

        return X_multiomics, y

    def get_sample_attributes(self, matched_samples):
        """Fetch patient's clinical data for each given samples barcodes in the
        matched_samples

        Args:
            matched_samples: A list of sample barcodes

        Returns:
            The clinical samples table reindexed by ``matched_samples`` when
            clinical data is attached; otherwise the result of reindexing
            ``data["SAMPLES"]``.
        """
        clinical = getattr(self, "clinical", None)
        samples = getattr(clinical, "samples", None)
        if samples is not None:
            # data["SAMPLES"] only holds the *index* of the clinical samples;
            # reindexing it would return an (index, indexer) tuple with no
            # clinical columns.
            # NOTE(review): assumes clinical.samples is a DataFrame - confirm.
            return samples.reindex(matched_samples)

        return self.data["SAMPLES"].reindex(matched_samples)

    def print_sample_sizes(self):
        """Print the shape of every table registered in ``data``."""
        for omic in self.data:
            print(
                omic,
                self.data[omic].shape
                if hasattr(self.data[omic], "shape") else "Didn't import data",
            )

    def annotate_samples(self, dictionary):
        """This function adds a "subtypes" column to the patients table held
        in ``data["PATIENTS"]``. For instance, patients were classified into
        subtypes based on their expression profile using k-means, then, to
        use this function, do:

        annotate_samples(dict(zip(<patient index>, <list of corresponding patient's subtypes>)))

        The new column is obtained by mapping the values of the clinical
        patient column through ``dictionary``.

        NOTE(review): the column is named "subtypes" and is written to
        ``data["PATIENTS"]`` only; confirm whether ``load_data`` is expected
        to see it through ``predicted_subtypes``.

        Args:
            dictionary: A dictionary mapping patient's index to a subtype
        """
        self.data["PATIENTS"] = self.data["PATIENTS"].assign(
            subtypes=self.data["PATIENTS"][
                self.clinical.patient_column].map(dictionary))