"""openomics.multiomics: defines the MultiOmics container for multi-omics cohort data."""

import logging
from typing import List, Dict, Union

import pandas as pd

import openomics
from .clinical import (
    ClinicalData,
    HISTOLOGIC_SUBTYPE_COL,
    PATHOLOGIC_STAGE_COL,
    TUMOR_NORMAL_COL,
    PREDICTED_SUBTYPE_COL,
)
from .genomics import SomaticMutation, CopyNumberVariation, DNAMethylation
from .imageomics import WholeSlideImage
from .proteomics import Protein
from .transcriptomics import MessengerRNA, MicroRNA, LncRNA, Expression


class MultiOmics:
    """A data object which holds multiple -omics data for a single clinical cohort.

    Each added omic is stored both as an attribute named after the omic (the
    value returned by ``omic.name()``) and, through its ``expressions`` table,
    in the ``data`` dictionary under the same key. Clinical tables, when added,
    are exposed in ``data`` under the keys "PATIENTS", "BIOSPECIMENS", "DRUGS"
    and "SAMPLES".
    """

    def __init__(self, cohort_name, omics_data=None):
        """Create the cohort container and optionally register omics.

        Args:
            cohort_name (str): the clinical cohort name
            omics_data (iterable): optional omic objects; each one is passed
                to :meth:`add_omic`.
        """
        self._cohort_name = cohort_name
        # Names of the registered omics, in insertion order.
        self._omics = []

        # This is a data dictionary accessor to retrieve individual -omic data
        self.data = {}

        if omics_data:
            for omics in omics_data:
                self.add_omic(omics)

    def add_omic(self,
                 omic_data: "Expression",
                 initialize_annotations: bool = True):
        """Adds an omic object to the Multiomics such that the samples in omic
        matches the samples existing in the other omics.

        Args:
            omic_data (Expression): The omic to add, e.g., MessengerRNA,
                MicroRNA, LncRNA, etc.
            initialize_annotations (bool): default True. If true, initializes
                the annotation dataframe in the omic object
        """
        omic_name = omic_data.name()
        setattr(self, omic_name, omic_data)

        # Compare against the *name*, not the bound method, so that adding the
        # same omic twice replaces it instead of registering a duplicate.
        if omic_name not in self._omics:
            self._omics.append(omic_name)

        # dictionary as data accessor to the expression data
        expressions = omic_data.expressions
        self.data[omic_name] = expressions

        # Initialize annotation
        if initialize_annotations:
            omic_data.initialize_annotations(index=omic_data.gene_index,
                                             gene_list=None)

        # The annotations table may not exist when initialization was skipped.
        annotations = getattr(omic_data, "annotations", None)
        logging.info(
            "%s %s, indexed by: %s",
            omic_name,
            expressions.shape if hasattr(expressions, "shape") else None,
            annotations.index.name if annotations is not None else None,
        )

    def add_clinical_data(self, clinical: "openomics.clinical.ClinicalData", **kwargs):
        """Add a ClinicalData instance to the MultiOmics instance.

        Args:
            clinical (openomics.clinical.ClinicalData): the clinical data to
                attach to this cohort.
            **kwargs: forwarded to :meth:`build_samples` (e.g. ``agg_by``).

        Raises:
            TypeError: if ``clinical`` is not a ClinicalData instance.
        """
        if not isinstance(clinical, ClinicalData):
            raise TypeError("Must pass a ClinicalData in, not a file path.")

        self.clinical = clinical

        self.data["PATIENTS"] = self.clinical.patient
        if hasattr(self.clinical, "biospecimen"):
            self.data["BIOSPECIMENS"] = self.clinical.biospecimen
        if hasattr(self.clinical, "drugs"):
            self.data["DRUGS"] = self.clinical.drugs

        self.build_samples(**kwargs)

    def get_omics_list(self):
        """Return the list of registered omic names, in insertion order."""
        return self._omics

    def __getitem__(self, item: str):
        """This function allows the MultiOmicData class objects to access
        individual omics by a dictionary lookup, e.g. openomics["MicroRNA"]

        The lookup is case-insensitive. Besides omic names, the keys
        "patients", "samples" and "drugs" return the matching clinical table.

        Args:
            item (str): a string of the class name

        Raises:
            AttributeError: if a known omic or clinical table was requested
                but has not been added to this object.
            Exception: if ``item`` matches no omic or clinical accessor.
        """
        key = item.lower()

        # Registered omics first: this also resolves omic types that are not
        # part of the hard-coded list below.
        for omic_name in self._omics:
            if omic_name.lower() == key:
                return getattr(self, omic_name)

        # Known omic types that were never added fail with AttributeError.
        known_omics = (
            MessengerRNA,
            MicroRNA,
            LncRNA,
            WholeSlideImage,
            SomaticMutation,
            CopyNumberVariation,
            DNAMethylation,
            Protein,
        )
        for omic_cls in known_omics:
            if key == omic_cls.name().lower():
                return getattr(self, omic_cls.name())

        if key == "patients":
            return self.clinical.patient
        if key == "samples":
            return self.clinical.samples
        if key == "drugs":
            return self.clinical.drugs

        raise Exception(
            'String accessor must be one of {"MessengerRNA", "MicroRNA", "LncRNA", "Protein", etc.}'
        )

    def remove_duplicate_genes(self):
        """Removes duplicate genes between any omics such that the gene index
        across all omics has no duplicates.

        For every ordered pair of distinct omics (A, B), the genes present in
        both are dropped from A.
        """
        for omic_A in self._omics:
            for omic_B in self._omics:
                if omic_A == omic_B:
                    continue
                first = getattr(self, omic_A)
                second = getattr(self, omic_B)
                shared = set(first.get_genes_list()) & set(
                    second.get_genes_list())
                first.drop_genes(shared)

    def build_samples(self, agg_by="union"):
        """Running this function will build the index of all samples across
        the different omics (either by a union or intersection) and store it
        under ``data["SAMPLES"]``. When clinical data is attached, the clinical
        samples table is rebuilt from that index first.

        Args:
            agg_by (str): ["union", "intersection"]

        Raises:
            ValueError: if ``agg_by`` is not "union" or "intersection".
        """
        # make sure at least one ExpressionData present
        if not self._omics:
            logging.debug(
                "build_samples() does nothing. Must add at least one omic to this MultiOmics object."
            )
            return

        if agg_by not in ("union", "intersection"):
            raise ValueError(
                'agg_by must be "union" or "intersection", got {!r}'.format(
                    agg_by))

        sample_indexes = [self.data[omic].index for omic in self._omics]
        if agg_by == "union":
            all_samples = pd.Index([])
            for index in sample_indexes:
                all_samples = all_samples.union(index)
        else:
            # Seed with the first omic: intersecting from an empty index would
            # always produce an empty result.
            all_samples = sample_indexes[0]
            for index in sample_indexes[1:]:
                all_samples = all_samples.intersection(index)

        if hasattr(self, "clinical"):
            self.clinical.build_clinical_samples(all_samples)
            self.data["SAMPLES"] = self.clinical.samples.index
        else:
            self.data["SAMPLES"] = all_samples

    def __dir__(self):
        """List only the keys of the ``data`` accessor dictionary."""
        return list(self.data.keys())

    def match_samples(self, omics) -> pd.Index:
        """Return the index of bcr_sample_barcodes of the intersection of
        samples from all modalities

        Args:
            omics: An array of modalities (keys of ``self.data``); must hold
                at least one entry.

        Returns:
            matched_samples: A pandas Index
        """
        # TODO check that for single modalities, this fetch all patients
        matched_samples = self.data[omics[0]].index.copy()

        # The first modality seeds the result, so only the others are joined;
        # joining an index with itself would multiply duplicated labels.
        for omic in omics[1:]:
            matched_samples = matched_samples.join(self.data[omic].index,
                                                   how="inner")

        return matched_samples

    def load_data(
        self,
        omics,
        target=None,
        pathologic_stages=None,
        histological_subtypes=None,
        predicted_subtypes=None,
        tumor_normal=None,
        samples_barcode=None,
    ):
        # type: (Union[List[str], str], List[str], List[str], List[str], List[str], List[str], List[str]) -> (Dict[str, pd.DataFrame], pd.DataFrame)
        """Build the expression tables (X) and clinical targets (y) for the
        samples shared by the requested omics.

        Args:
            omics (list): A list of the data modalities to load. Pass "all"
                or None to select all modalities; a single modality name is
                also accepted.
            target (list): The clinical data fields to include in y. Default
                None, which selects ["pathologic_stage"].
            pathologic_stages (list): Only fetch samples having certain stages
                in their corresponding patient's clinical data. For instance,
                ["Stage I", "Stage II"] will only fetch samples from Stage I and
                Stage II patients. Default is None which fetches all pathologic
                stages.
            histological_subtypes: A list specifying the histological subtypes
                to fetch. Default is None which fetches all histological
                subtypes.
            predicted_subtypes: A list specifying the predicted subtypes to
                fetch. Default is None which fetches all predicted subtypes.
            tumor_normal: ["Tumor", "Normal"]. Default is None, which fetches
                all tumor or normal sample types.
            samples_barcode: A list of sample's barcode. If not None, it
                replaces the matched samples, so only data for the samples
                provided in this list is fetched.

        Returns:
            (X, y): X is a dictionary mapping each omic name to its expression
            table restricted to the selected samples. y is the table of target
            fields for those samples (rows with missing targets dropped), or
            None when no clinical data is attached.
        """
        if target is None:
            target = ["pathologic_stage"]

        if omics is None or (isinstance(omics, str) and omics == "all"):
            omics = self._omics
        elif isinstance(omics, str):
            # A bare name would otherwise be iterated character by character.
            omics = [omics]

        matched_samples = self.match_samples(omics)

        if samples_barcode is not None:
            matched_samples = samples_barcode

        if hasattr(self, "clinical") and isinstance(self.clinical,
                                                    ClinicalData):
            # Build targets clinical data
            y = self.get_sample_attributes(matched_samples)

            # Select only samples with certain cancer stage or subtype
            filters = (
                (PATHOLOGIC_STAGE_COL, pathologic_stages),
                (HISTOLOGIC_SUBTYPE_COL, histological_subtypes),
                (PREDICTED_SUBTYPE_COL, predicted_subtypes),
                (TUMOR_NORMAL_COL, tumor_normal),
            )
            for column, allowed in filters:
                if allowed:
                    y = y[y[column].isin(allowed)]

            # Filter y target column labels and drop samples lacking a target
            y = y.filter(target).dropna(axis=0)
            matched_samples = y.index
        else:
            y = None

        # Build expression matrix for each omic, indexed by matched_samples
        X_multiomics = {
            omic: self.data[omic].loc[matched_samples,
                                      self[omic].get_genes_list()]
            for omic in omics
        }

        return X_multiomics, y

    def get_sample_attributes(self, matched_samples):
        """Fetch patient's clinical data for each given samples barcodes in the
        matched_samples

        Args:
            matched_samples: A list of sample barcodes

        Returns:
            The clinical samples table reindexed by ``matched_samples`` when
            clinical data with a samples table is attached. Otherwise the
            result of reindexing the stored ``data["SAMPLES"]`` index.
        """
        clinical = getattr(self, "clinical", None)
        if clinical is not None and hasattr(clinical, "samples"):
            # data["SAMPLES"] only holds the sample index; reindexing an Index
            # returns an (index, indexer) tuple rather than a table of
            # attributes, so read the samples table itself.
            return clinical.samples.reindex(matched_samples)
        return self.data["SAMPLES"].reindex(matched_samples)

    def print_sample_sizes(self):
        """Print the shape of every entry of the ``data`` dictionary."""
        for key, table in self.data.items():
            size = table.shape if hasattr(table,
                                          "shape") else "Didn't import data"
            print(key, size)

    def annotate_samples(self, dictionary):
        """This function adds a "subtypes" column to the patients table kept
        in ``data["PATIENTS"]``. For instance, patients were classified into
        subtypes based on their expression profile using k-means, then, to use
        this function, do:

        annotate_samples(dict(zip(<patient index>, <list of corresponding patient's subtypes>)))

        Args:
            dictionary: A dictionary mapping patient's index to a subtype
        """
        # NOTE(review): load_data() filters on PREDICTED_SUBTYPE_COL of the
        # samples table, while this writes a "subtypes" column to a new copy
        # of the patients table -- confirm whether the two are meant to line up.
        patients = self.data["PATIENTS"]
        self.data["PATIENTS"] = patients.assign(
            subtypes=patients[self.clinical.patient_column].map(dictionary))