Preparing data for downstream analyses#

Prepare the data for classification#

# Load all omics tables together with the pathologic_stage target.
X_multiomics, y = luad_data.load_data(
    omics="all",
    target=["pathologic_stage"],
    remove_duplicates=True,
)

# Print the shape of each omics table followed by the shape of the target.
print(
    *(X_multiomics[omic_name].shape
      for omic_name in ("MessengerRNA", "MicroRNA", "LncRNA")),
    y.shape,
)

(338, 20472) (338, 1870) (338, 12727) (338, 1)

# Show the target table returned by load_data (one pathologic_stage column).
print(y)

	pathologic_stage
TCGA-05-4390-01A	Stage I
TCGA-05-4405-01A	Stage I
TCGA-05-4410-01A	Stage I
TCGA-05-4417-01A	Stage I
TCGA-05-4424-01A	Stage II
TCGA-05-4427-01A	Stage II
TCGA-05-4433-01A	Stage I
TCGA-05-5423-01A	Stage II
TCGA-05-5425-01A	Stage II
TCGA-05-5428-01A	Stage II
TCGA-05-5715-01A	Stage I
TCGA-38-4631-01A	Stage I
TCGA-38-7271-01A	Stage I
TCGA-38-A44F-01A	Stage I
TCGA-44-2655-11A	Stage I

336 rows × 1 columns

Log2-transform the mRNA and microRNA expression values (the lncRNA transform is commented out below)#

def expression_val_transform(x):
    """Return ``log2(x + 1)`` for an expression value.

    The +1 pseudo-count maps a value of 0 to 0 instead of -inf.
    """
    shifted = x + 1
    return np.log2(shifted)
# Transform each table in a single vectorized call rather than cell by cell
# with DataFrame.applymap: applymap invokes the Python function once per
# element and is deprecated in pandas >= 2.1. expression_val_transform only
# uses `+` and np.log2, both of which operate directly on a whole DataFrame.
X_multiomics['MessengerRNA'] = expression_val_transform(X_multiomics['MessengerRNA'])
X_multiomics['MicroRNA'] = expression_val_transform(X_multiomics['MicroRNA'])
# The LncRNA transform is disabled:
# X_multiomics['LncRNA'] = expression_val_transform(X_multiomics['LncRNA'])

Classification of Cancer Stage#

from sklearn import preprocessing
from sklearn import metrics
from sklearn.svm import SVC, LinearSVC
import sklearn.linear_model
from sklearn.model_selection import train_test_split

# Encode the stage strings as integer class ids.
# LabelEncoder expects a 1-D sequence of labels, while `y` is a one-column
# table, so pass the pathologic_stage column itself; this avoids the
# column-vector conversion warning. fit_transform fits the encoder and returns
# the encoded labels in one step.
binarizer = preprocessing.LabelEncoder()
binarizer.fit_transform(y["pathologic_stage"])

array([0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0])

# Flatten the one-column target table once so every scikit-learn call below
# receives 1-D labels (avoids column-vector conversion warnings).
stage_labels = y.values.ravel()

for omic in ["MessengerRNA", "MicroRNA"]:
    print(omic)

    # Stratified 70/30 split; the seed is drawn at random, so results vary
    # from run to run.
    X_train, X_test, Y_train, Y_test = \
        train_test_split(X_multiomics[omic], stage_labels, test_size=0.3,
                         random_state=np.random.randint(0, 10000), stratify=stage_labels)
    print(X_train.shape, X_test.shape)

    # Mean-centre the features (with_std=False: no variance scaling).
    # The scaler is fitted on the training split only, so no test-set
    # statistics leak into training, and the SAME transform is then applied
    # to the test split so the model is evaluated on features in the space it
    # was trained on.
    scaler = sklearn.preprocessing.StandardScaler(copy=True, with_mean=True, with_std=False)
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # NOTE(review): a recorded run of this cell printed "NONZERO 0" and a
    # Stage II recall of 0.00, i.e. the L1 penalty zeroed every coefficient;
    # C=1e-2 may be too strong a regularisation here - confirm.
    model = LinearSVC(C=1e-2, penalty='l1', class_weight='balanced', dual=False, multi_class="ovr")
#     model = sklearn.linear_model.LogisticRegression(C=1e-0, penalty='l1', fit_intercept=False, class_weight="balanced")
#     model = SVC(C=1e0, kernel="rbf", class_weight="balanced", decision_function_shape="ovo")

    model.fit(X=X_train, y=Y_train)

    # Number of features kept by the L1 penalty.
    print("NONZERO", len(np.nonzero(model.coef_)[0]))

    # Predict on the training split once and pass the arguments by keyword so
    # y_true / y_pred cannot be swapped.
    train_pred = model.predict(X_train)
    print("Training accuracy", metrics.accuracy_score(y_true=Y_train, y_pred=train_pred))
    print(metrics.classification_report(y_pred=model.predict(X_test), y_true=Y_test))

MessengerRNA
(254, 20472) (109, 20472)
NONZERO 0
Training accuracy 0.6929133858267716
             precision    recall  f1-score   support

    Stage I       0.69      1.00      0.82        75
   Stage II       0.00      0.00      0.00        34

avg / total       0.47      0.69      0.56       109

MicroRNA
(254, 1870) (109, 1870)
NONZERO 0
Training accuracy 0.6929133858267716
             precision    recall  f1-score   support

    Stage I       0.69      1.00      0.82        75
   Stage II       0.00      0.00      0.00        34

avg / total       0.47      0.69      0.56       109