import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from IPython.display import display, HTML
Predicting ER status and imputing missing values
Aim
The goal of this report is to predict the ER status for those patients where the ER status is unknown. To this end, I used the GEO metadata downloaded in Downloading GEO data and the cluster assignments obtained in the report Identification of breast cancer subtypes and biomarkers using the Seurat workflow.
Workflow
I applied three supervised learning methods (logistic regression, random forest, support vector machine) and optimized their hyperparameters with a grid search, using balanced accuracy as the evaluation metric.
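As a quick illustration of why balanced accuracy is preferred over plain accuracy on imbalanced labels, here is a minimal sketch with made-up labels (not the study data):

from sklearn.metrics import balanced_accuracy_score
y_true = ['ER+'] * 8 + ['ER-'] * 2
y_hat = ['ER+'] * 10                    # a classifier that always predicts the majority class
balanced_accuracy_score(y_true, y_hat)  # 0.5, although plain accuracy would be 0.8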
To handle missing data, I used KNNImputer, which imputes values based on the nearest neighbors in the feature space.
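A minimal sketch of this behaviour on toy data (not the study data):

toy = np.array([[1.0, 2.0], [3.0, 4.0], [np.nan, 6.0]])
KNNImputer(n_neighbors=2).fit_transform(toy)
# the NaN becomes the mean of the first feature over the 2 nearest rows: (1 + 3) / 2 = 2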
For feature selection, I applied mutual_info_classif to retain the most informative features with respect to the target variable (ER status).
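A toy sketch of how SelectKBest with mutual information ranks features (synthetic data, k=1; the first feature fully determines the label, the second carries no information):

X_demo = np.array([[0, 5], [0, 4], [1, 5], [1, 4]] * 10)
y_demo = np.array([0, 0, 1, 1] * 10)
SelectKBest(mutual_info_classif, k=1).fit(X_demo, y_demo).get_support()
# expected: array([ True, False])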
To address class imbalance, I set the parameter class_weight='balanced', ensuring that the model gives appropriate attention to the minority class during training.
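With this setting, sklearn weights each class by n_samples / (n_classes * class_count); a toy sketch with made-up labels:

from sklearn.utils.class_weight import compute_class_weight
y_toy = np.array(['ER+'] * 8 + ['ER-'] * 2)
compute_class_weight('balanced', classes=np.array(['ER+', 'ER-']), y=y_toy)
# array([0.625, 2.5]): the minority class gets a 4x larger weight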
Conclusion
In terms of balanced accuracy, random forest slightly outperformed logistic regression and SVM.
Exploring dataset
Loading dataset
df = pd.read_table('data/cluster_anno_seurat.csv', sep=',')
df = df[['patient', 'age_at_diagnosis', 'tumor_size',
         'lymph_node_status', 'er_status', 'pgr_status', 'her2_status',
         'pam50_subtype', 'overall_survival_days', 'overall_survival_event',
         'endocrine_treated', 'chemo_treated', 'ki67_status', 'nhg',
         'seurat_clusters']]
df = df.set_index('patient')
df = df.replace({'endocrine_treated': {'no treated': 'untreated'}})
df = df.replace({'chemo_treated': {'no treated': 'untreated'}})
df = df.replace({'overall_survival_event': {'no survival': 'nosurvival'}})
df
age_at_diagnosis | tumor_size | lymph_node_status | er_status | pgr_status | her2_status | pam50_subtype | overall_survival_days | overall_survival_event | endocrine_treated | chemo_treated | ki67_status | nhg | seurat_clusters | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
patient | ||||||||||||||
F1 | 43 | 9.0 | NodeNegative | NaN | NaN | HER2- | Basal | 2367 | nosurvival | untreated | treated | NaN | G3 | 5 |
F2 | 48 | 14.0 | NodePositive | ER+ | PgR+ | HER2- | LumA | 2367 | nosurvival | treated | treated | NaN | G2 | 0 |
F3 | 69 | 27.0 | NodePositive | ER+ | PgR+ | HER2- | LumB | 2168 | survival | treated | treated | NaN | G3 | 1 |
F4 | 39 | 51.0 | NodePositive | ER+ | NaN | HER2+ | LumA | 2416 | nosurvival | treated | treated | NaN | G3 | 4 |
F5 | 73 | 60.0 | NodePositive | ER+ | NaN | HER2- | Normal | 2389 | nosurvival | treated | untreated | NaN | G2 | 3 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
F3269 | 72 | 13.0 | NodeNegative | ER+ | PgR+ | HER2- | LumB | 856 | nosurvival | treated | treated | Ki67+ | G2 | 7 |
F3270 | 69 | 30.0 | NodePositive | ER+ | PgR+ | HER2- | LumA | 861 | nosurvival | treated | treated | Ki67+ | G2 | 8 |
F3271 | 73 | 18.0 | NodeNegative | ER+ | PgR- | NaN | LumB | 862 | nosurvival | treated | treated | Ki67+ | G3 | 4 |
F3272 | 67 | 33.0 | NodePositive | ER+ | PgR+ | HER2- | LumA | 844 | nosurvival | treated | untreated | Ki67+ | G2 | 0 |
F3273 | 60 | 17.0 | NodePositive | ER+ | PgR+ | HER2- | LumA | 843 | nosurvival | treated | treated | Ki67+ | G2 | 3 |
3273 rows × 14 columns
Missing data distribution
features = df.columns

df_miss = df[features].isnull().sum()
df_miss = df_miss[df_miss > 0].sort_values(ascending=False)
df_miss

fig, ax = plt.subplots(figsize=(10, 6))
df_miss.plot(kind='bar', ax=ax, color='skyblue', edgecolor='black')

plt.ylabel('Missing Values', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)

# Add value labels
for i, v in enumerate(df_miss.values):
    ax.text(i, v + 0.1, str(v), ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

missperc = (len(df_miss) / len(features)) * 100
print(f"Tot features: {len(features)}")
print(f"Tot features with missing values: {len(df_miss)} ({missperc:0.1f}%)")
print(f"Features with missing values: {[col for col in df_miss.index]}")
print(f"Features without missing values: {[col for col in df.columns if col not in df_miss.index]}")
Tot features: 14
Tot features with missing values: 9 (64.3%)
Features with missing values: ['ki67_status', 'pgr_status', 'er_status', 'her2_status', 'lymph_node_status', 'nhg', 'tumor_size', 'endocrine_treated', 'chemo_treated']
Features without missing values: ['age_at_diagnosis', 'pam50_subtype', 'overall_survival_days', 'overall_survival_event', 'seurat_clusters']
ER status distribution
fig, ax = plt.subplots(figsize=(10, 6))
df['er_status'].value_counts(dropna=False).plot(kind='bar', ax=ax, color='skyblue', edgecolor='black')

for i, v in enumerate(df['er_status'].value_counts(dropna=False)):
    ax.text(i, v + 0.1, str(v), ha='center', va='bottom', fontweight='bold')
Splitting data into training and test sets
y = df['er_status']
X_train, y_train = df.loc[~y.isna(), :].drop('er_status', axis=1), y[~y.isna()]
X_test, y_test = df.loc[y.isna(), :].drop('er_status', axis=1), y[y.isna()]
Imputation pipeline
def imputation_pipeline(X_train, classifier, nfeat=5):
    # identify feature types
    categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

    # build imputation pipeline
    # OrdinalEncoder: designed for feature encoding, accepts a 2d array (X)
    # LabelEncoder: designed for target encoding, accepts a 1d array (y)
    categorical_imputer = Pipeline(steps=[
        ('encoder', OrdinalEncoder()),
        ('imputer', KNNImputer(n_neighbors=50))
    ])
    numerical_imputer = Pipeline(steps=[
        ('imputer', KNNImputer(n_neighbors=20)),
        ('scaler', StandardScaler())
    ])
    imputation = ColumnTransformer(transformers=[
        ('categorical', categorical_imputer, categorical_features),
        ('numerical', numerical_imputer, numerical_features)
    ])

    # return pipeline
    return Pipeline(steps=[
        ('imputation', imputation),
        ('feature_selection', SelectKBest(mutual_info_classif, k=nfeat)),
        ('classifier', classifier)
    ])

    ## using SMOTE instead of class_weight='balanced'
    # from imblearn.over_sampling import SMOTE
    # from imblearn.pipeline import Pipeline as ImbPipeline
    # return ImbPipeline(steps=[
    #     ('imputation', imputation),
    #     ('smote', SMOTE(random_state=0)),  # add SMOTE after imputation
    #     ('classifier', classifier)         # remove class_weight='balanced' from classifiers
    # ])
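The encoder distinction noted in the comments above can be seen directly on toy labels (the resulting 0/1 codes also match the Label column in the prediction summaries below):

from sklearn.preprocessing import LabelEncoder
OrdinalEncoder().fit_transform([['ER+'], ['ER-'], ['ER+']])  # 2D features -> array([[0.], [1.], [0.]])
LabelEncoder().fit_transform(['ER+', 'ER-', 'ER+'])          # 1D target   -> array([0, 1, 0])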
Utility function to format results
## utils to show results
def show_results(fitted_pipeline, X_test, y_pred_proba):
    # recompute hard predictions so the function does not rely on a global y_pred
    y_pred = fitted_pipeline.predict(X_test)

    results = pd.DataFrame({
        'Patient_ID': X_test.index,
        f'Prob_{fitted_pipeline.classes_[0]}': y_pred_proba[:, 0],
        f'Prob_{fitted_pipeline.classes_[1]}': y_pred_proba[:, 1],
        'Max_Probability': np.max(y_pred_proba, axis=1),
        'Predicted_ER_Status': y_pred,
        'Confidence_Level': np.where(np.max(y_pred_proba, axis=1) > 0.8, 'High',
                                     np.where(np.max(y_pred_proba, axis=1) > 0.6, 'Medium',
                                              'Low'))
    }).set_index('Patient_ID')

    er_df = pd.DataFrame({
        'Label': {class_label: idx for idx, class_label in enumerate(fitted_pipeline.classes_)},
        'Count': pd.Series(y_pred).value_counts()
    }).rename_axis('ER_Status')

    confidence_df = pd.DataFrame({
        'Count': results['Confidence_Level'].value_counts(),
        'Percentage': (results['Confidence_Level'].value_counts(normalize=True) * 100).round(2)
    })

    # create side-by-side HTML
    html_str = f"""
    <div style="display: flex; gap: 50px;">
        <div>
            <h4>ER Status Distribution</h4>
            {er_df.to_html()}
        </div>
        <div>
            <h4>Confidence Level Distribution</h4>
            {confidence_df.to_html()}
        </div>
    </div>
    """
    return results, html_str
Logistic regression
model_pipeline = imputation_pipeline(X_train,
                                     classifier=LogisticRegression(random_state=0, max_iter=1000, class_weight='balanced'),
                                     nfeat=5)

cv_scores = cross_val_score(model_pipeline, X_train, y_train, cv=5, scoring='balanced_accuracy')
print(f"Cross-validation balanced accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
Cross-validation balanced accuracy: 0.952 (+/- 0.062)
# Fit the final model
model_pipeline.fit(X_train, y_train)
Pipeline(steps=[('imputation', ColumnTransformer(transformers=[('categorical', Pipeline(steps=[('encoder', OrdinalEncoder()), ('imputer', KNNImputer(n_neighbors=50))]), ['lymph_node_status', 'pgr_status', 'her2_status', 'pam50_subtype', 'overall_survival_event', 'endocrine_treated', 'chemo_treated', 'ki67_status', 'nhg']), ('numerical', Pipeline(steps=[('imputer', KNNImputer(n_neighbors=20)), ('scaler', StandardScaler())]), ['age_at_diagnosis', 'tumor_size', 'overall_survival_days', 'seurat_clusters'])])), ('feature_selection', SelectKBest(k=5, score_func=<function mutual_info_classif at 0x7f5f0cf41990>)), ('classifier', LogisticRegression(class_weight='balanced', max_iter=1000, random_state=0))])
# Make predictions on test set
y_pred = model_pipeline.predict(X_test)
y_pred_proba = model_pipeline.predict_proba(X_test)

results, html_str = show_results(model_pipeline, X_test, y_pred_proba)

display(results)
display(HTML(html_str))
Prob_ER+ | Prob_ER- | Max_Probability | Predicted_ER_Status | Confidence_Level | |
---|---|---|---|---|---|
Patient_ID | |||||
F1 | 0.017625 | 0.982375 | 0.982375 | ER- | High |
F6 | 0.017625 | 0.982375 | 0.982375 | ER- | High |
F15 | 0.416440 | 0.583560 | 0.583560 | ER- | Low |
F16 | 0.017625 | 0.982375 | 0.982375 | ER- | High |
F27 | 0.026501 | 0.973499 | 0.973499 | ER- | High |
... | ... | ... | ... | ... | ... |
F2962 | 0.819012 | 0.180988 | 0.819012 | ER+ | High |
F2965 | 0.013485 | 0.986515 | 0.986515 | ER- | High |
F2967 | 0.987457 | 0.012543 | 0.987457 | ER+ | High |
F2976 | 0.982419 | 0.017581 | 0.982419 | ER+ | High |
F3242 | 0.030064 | 0.969936 | 0.969936 | ER- | High |
200 rows × 5 columns
ER Status Distribution
Label | Count | |
---|---|---|
ER_Status | ||
ER+ | 0 | 33 |
ER- | 1 | 167 |
Confidence Level Distribution
Count | Percentage | |
---|---|---|
Confidence_Level | ||
High | 174 | 87.0 |
Medium | 18 | 9.0 |
Low | 8 | 4.0 |
Feature selection
The top 5 features used in the model are shown in blue, the discarded ones in orange.
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
all_features = np.array(categorical_features + numerical_features)

feature_selector = model_pipeline.named_steps['feature_selection']
feature_index = feature_selector.get_support(indices=True)
selected_feature = all_features[feature_index]

feature_importance = pd.DataFrame({
    'Feature': all_features,
    'Score': feature_selector.scores_,
    'Selected': feature_selector.get_support()
}).sort_values('Score', ascending=False)
# display(feature_importance)

plt.figure(figsize=(10, 6))
colors = ['tab:blue' if selected else 'tab:orange' for selected in feature_importance['Selected']]
plt.barh(feature_importance['Feature'], feature_importance['Score'], color=colors)
plt.xlabel('Feature Score')
plt.gca().invert_yaxis()

# fig, ax = plt.subplots(figsize=(10, 6))
# colors = ['tab:green' if selected else 'tab:orange' for selected in feature_importance['Selected']]
# ax.barh(feature_importance['Feature'], feature_importance['Score'], color=colors)
# ax.set_xlabel('Feature Score')
# ax.invert_yaxis()
Parameter tuning
%%time
model_pipeline = imputation_pipeline(X_train,
                                     classifier=LogisticRegression(random_state=0, max_iter=1000, class_weight='balanced'),
                                     nfeat=5)

param_grid = [
    {'classifier__C': [0.01, 0.1, 1, 10, 100],
     'classifier__penalty': ['l2'],
     'classifier__solver': ['lbfgs', 'newton-cholesky']},
    {'classifier__C': [0.01, 0.1, 1, 10, 100],
     'classifier__penalty': ['l1', 'l2'],
     'classifier__solver': ['liblinear', 'saga']},
    {'classifier__C': [0.01, 0.1, 1, 10, 100],
     'classifier__penalty': ['elasticnet'],
     'classifier__solver': ['saga'],  # saga is the only solver that supports elasticnet
     'classifier__l1_ratio': [0.1, 0.5, 0.7, 0.9]}
]

print("Starting Grid Search for Logistic Regression...")
grid_lr = GridSearchCV(
    model_pipeline,
    param_grid,
    cv=5,
    scoring='balanced_accuracy',
    n_jobs=-1,
    verbose=1
)

# Fit the grid search
grid_lr.fit(X_train, y_train)

# Display results
print("\nBest parameters:")
print(grid_lr.best_params_)
print(f"\nAveraged cross-validation balanced accuracy score: {grid_lr.best_score_:.3f}")
Starting Grid Search for Logistic Regression...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters:
{'classifier__C': 100, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
Averaged cross-validation balanced accuracy score: 0.955
CPU times: user 4.77 s, sys: 612 ms, total: 5.39 s
Wall time: 27.8 s
# Use the best model for predictions
best_model_lr = grid_lr.best_estimator_
y_pred = best_model_lr.predict(X_test)
y_pred_proba = best_model_lr.predict_proba(X_test)

results_lr, html_str = show_results(best_model_lr, X_test, y_pred_proba)

display(results_lr)
display(HTML(html_str))
Prob_ER+ | Prob_ER- | Max_Probability | Predicted_ER_Status | Confidence_Level | |
---|---|---|---|---|---|
Patient_ID | |||||
F1 | 0.015638 | 0.984362 | 0.984362 | ER- | High |
F6 | 0.015638 | 0.984362 | 0.984362 | ER- | High |
F15 | 0.403025 | 0.596975 | 0.596975 | ER- | Low |
F16 | 0.015638 | 0.984362 | 0.984362 | ER- | High |
F27 | 0.023326 | 0.976674 | 0.976674 | ER- | High |
... | ... | ... | ... | ... | ... |
F2962 | 0.827821 | 0.172179 | 0.827821 | ER+ | High |
F2965 | 0.011873 | 0.988127 | 0.988127 | ER- | High |
F2967 | 0.988520 | 0.011480 | 0.988520 | ER+ | High |
F2976 | 0.984569 | 0.015431 | 0.984569 | ER+ | High |
F3242 | 0.026950 | 0.973050 | 0.973050 | ER- | High |
200 rows × 5 columns
ER Status Distribution
Label | Count | |
---|---|---|
ER_Status | ||
ER+ | 0 | 32 |
ER- | 1 | 168 |
Confidence Level Distribution
Count | Percentage | |
---|---|---|
Confidence_Level | ||
High | 175 | 87.5 |
Medium | 16 | 8.0 |
Low | 9 | 4.5 |
Comparing tuned and untuned models
The tuned model slightly outperformed the untuned one.
print("Default model parameters:", model_pipeline.named_steps['classifier'].get_params())
print("Best tuned parameters:", best_model_lr.named_steps['classifier'].get_params())
# Compare probability distributions
print("\nDefault model confidence distribution:")
print(results['Confidence_Level'].value_counts())
print("\nTuned model confidence distribution:")
print(results_lr['Confidence_Level'].value_counts())
## compare averaged cross-validated balanced accuracy
print(f"\nAveraged cross-validated balanced accuracy (untuned model): {cv_scores.mean():.3f}")
print(f"Averaged cross-validation balanced accuracy score (tuned model): {grid_lr.best_score_:.3f}")
Default model parameters: {'C': 1.0, 'class_weight': 'balanced', 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': 0, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Best tuned parameters: {'C': 100, 'class_weight': 'balanced', 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': 0, 'solver': 'saga', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Default model confidence distribution:
Confidence_Level
High 174
Medium 18
Low 8
Name: count, dtype: int64
Tuned model confidence distribution:
Confidence_Level
High 175
Medium 16
Low 9
Name: count, dtype: int64
Averaged cross-validated balanced accuracy (untuned model): 0.952
Averaged cross-validation balanced accuracy score (tuned model): 0.955
Random Forest
%%time
model_pipeline = imputation_pipeline(X_train,
                                     classifier=RandomForestClassifier(random_state=0, class_weight='balanced'),
                                     nfeat=5)

param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [5, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10, 20],
    'classifier__min_samples_leaf': [1, 2, 4, 8],
    'classifier__max_features': ['sqrt', 'log2']
}

# from sklearn.tree import DecisionTreeClassifier
# DecisionTreeClassifier(random_state=0)
# param_grid = {
#     'classifier__max_depth': [5, 10, 20, 30],
#     'classifier__min_samples_split': [2, 5, 10, 20],
#     'classifier__min_samples_leaf': [1, 2, 4, 8],
#     'classifier__criterion': ['gini', 'entropy'],
#     'classifier__max_features': ['sqrt', 'log2']
# }

print("Starting Grid Search for Random Forest...")
grid_rf = GridSearchCV(
    model_pipeline,
    param_grid,
    cv=5,
    scoring='balanced_accuracy',
    n_jobs=-1,
    verbose=1
)

# Fit the grid search
grid_rf.fit(X_train, y_train)

# Display results
print("\nBest parameters:")
print(grid_rf.best_params_)
print(f"\nAveraged cross-validation balanced accuracy score: {grid_rf.best_score_:.3f}")
Starting Grid Search for Random Forest...
Fitting 5 folds for each of 384 candidates, totalling 1920 fits
Best parameters:
{'classifier__max_depth': 5, 'classifier__max_features': 'log2', 'classifier__min_samples_leaf': 8, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 50}
Averaged cross-validation balanced accuracy score: 0.963
CPU times: user 19 s, sys: 954 ms, total: 19.9 s
Wall time: 3min 27s
# Use the best model for predictions
best_model_rf = grid_rf.best_estimator_
y_pred = best_model_rf.predict(X_test)
y_pred_proba = best_model_rf.predict_proba(X_test)

results_rf, html_str = show_results(best_model_rf, X_test, y_pred_proba)

display(results_rf)
display(HTML(html_str))
Prob_ER+ | Prob_ER- | Max_Probability | Predicted_ER_Status | Confidence_Level | |
---|---|---|---|---|---|
Patient_ID | |||||
F1 | 0.081427 | 0.918573 | 0.918573 | ER- | High |
F6 | 0.081427 | 0.918573 | 0.918573 | ER- | High |
F15 | 0.790802 | 0.209198 | 0.790802 | ER+ | Medium |
F16 | 0.081427 | 0.918573 | 0.918573 | ER- | High |
F27 | 0.105113 | 0.894887 | 0.894887 | ER- | High |
... | ... | ... | ... | ... | ... |
F2962 | 0.837857 | 0.162143 | 0.837857 | ER+ | High |
F2965 | 0.080274 | 0.919726 | 0.919726 | ER- | High |
F2967 | 0.999465 | 0.000535 | 0.999465 | ER+ | High |
F2976 | 0.999871 | 0.000129 | 0.999871 | ER+ | High |
F3242 | 0.163633 | 0.836367 | 0.836367 | ER- | High |
200 rows × 5 columns
ER Status Distribution
Label | Count | |
---|---|---|
ER_Status | ||
ER+ | 0 | 38 |
ER- | 1 | 162 |
Confidence Level Distribution
Count | Percentage | |
---|---|---|
Confidence_Level | ||
High | 176 | 88.0 |
Medium | 23 | 11.5 |
Low | 1 | 0.5 |
SVM
%%time
model_pipeline = imputation_pipeline(X_train,
                                     classifier=SVC(probability=True, class_weight='balanced'),
                                     nfeat=5)

param_grid = [
    # Grid for linear kernel
    {'classifier__kernel': ['linear'],
     'classifier__C': [0.01, 0.1, 1, 10, 100]},
    # Grid for RBF kernel
    {'classifier__kernel': ['rbf'],
     'classifier__C': [0.01, 0.1, 1, 10, 100],
     'classifier__gamma': [0.01, 0.1, 1, 10, 100]},
    # Grid for polynomial kernel
    {'classifier__kernel': ['poly'],
     'classifier__C': [0.01, 0.1, 1, 10, 100],
     'classifier__degree': [2, 3, 4, 5]}
]

print("Starting Grid Search for SVM...")
grid_svm = GridSearchCV(
    model_pipeline,
    param_grid,
    cv=5,
    scoring='balanced_accuracy',
    n_jobs=-1,
    verbose=1
)

# Fit the grid search
grid_svm.fit(X_train, y_train)

# Display results
print("\nBest parameters:")
print(grid_svm.best_params_)
print(f"\nAveraged cross-validation balanced accuracy score: {grid_svm.best_score_:.3f}")
Starting Grid Search for SVM...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters:
{'classifier__C': 0.1, 'classifier__gamma': 0.1, 'classifier__kernel': 'rbf'}
Averaged cross-validation balanced accuracy score: 0.961
CPU times: user 5.21 s, sys: 419 ms, total: 5.63 s
Wall time: 40.1 s
# Use the best model for predictions
best_model_svm = grid_svm.best_estimator_
y_pred = best_model_svm.predict(X_test)
y_pred_proba = best_model_svm.predict_proba(X_test)

results_svm, html_str = show_results(best_model_svm, X_test, y_pred_proba)

display(results_svm)
display(HTML(html_str))
Prob_ER+ | Prob_ER- | Max_Probability | Predicted_ER_Status | Confidence_Level | |
---|---|---|---|---|---|
Patient_ID | |||||
F1 | 0.144080 | 0.855920 | 0.855920 | ER- | High |
F6 | 0.144080 | 0.855920 | 0.855920 | ER- | High |
F15 | 0.972072 | 0.027928 | 0.972072 | ER+ | High |
F16 | 0.144080 | 0.855920 | 0.855920 | ER- | High |
F27 | 0.370919 | 0.629081 | 0.629081 | ER- | Medium |
... | ... | ... | ... | ... | ... |
F2962 | 0.979503 | 0.020497 | 0.979503 | ER+ | High |
F2965 | 0.126436 | 0.873564 | 0.873564 | ER- | High |
F2967 | 0.999174 | 0.000826 | 0.999174 | ER+ | High |
F2976 | 0.999446 | 0.000554 | 0.999446 | ER+ | High |
F3242 | 0.380777 | 0.619223 | 0.619223 | ER- | Medium |
200 rows × 5 columns
ER Status Distribution
Label | Count | |
---|---|---|
ER_Status | ||
ER+ | 0 | 27 |
ER- | 1 | 173 |
Confidence Level Distribution
Count | Percentage | |
---|---|---|
Confidence_Level | ||
High | 147 | 73.5 |
Medium | 40 | 20.0 |
Low | 13 | 6.5 |
Results
Below we populate the original dataset with the predicted ER status and the imputed values obtained from the best model (random forest), selected according to the averaged balanced accuracy.
pd.DataFrame({'Model': ['Logistic Regression',
                        'Random Forest',
                        'SVM'],
              'Avg_Balanced_Accuracy': [f"{grid_lr.best_score_:.3f}",
                                        f"{grid_rf.best_score_:.3f}",
                                        f"{grid_svm.best_score_:.3f}"],
              }).set_index('Model').sort_values('Avg_Balanced_Accuracy', ascending=False)
Avg_Balanced_Accuracy | |
---|---|
Model | |
Random Forest | 0.963 |
SVM | 0.961 |
Logistic Regression | 0.955 |
# get imputer for the best model
imputer = best_model_rf.named_steps['imputation']

# get fitted imputations
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

# split imputed data back into numerical and categorical parts
# nb: the slicing order is defined by ColumnTransformer in imputation_pipeline() -> categorical features come before numerical features
cat_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
num_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

n_cat = len(cat_features)
n_num = len(num_features)

X_train_cat = X_train_imputed[:, :n_cat]
X_train_num = X_train_imputed[:, n_cat:n_num+n_cat]

X_test_cat = X_test_imputed[:, :n_cat]
X_test_num = X_test_imputed[:, n_cat:n_num+n_cat]

# get the ordinal encoder from the categorical transformer
categorical_transformer = imputer.named_transformers_['categorical']
ordinal_encoder = categorical_transformer.named_steps['encoder']

# inverse transform categorical features to get original labels
X_train_cat = ordinal_encoder.inverse_transform(X_train_cat)
X_test_cat = ordinal_encoder.inverse_transform(X_test_cat)

# create dataframes with original categorical values and processed numerical values
X_train_imputed = pd.DataFrame(
    np.column_stack([X_train_cat, X_train_num]),
    columns=cat_features + num_features,
    index=X_train.index
)
X_train_imputed = X_train_imputed[X_train.columns]

X_test_imputed = pd.DataFrame(
    np.column_stack([X_test_cat, X_test_num]),
    columns=cat_features + num_features,
    index=X_test.index
)
X_test_imputed = X_test_imputed[X_test.columns]

## NaN comparison
# print("Training set - original data:")
# display(X_train[X_train.isnull().any(axis=1)])
# print("\nTraining set - after imputation:")
# display(X_train_imputed[X_train.isnull().any(axis=1)])
# print("\nTest set - original data:")
# display(X_test[X_test.isnull().any(axis=1)])
# print("\nTest set - after imputation:")
# display(X_test_imputed[X_test.isnull().any(axis=1)])
##

## complete dataset with the predicted ER status and the imputed values
pred_er = results_rf[['Predicted_ER_Status']].rename(columns={'Predicted_ER_Status': 'er_status'})
pred_er.index.name = 'patient'
X_test_complete = pd.concat([X_test_imputed, pred_er], axis=1)

train_er = df.loc[~y.isna(), ['er_status']]
X_train_complete = pd.concat([X_train_imputed, train_er], axis=1)

display(pd.concat([X_train_complete, X_test_complete]).reindex(df.index))

## check
print("\nTot. missing values including training and test:")
print("Before imputation:", X_train.isnull().sum().sum() + X_test.isnull().sum().sum())
print("After imputation:", X_train_imputed.isnull().sum().sum() + X_test_imputed.isnull().sum().sum())
age_at_diagnosis | tumor_size | lymph_node_status | pgr_status | her2_status | pam50_subtype | overall_survival_days | overall_survival_event | endocrine_treated | chemo_treated | ki67_status | nhg | seurat_clusters | er_status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
patient | ||||||||||||||
F1 | -1.51916 | -0.904251 | NodeNegative | PgR+ | HER2- | Basal | 1.604754 | nosurvival | untreated | treated | Ki67+ | G3 | 0.668592 | ER- |
F2 | -1.135531 | -0.482241 | NodePositive | PgR+ | HER2- | LumA | 1.604754 | nosurvival | treated | treated | Ki67+ | G2 | -1.249785 | ER+ |
F3 | 0.47571 | 0.614988 | NodePositive | PgR+ | HER2- | LumB | 1.197026 | survival | treated | treated | Ki67+ | G3 | -0.86611 | ER+ |
F4 | -1.826063 | 2.64064 | NodePositive | PgR+ | HER2+ | LumA | 1.705149 | nosurvival | treated | treated | Ki67+ | G3 | 0.284916 | ER+ |
F5 | 0.782613 | 3.40026 | NodePositive | PgR+ | HER2- | Normal | 1.64983 | nosurvival | treated | untreated | Ki67+ | G2 | -0.098759 | ER+ |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
F3269 | 0.705887 | -0.566643 | NodeNegative | PgR+ | HER2- | LumB | -1.491111 | nosurvival | treated | treated | Ki67+ | G2 | 1.435942 | ER+ |
F3270 | 0.47571 | 0.868194 | NodePositive | PgR+ | HER2- | LumA | -1.480866 | nosurvival | treated | treated | Ki67+ | G2 | 1.819618 | ER+ |
F3271 | 0.782613 | -0.144632 | NodeNegative | PgR- | HER2+ | LumB | -1.478817 | nosurvival | treated | treated | Ki67+ | G3 | 0.284916 | ER+ |
F3272 | 0.322258 | 1.121401 | NodePositive | PgR+ | HER2- | LumA | -1.515697 | nosurvival | treated | untreated | Ki67+ | G2 | -1.249785 | ER+ |
F3273 | -0.214822 | -0.229034 | NodePositive | PgR+ | HER2- | LumA | -1.517746 | nosurvival | treated | treated | Ki67+ | G2 | -0.098759 | ER+ |
3273 rows × 14 columns
Tot. missing values including training and test:
Before imputation: 2407
After imputation: 0
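If the completed table is needed in downstream reports, it can be written out; a minimal sketch (the output filename is hypothetical):

df_complete = pd.concat([X_train_complete, X_test_complete]).reindex(df.index)
df_complete.to_csv('data/cluster_anno_seurat_imputed.csv')  # hypothetical output path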