diff --git a/.gitignore b/.gitignore
index 16f2dc5fa95ef368c3f09eaca24f75cb45102d3e..84ea3c96eb7a58584f887c94057453c0fddc7c2f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,156 @@
-*.csv
\ No newline at end of file
+*.csv
+
+
+# Created by https://www.toptal.com/developers/gitignore/api/python
+# Edit at https://www.toptal.com/developers/gitignore?templates=python
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+pytestdebug.log
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+doc/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#poetry.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+# .env
+.env/
+.venv/
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+pythonenv*
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# operating system-related files
+# file properties cache/storage on macOS
+*.DS_Store
+# thumbnail cache on Windows
+Thumbs.db
+
+# profiling data
+.prof
+
+
+# End of https://www.toptal.com/developers/gitignore/api/python
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0c4d21b410ccce1b05d169b7a3d4816b4559b557
--- /dev/null
+++ b/README.md
@@ -0,0 +1,30 @@
+# COVID-19_RISK_PREDICTOR
+A model that takes in demographics, lifestyle, and symptoms/conditions to predict COVID-19 risk for patients.
+
+### Directory structure used to read parsed data
+
+```
+encoded-2-week-filter.csv
+```
+
+### Create the conda environment before running anything
+```
+conda env create -n rico -f configs/environment.yaml
+conda activate rico
+```
+### Run model training
+```
+python src/Model.py
+```
+
+Output files are saved in the top-level directory.
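+
+Outputs written by `src/Model.py` (these filenames are hard-coded in the script):
+
+```
+2week_training_roc.png
+2week_testing_roc.png
+2week_dist.png
+2week_lasso_card_df.csv
+scores_lasso_2week.csv
+```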
diff --git a/configs/environment.yaml b/configs/environment.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a0cd1c657ed7b3b8581df0b76e8b94fbe548268a
--- /dev/null
+++ b/configs/environment.yaml
@@ -0,0 +1,15 @@
+name: rico
+channels:
+  - conda-forge
+  - defaults
+dependencies:
+  - pip  # pip itself must be a conda dependency for the pip: section below
+  - pip:
+    - matplotlib==3.3.4
+    - researchpy==0.2.3
+    - scorecardpy==0.1.9.2
+    - xverse==1.0.5
+    - scikit-learn==0.23.2
+    - pandas==1.2.1
+    - numpy==1.19.2
+
diff --git a/filter_tested.py b/filter_tested.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/notes.md b/notes.md
deleted file mode 100644
index 41f0a57d3135a873c37b392501ca0c39504e4199..0000000000000000000000000000000000000000
--- a/notes.md
+++ /dev/null
@@ -1,12 +0,0 @@
-########### 6/11/20 ########################
-
-Project - To develop a model that takes in symptoms/comditions and COVID test results and predicts/scores patients with risk. We might also develop an app giving something like a credit score to patients with risk.
-
-Data - N3C data downloaded from i2b2.
-
-################## Data processing ######################################################################
-
-First filtering out patients who are tested for coronavirus
-Files involved -
- measurement.csv - test and results
- condition_occurance.csv
\ No newline at end of file
diff --git a/src/Model.py b/src/Model.py
new file mode 100644
index 0000000000000000000000000000000000000000..3da60a5bbc24c00af9936e96dab2b6620ef23d0a
--- /dev/null
+++ b/src/Model.py
@@ -0,0 +1,409 @@
+#!/usr/bin/env python
+
+# libraries
+import pandas as pd
+import numpy as np
+import xverse
+from xverse.transformer import WOE
+import scorecardpy as sc
+import sklearn.metrics
+from sklearn.model_selection import train_test_split, StratifiedKFold
+import matplotlib
+matplotlib.use("Agg")  # non-interactive backend; all figures are written to files
+from matplotlib import pyplot
+from joblib import dump, load
+
+# Functions for computing an AUC CI using DeLong's method
+
+"""
+AUC DeLong CI
+
+Created on Tue Nov 6 10:06:52 2018
+
+@author: yandexdataschool
+
+Original Code found in:
+https://github.com/yandexdataschool/roc_comparison
+
+updated: Raul Sanchez-Vazquez
+"""
+
+import scipy.stats
+from scipy import stats
+
+# AUC comparison adapted from
+# https://github.com/Netflix/vmaf/
+def compute_midrank(x):
+    """Computes midranks.
+    Args:
+       x - a 1D numpy array
+    Returns:
+       array of midranks
+    """
+    J = np.argsort(x)
+    Z = x[J]
+    N = len(x)
+    T = np.zeros(N, dtype=float)
+    i = 0
+    while i < N:
+        j = i
+        while j < N and Z[j] == Z[i]:
+            j += 1
+        T[i:j] = 0.5*(i + j - 1)
+        i = j
+    T2 = np.empty(N, dtype=float)
+    # Note(kazeevn) +1 is due to Python using 0-based indexing
+    # instead of 1-based in the AUC formula in the paper
+    T2[J] = T + 1
+    return T2
+
+
+def compute_midrank_weight(x, sample_weight):
+    """Computes midranks.
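+    Weighted variant: tied values share the mean cumulative sample weight.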
+    Args:
+       x - a 1D numpy array
+    Returns:
+       array of midranks
+    """
+    J = np.argsort(x)
+    Z = x[J]
+    cumulative_weight = np.cumsum(sample_weight[J])
+    N = len(x)
+    T = np.zeros(N, dtype=float)
+    i = 0
+    while i < N:
+        j = i
+        while j < N and Z[j] == Z[i]:
+            j += 1
+        T[i:j] = cumulative_weight[i:j].mean()
+        i = j
+    T2 = np.empty(N, dtype=float)
+    T2[J] = T
+    return T2
+
+
+def fastDeLong(predictions_sorted_transposed, label_1_count, sample_weight):
+    if sample_weight is None:
+        return fastDeLong_no_weights(predictions_sorted_transposed, label_1_count)
+    else:
+        return fastDeLong_weights(predictions_sorted_transposed, label_1_count, sample_weight)
+
+
+def fastDeLong_weights(predictions_sorted_transposed, label_1_count, sample_weight):
+    """
+    The fast version of DeLong's method for computing the covariance of
+    unadjusted AUC.
+    Args:
+       predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
+          sorted such as the examples with label "1" are first
+    Returns:
+       (AUC value, DeLong covariance)
+    Reference:
+     @article{sun2014fast,
+       title={Fast Implementation of DeLong's Algorithm for
+              Comparing the Areas Under Correlated Receiver Operating Characteristic Curves},
+       author={Xu Sun and Weichao Xu},
+       journal={IEEE Signal Processing Letters},
+       volume={21},
+       number={11},
+       pages={1389--1393},
+       year={2014},
+       publisher={IEEE}
+     }
+    """
+    # Short variables are named as they are in the paper
+    m = label_1_count
+    n = predictions_sorted_transposed.shape[1] - m
+    positive_examples = predictions_sorted_transposed[:, :m]
+    negative_examples = predictions_sorted_transposed[:, m:]
+    k = predictions_sorted_transposed.shape[0]
+
+    tx = np.empty([k, m], dtype=float)
+    ty = np.empty([k, n], dtype=float)
+    tz = np.empty([k, m + n], dtype=float)
+    for r in range(k):
+        tx[r, :] = compute_midrank_weight(positive_examples[r, :], sample_weight[:m])
+        ty[r, :] = compute_midrank_weight(negative_examples[r, :], sample_weight[m:])
+        tz[r, :] = compute_midrank_weight(predictions_sorted_transposed[r, :], sample_weight)
+    total_positive_weights = sample_weight[:m].sum()
+    total_negative_weights = sample_weight[m:].sum()
+    pair_weights = np.dot(sample_weight[:m, np.newaxis], sample_weight[np.newaxis, m:])
+    total_pair_weights = pair_weights.sum()
+    aucs = (sample_weight[:m]*(tz[:, :m] - tx)).sum(axis=1) / total_pair_weights
+    v01 = (tz[:, :m] - tx[:, :]) / total_negative_weights
+    v10 = 1. - (tz[:, m:] - ty[:, :]) / total_positive_weights
+    sx = np.cov(v01)
+    sy = np.cov(v10)
+    delongcov = sx / m + sy / n
+    return aucs, delongcov
+
+
+def fastDeLong_no_weights(predictions_sorted_transposed, label_1_count):
+    """
+    The fast version of DeLong's method for computing the covariance of
+    unadjusted AUC.
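+    Unweighted variant; midranks replace the O(m*n) all-pairs comparison.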
+    Args:
+       predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
+          sorted such as the examples with label "1" are first
+    Returns:
+       (AUC value, DeLong covariance)
+    Reference:
+     @article{sun2014fast,
+       title={Fast Implementation of DeLong's Algorithm for
+              Comparing the Areas Under Correlated Receiver Operating
+              Characteristic Curves},
+       author={Xu Sun and Weichao Xu},
+       journal={IEEE Signal Processing Letters},
+       volume={21},
+       number={11},
+       pages={1389--1393},
+       year={2014},
+       publisher={IEEE}
+     }
+    """
+    # Short variables are named as they are in the paper
+    m = label_1_count
+    n = predictions_sorted_transposed.shape[1] - m
+    positive_examples = predictions_sorted_transposed[:, :m]
+    negative_examples = predictions_sorted_transposed[:, m:]
+    k = predictions_sorted_transposed.shape[0]
+
+    tx = np.empty([k, m], dtype=float)
+    ty = np.empty([k, n], dtype=float)
+    tz = np.empty([k, m + n], dtype=float)
+    for r in range(k):
+        tx[r, :] = compute_midrank(positive_examples[r, :])
+        ty[r, :] = compute_midrank(negative_examples[r, :])
+        tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :])
+    aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
+    v01 = (tz[:, :m] - tx[:, :]) / n
+    v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m
+    sx = np.cov(v01)
+    sy = np.cov(v10)
+    delongcov = sx / m + sy / n
+    return aucs, delongcov
+
+
+def calc_pvalue(aucs, sigma):
+    """Computes log10 of the p-value for comparing two AUCs.
+    Args:
+       aucs: 1D array of AUCs
+       sigma: AUC DeLong covariances
+    Returns:
+       log10(pvalue)
+    """
+    contrast = np.array([[1, -1]])  # contrast vector for the AUC difference
+    z = np.abs(np.diff(aucs)) / np.sqrt(np.dot(np.dot(contrast, sigma), contrast.T))
+    return np.log10(2) + scipy.stats.norm.logsf(z, loc=0, scale=1) / np.log(10)
+
+
+def compute_ground_truth_statistics(ground_truth, sample_weight):
+    assert np.array_equal(np.unique(ground_truth), [0, 1])
+    order = (-ground_truth).argsort()
+    label_1_count = int(ground_truth.sum())
+    if sample_weight is None:
+        ordered_sample_weight = None
+    else:
+        ordered_sample_weight = sample_weight[order]
+
+    return order, label_1_count, ordered_sample_weight
+
+
+def delong_roc_variance(ground_truth, predictions, sample_weight=None):
+    """
+    Computes ROC AUC variance for a single set of predictions
+    Args:
+       ground_truth: np.array of 0 and 1
+       predictions: np.array of floats of the probability of being class 1
+    """
+    order, label_1_count, ordered_sample_weight = compute_ground_truth_statistics(
+        ground_truth, sample_weight)
+    predictions_sorted_transposed = predictions[np.newaxis, order]
+    aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count, ordered_sample_weight)
+    assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers"
+    return aucs[0], delongcov
+
+
+
+# Data setup
+# Read the data, filter variables on missingness and identical-value limits,
+# split into train/test, and apply the WoE transform.
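+#
+# Weight of evidence (WoE) replaces each bin with the log ratio of its
+# event and non-event frequencies, so the logistic fit below maps
+# directly onto per-bin scorecard points.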
+
+# load data
+encoded = pd.read_csv('encoded-2-week-filter.csv')
+
+encoded = encoded.drop(['PERSON_ID'], axis=1)
+
+# filter variables by missing rate, information value (iv), and identical-value rate
+encoded_f = sc.var_filter(encoded
+                          , y="class"
+                          , positive='negative'  # treat the 'negative' test result as the event class
+                          , identical_limit=0.95
+                          , iv_limit=0
+                          , missing_limit=0.95
+                          , return_rm_reason=False  # True would instead return a dict of two dfs (data + removal reasons)
+                          , var_kp=['f_R06'
+                                    , 'f_R05'
+                                    , 'f_R50'
+                                    , 'f_R53'
+                                    , 'f_M79'
+                                    , 'f_R09'
+                                    , 'f_R51'
+                                    , 'f_J44'
+                                    , 'f_E11'
+                                    , 'f_I25'
+                                    , 'f_I10'
+                                    ]
+                          , var_rm=[
+                                    'f_BMI-unknown'
+                                    , 'f_Unknown'
+                                   ]
+                          )
+
+# breaking dt into train and test
+train, test = sc.split_df(encoded_f, 'class').values()
+
+# woe binning ------
+bins = sc.woebin(encoded_f, y="class")
+
+# converting train and test into woe values
+train_woe = sc.woebin_ply(train, bins)
+test_woe = sc.woebin_ply(test, bins)
+
+# get xs and ys
+y_train = train_woe.loc[:, 'class']
+X_train = train_woe.loc[:, train_woe.columns != 'class']
+y_test = test_woe.loc[:, 'class']
+X_test = test_woe.loc[:, train_woe.columns != 'class']
+
+# Lasso-based regression
+# Determine the regularization strength for the lasso (L1) penalty via
+# 10-fold cross-validation, get predictions from the best model, score, and build the scorecard.
+
+# logistic regression ------
+# lasso
+from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
+lasso_cv = LogisticRegressionCV(penalty='l1'
+                                , Cs=100
+                                , solver='saga'  # saga supports the l1 penalty
+                                , cv=StratifiedKFold(10)
+                                , n_jobs=-1
+                                , max_iter=10000
+                                , scoring='neg_log_loss'
+                                , class_weight='balanced'
+                                )
+lasso_cv.fit(X_train, y_train)
+
+# plot training ROC
+sklearn.metrics.plot_roc_curve(lasso_cv, X_train, y_train)
+pyplot.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
+pyplot.title('2-Week LASSO Training ROC')
+axes = pyplot.gca()
+axes.set_facecolor("white")
+axes.set_clip_on(False)
+pyplot.savefig('2week_training_roc.png')
+
+# plot testing ROC
+sklearn.metrics.plot_roc_curve(lasso_cv, X_test, y_test)
+pyplot.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
+pyplot.title('2-Week LASSO Testing ROC')
+axes = pyplot.gca()
+axes.set_facecolor("white")
+axes.set_clip_on(False)
+pyplot.savefig('2week_testing_roc.png')
+
+# predicted probability
+train_pred = lasso_cv.predict_proba(X_train)[:, 1]
+train_pred_class = lasso_cv.predict(X_train)
+test_pred = lasso_cv.predict_proba(X_test)[:, 1]
+test_pred_class = lasso_cv.predict(X_test)
+
+# Make scorecard
+card = sc.scorecard(bins, lasso_cv, X_train.columns)
+# credit score
+train_score = sc.scorecard_ply(train, card, print_step=0)
+test_score = sc.scorecard_ply(test, card, print_step=0)
+
+
+# psi (population stability index between train and test scores)
+pyplot.rcParams["font.size"] = "18"
+fig = sc.perf_psi(
+  score = {'train':train_score, 'test':test_score},
+  label = {'train':y_train, 'test':y_test},
+  x_tick_break=50
+)
+fig['pic']['score'].set_size_inches(18.5, 10.5)
+
+fig['pic']['score'].savefig('2week_dist.png')
+
+card_df = pd.concat(card)
+card_df.to_csv('2week_lasso_card_df.csv')
+
+scores_lasso_2week = sc.scorecard_ply(encoded, card, only_total_score=True, print_step=0, replace_blank_na=True)
+scores_lasso_2week.to_csv('scores_lasso_2week.csv')
+
+# Training Metrics and AUC CI
+print("Training Metrics")
+# calculate accuracy
+acc = sklearn.metrics.accuracy_score(y_train, train_pred_class)
+print('Accuracy: %.3f' % acc)
+auc_score = sklearn.metrics.roc_auc_score(y_train, train_pred)
+print('AUC: %.3f' % auc_score)
+f_score = sklearn.metrics.f1_score(y_train, train_pred_class)
+print('FS: %.3f' % f_score)
+
+# delong ci
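+# DeLong's method estimates var(AUC); the interval below is the Wald CI
+# AUC +/- z_{0.975} * sqrt(var), with the upper bound clipped at 1.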
+delong_alpha = 0.95
+auc, auc_cov = delong_roc_variance(
+    np.ravel(y_train),
+    np.ravel(train_pred))
+
+auc_std = np.sqrt(auc_cov)
+lower_upper_q = np.abs(np.array([0, 1]) - (1 - delong_alpha) / 2)
+
+ci = stats.norm.ppf(
+    lower_upper_q,
+    loc=auc_score,
+    scale=auc_std)
+
+ci[ci > 1] = 1
+
+print('AUC variance:', round(auc_cov, 2))
+print('95% AUC CI:', np.round(ci, 2))
+
+# Testing Metrics and AUC CI
+print("Testing Metrics")
+# calculate accuracy
+acc = sklearn.metrics.accuracy_score(y_test, test_pred_class)
+print('Accuracy: %.3f' % acc)
+auc_score = sklearn.metrics.roc_auc_score(y_test, test_pred)
+print('AUC: %.3f' % auc_score)
+f_score = sklearn.metrics.f1_score(y_test, test_pred_class)
+print('FS: %.3f' % f_score)
+
+# delong ci (same Wald construction as for the training set)
+delong_alpha = 0.95
+auc, auc_cov = delong_roc_variance(
+    np.ravel(y_test),
+    np.ravel(test_pred))
+
+auc_std = np.sqrt(auc_cov)
+lower_upper_q = np.abs(np.array([0, 1]) - (1 - delong_alpha) / 2)
+
+ci = stats.norm.ppf(
+    lower_upper_q,
+    loc=auc_score,
+    scale=auc_std)
+
+ci[ci > 1] = 1
+
+print('AUC variance:', round(auc_cov, 2))
+print('95% AUC CI:', np.round(ci, 2))
\ No newline at end of file