Commit 9c3d401e authored by Ryan Melvin, committed by Tarun karthik kumar Mamidi
Python models

parent a9b9f934
*.csv
# Created by https://www.toptal.com/developers/gitignore/api/python
# Edit at https://www.toptal.com/developers/gitignore?templates=python
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
pytestdebug.log
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
doc/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
#poetry.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
# .env
.env/
.venv/
env/
venv/
ENV/
env.bak/
venv.bak/
pythonenv*
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# operating system-related files
# file properties cache/storage on macOS
*.DS_Store
# thumbnail cache on Windows
Thumbs.db
# profiling data
.prof
# End of https://www.toptal.com/developers/gitignore/api/python
# COVID-19_RISK_PREDICTOR
To develop a model that takes in demographics, lifestyle, and symptoms/conditions to predict COVID-19 risk for patients.
### Input file expected in the working directory
```
encoded-2-week-filter.csv
```
### Create conda environment before running anything
```
conda env create -n rico -f configs/environment.yaml
conda activate rico
```
### Run model training
```
python src/Model.py
```
Output files are saved in the top-level directory.
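### Output files
The training script writes (file names taken from `src/Model.py`):
```
2week_training_roc.png    training ROC curve
2week_testing_roc.png     testing ROC curve
2week_dist.png            score distributions and PSI for train vs. test
2week_lasso_card_df.csv   fitted scorecard
scores_lasso_2week.csv    total score for every patient in the input file
```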
name: rico
channels:
  - conda-forge
  - defaults
dependencies:
  - pip:
      - matplotlib==3.3.4
      - researchpy==0.2.3
      - scorecardpy==0.1.9.2
      - xverse==1.0.5
      - scikit-learn==0.23.2
      - pandas==1.2.1
      - numpy==1.19.2
########### 6/11/20 ########################
Project - To develop a model that takes in symptoms/conditions and COVID test results and predicts a risk score for patients. We might also develop an app that gives at-risk patients something like a credit score.
Data - N3C data downloaded from i2b2.
################## Data processing ######################################################################
First, filter to patients who have been tested for coronavirus.
Files involved -
measurement.csv - tests and results
condition_occurrence.csv - conditions
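A minimal sketch of this first filtering step (assuming OMOP-style upper-case
column names, as used elsewhere in the repo; the actual test-selection
criteria are not recorded in these notes):

    import pandas as pd

    measurement = pd.read_csv('measurement.csv')
    # placeholder filter: keep measurements whose source value mentions COVID
    covid_tests = measurement[
        measurement['MEASUREMENT_SOURCE_VALUE'].str.contains('COVID', case=False, na=False)
    ]
    tested_patients = covid_tests['PERSON_ID'].unique()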
#!/usr/bin/env python
#libraries
import pandas as pd
import numpy as np
import xverse
from xverse.transformer import WOE
import scorecardpy as sc
import sklearn
import sklearn.metrics  # imported explicitly; a bare "import sklearn" does not guarantee submodule access
from sklearn.model_selection import train_test_split, StratifiedKFold
import statsmodels.api as sm
import matplotlib
matplotlib.use('Agg')  # headless backend so the script can save figures without a display
from matplotlib import pyplot
from joblib import dump, load
# Functions for computing AUC CI using Delong's method
"""
AUC DeLong CI
Created on Tue Nov 6 10:06:52 2018
@author: yandexdataschool
Original Code found in:
https://github.com/yandexdataschool/roc_comparison
updated: Raul Sanchez-Vazquez
"""
import scipy.stats
from scipy import stats
# AUC comparison adapted from
# https://github.com/Netflix/vmaf/
def compute_midrank(x):
"""Computes midranks.
Args:
x - a 1D numpy array
Returns:
array of midranks
"""
J = np.argsort(x)
Z = x[J]
N = len(x)
    T = np.zeros(N, dtype=float)  # plain float: np.float is deprecated in newer numpy
i = 0
while i < N:
j = i
while j < N and Z[j] == Z[i]:
j += 1
T[i:j] = 0.5*(i + j - 1)
i = j
    T2 = np.empty(N, dtype=float)
# Note(kazeevn) +1 is due to Python using 0-based indexing
# instead of 1-based in the AUC formula in the paper
T2[J] = T + 1
return T2
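# Example (hypothetical input): compute_midrank(np.array([0.2, 0.8, 0.8, 0.5]))
# returns array([1. , 3.5, 3.5, 2. ]): tied scores share the average of the
# positions they occupy, which is what the AUC estimator below relies on.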
def compute_midrank_weight(x, sample_weight):
    """Computes weighted midranks.
    Args:
       x - a 1D numpy array
       sample_weight - a 1D numpy array of sample weights
    Returns:
       array of weighted midranks
    """
J = np.argsort(x)
Z = x[J]
cumulative_weight = np.cumsum(sample_weight[J])
N = len(x)
    T = np.zeros(N, dtype=float)
i = 0
while i < N:
j = i
while j < N and Z[j] == Z[i]:
j += 1
T[i:j] = cumulative_weight[i:j].mean()
i = j
    T2 = np.empty(N, dtype=float)
T2[J] = T
return T2
def fastDeLong(predictions_sorted_transposed, label_1_count, sample_weight):
if sample_weight is None:
return fastDeLong_no_weights(predictions_sorted_transposed, label_1_count)
else:
return fastDeLong_weights(predictions_sorted_transposed, label_1_count, sample_weight)
def fastDeLong_weights(predictions_sorted_transposed, label_1_count, sample_weight):
"""
The fast version of DeLong's method for computing the covariance of
unadjusted AUC.
Args:
predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
       sorted such that the examples with label "1" come first
Returns:
(AUC value, DeLong covariance)
Reference:
@article{sun2014fast,
title={Fast Implementation of DeLong's Algorithm for
    Comparing the Areas Under Correlated Receiver Operating Characteristic Curves},
author={Xu Sun and Weichao Xu},
journal={IEEE Signal Processing Letters},
volume={21},
number={11},
pages={1389--1393},
year={2014},
publisher={IEEE}
}
"""
# Short variables are named as they are in the paper
m = label_1_count
n = predictions_sorted_transposed.shape[1] - m
positive_examples = predictions_sorted_transposed[:, :m]
negative_examples = predictions_sorted_transposed[:, m:]
k = predictions_sorted_transposed.shape[0]
    tx = np.empty([k, m], dtype=float)
    ty = np.empty([k, n], dtype=float)
    tz = np.empty([k, m + n], dtype=float)
for r in range(k):
tx[r, :] = compute_midrank_weight(positive_examples[r, :], sample_weight[:m])
ty[r, :] = compute_midrank_weight(negative_examples[r, :], sample_weight[m:])
tz[r, :] = compute_midrank_weight(predictions_sorted_transposed[r, :], sample_weight)
total_positive_weights = sample_weight[:m].sum()
total_negative_weights = sample_weight[m:].sum()
pair_weights = np.dot(sample_weight[:m, np.newaxis], sample_weight[np.newaxis, m:])
total_pair_weights = pair_weights.sum()
aucs = (sample_weight[:m]*(tz[:, :m] - tx)).sum(axis=1) / total_pair_weights
v01 = (tz[:, :m] - tx[:, :]) / total_negative_weights
v10 = 1. - (tz[:, m:] - ty[:, :]) / total_positive_weights
sx = np.cov(v01)
sy = np.cov(v10)
delongcov = sx / m + sy / n
return aucs, delongcov
def fastDeLong_no_weights(predictions_sorted_transposed, label_1_count):
"""
The fast version of DeLong's method for computing the covariance of
unadjusted AUC.
Args:
predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
       sorted such that the examples with label "1" come first
Returns:
(AUC value, DeLong covariance)
Reference:
@article{sun2014fast,
title={Fast Implementation of DeLong's Algorithm for
    Comparing the Areas Under Correlated Receiver Operating
Characteristic Curves},
author={Xu Sun and Weichao Xu},
journal={IEEE Signal Processing Letters},
volume={21},
number={11},
pages={1389--1393},
year={2014},
publisher={IEEE}
}
"""
# Short variables are named as they are in the paper
m = label_1_count
n = predictions_sorted_transposed.shape[1] - m
positive_examples = predictions_sorted_transposed[:, :m]
negative_examples = predictions_sorted_transposed[:, m:]
k = predictions_sorted_transposed.shape[0]
    tx = np.empty([k, m], dtype=float)
    ty = np.empty([k, n], dtype=float)
    tz = np.empty([k, m + n], dtype=float)
for r in range(k):
tx[r, :] = compute_midrank(positive_examples[r, :])
ty[r, :] = compute_midrank(negative_examples[r, :])
tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :])
aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
v01 = (tz[:, :m] - tx[:, :]) / n
v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m
sx = np.cov(v01)
sy = np.cov(v10)
delongcov = sx / m + sy / n
return aucs, delongcov
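# The AUC above is the Mann-Whitney U statistic normalized by m*n, computed
# from midranks; v01 and v10 are the DeLong structural components whose
# empirical covariance yields the AUC variance.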
def calc_pvalue(aucs, sigma):
"""Computes log(10) of p-values.
Args:
aucs: 1D array of AUCs
sigma: AUC DeLong covariances
Returns:
log10(pvalue)
"""
l = np.array([[1, -1]])
z = np.abs(np.diff(aucs)) / np.sqrt(np.dot(np.dot(l, sigma), l.T))
return np.log10(2) + scipy.stats.norm.logsf(z, loc=0, scale=1) / np.log(10)
def compute_ground_truth_statistics(ground_truth, sample_weight):
assert np.array_equal(np.unique(ground_truth), [0, 1])
order = (-ground_truth).argsort()
label_1_count = int(ground_truth.sum())
if sample_weight is None:
ordered_sample_weight = None
else:
ordered_sample_weight = sample_weight[order]
return order, label_1_count, ordered_sample_weight
def delong_roc_variance(ground_truth, predictions, sample_weight=None):
"""
Computes ROC AUC variance for a single set of predictions
Args:
ground_truth: np.array of 0 and 1
predictions: np.array of floats of the probability of being class 1
"""
order, label_1_count, ordered_sample_weight = compute_ground_truth_statistics(
ground_truth, sample_weight)
predictions_sorted_transposed = predictions[np.newaxis, order]
aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count, ordered_sample_weight)
assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers"
return aucs[0], delongcov
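# Example usage on synthetic data (illustrative only):
#   auc, var = delong_roc_variance(np.array([0, 0, 1, 1]),
#                                  np.array([0.1, 0.4, 0.35, 0.8]))
# gives auc == 0.75, since 3 of the 4 positive/negative pairs are ranked correctly.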
# Data setup
# Read, filter based on missingness and identical limits,
# train/test split, and perform WoE transform.
# load data
encoded = pd.read_csv('encoded-2-week-filter.csv')
encoded = encoded.drop(['PERSON_ID'],axis=1)
# filter variable via missing rate, iv, identical value rate
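# Note: the f_* flags forced in via var_kp below appear to be ICD-10 code
# groups (e.g. R05 cough, R50 fever, E11 type 2 diabetes, I10 hypertension);
# this is an assumption about the upstream encoding, not documented here.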
encoded_f = sc.var_filter(encoded
, y="class"
, positive='negative'
, identical_limit = 0.95
, iv_limit = 0
, missing_limit=0.95
, return_rm_reason=False # True would also return the removal reasons as a second df
, var_kp=['f_R06'
, 'f_R05'
, 'f_R50'
, 'f_R53'
, 'f_M79'
, 'f_R09'
, 'f_R51'
, 'f_J44'
, 'f_E11'
, 'f_I25'
, 'f_I10'
]
, var_rm = [
'f_BMI-unknown'
, 'f_Unknown'
]
)
# breaking dt into train and test
train, test = sc.split_df(encoded_f, 'class').values()
# woe binning ------
bins = sc.woebin(encoded_f, y="class")
# converting train and test into woe values
train_woe = sc.woebin_ply(train, bins)
test_woe = sc.woebin_ply(test, bins)
# get xs and ys
y_train = train_woe.loc[:,'class']
X_train = train_woe.loc[:,train_woe.columns != 'class']
y_test = test_woe.loc[:,'class']
X_test = test_woe.loc[:,test_woe.columns != 'class']
# Lasso-based regression
# Determine a lambda for Lasso (l1) regularization using
# 10-fold cross validation, get predictions from best model, score, and make scorecard
# logistic regression ------
# lasso
from sklearn.linear_model import LogisticRegressionCV
lasso_cv = LogisticRegressionCV(penalty='l1'
, Cs = 100
, solver='saga'
, cv = StratifiedKFold(10)
, n_jobs=-1
, max_iter = 10000
, scoring = 'neg_log_loss'
, class_weight = 'balanced'
)
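# saga is one of the few sklearn solvers that supports the l1 penalty, and
# Cs=100 searches 100 values of the inverse regularization strength, so the
# cross-validation effectively picks the LASSO lambda.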
lasso_cv.fit(X_train, y_train)
# plot training ROC
sklearn.metrics.plot_roc_curve(lasso_cv, X_train, y_train)
pyplot.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
pyplot.title('2-Week LASSO Training ROC')
axes = pyplot.gca()
axes.set_facecolor("white")
axes.set_clip_on(False)
pyplot.savefig('2week_training_roc.png')
# plot testing ROC
sklearn.metrics.plot_roc_curve(lasso_cv, X_test, y_test)
pyplot.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
pyplot.title('2-Week LASSO Testing ROC')
axes = pyplot.gca()
axes.set_facecolor("white")
axes.set_clip_on(False)
pyplot.savefig('2week_testing_roc.png')
# predicted probability
train_pred = lasso_cv.predict_proba(X_train)[:,1]
train_pred_class = lasso_cv.predict(X_train)
test_pred = lasso_cv.predict_proba(X_test)[:,1]
test_pred_class = lasso_cv.predict(X_test)
# Make scorecard
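# sc.scorecard combines the WoE bins with the fitted logistic coefficients
# into a points-based card: each bin of each kept variable maps to points.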
card = sc.scorecard(bins, lasso_cv, X_train.columns)
# credit score
train_score = sc.scorecard_ply(train, card, print_step=0)
test_score = sc.scorecard_ply(test, card, print_step=0)
# psi
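# PSI (population stability index) compares the score distribution between
# train and test; values below roughly 0.1 are conventionally read as stable.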
pyplot.rcParams["font.size"] = "18"
fig = sc.perf_psi(
score = {'train':train_score, 'test':test_score},
label = {'train':y_train, 'test':y_test},
x_tick_break=50
)
fig['pic']['score'].set_size_inches(18.5, 10.5)
fig['pic']['score'].savefig('2week_dist.png')
card_df = pd.concat(card)
card_df.to_csv('2week_lasso_card_df.csv')
scores_lasso_2week = sc.scorecard_ply(encoded, card, only_total_score=True, print_step=0, replace_blank_na=True)
scores_lasso_2week.to_csv('scores_lasso_2week.csv')
# Training Metrics and AUC CI
print("Training Metrics")
# calculate accuracy
acc = sklearn.metrics.accuracy_score(y_train, train_pred_class)
print('Accuracy: %.3f' % acc)
auc_score = sklearn.metrics.roc_auc_score(y_train, train_pred)
print('AUC: %.3f' % auc_score)
f_score = sklearn.metrics.f1_score(y_train, train_pred_class)
print('FS: %.3f' % f_score)
# delong ci
delong_alpha = 0.95
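# two-sided normal-approximation CI: the 2.5% and 97.5% quantiles of a normal
# centered at the AUC estimate, with the DeLong standard deviation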
auc, auc_cov = delong_roc_variance(
np.ravel(y_train),
np.ravel(train_pred))
auc_std = np.sqrt(auc_cov)
lower_upper_q = np.abs(np.array([0, 1]) - (1 - delong_alpha) / 2)
ci = stats.norm.ppf(
lower_upper_q,
loc=auc_score,
scale=auc_std)
ci[ci > 1] = 1
print('AUC COV:', round(auc_cov,2))
print('95% AUC CI:', np.round(ci,2))
# Testing Metrics and AUC CI
print("Testing Metrics")
# calculate accuracy
acc = sklearn.metrics.accuracy_score(y_test, test_pred_class)
print('Accuracy: %.3f' % acc)
auc_score = sklearn.metrics.roc_auc_score(y_test, test_pred)
print('AUC: %.3f' % auc_score)
f_score = sklearn.metrics.f1_score(y_test, test_pred_class)
print('FS: %.3f' % f_score)
# delong ci
delong_alpha = 0.95
auc, auc_cov = delong_roc_variance(
np.ravel(y_test),
np.ravel(test_pred))
auc_std = np.sqrt(auc_cov)
lower_upper_q = np.abs(np.array([0, 1]) - (1 - delong_alpha) / 2)
ci = stats.norm.ppf(
lower_upper_q,
loc=auc_score,
scale=auc_std)
ci[ci > 1] = 1
print('AUC COV:', round(auc_cov,2))
print('95% AUC CI:', np.round(ci,2))