Commit 9c3d401e authored by Ryan Melvin, committed by Tarun karthik kumar Mamidi
Python models

parent a9b9f934
*.csv
# Created by https://www.toptal.com/developers/gitignore/api/python
# Edit at https://www.toptal.com/developers/gitignore?templates=python
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
pytestdebug.log
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
doc/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
#poetry.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
# .env
.env/
.venv/
env/
venv/
ENV/
env.bak/
venv.bak/
pythonenv*
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# operating system-related files
# file properties cache/storage on macOS
*.DS_Store
# thumbnail cache on Windows
Thumbs.db
# profiling data
.prof
# End of https://www.toptal.com/developers/gitignore/api/python
# COVID-19_RISK_PREDICTOR
To develop a model that takes in demographics, lifestyle, and symptoms/conditions to predict COVID-19 risk for patients.
### Input file expected in the working directory
```
encoded-2-week-filter.csv
```
### Create conda environment before running anything
```
conda env create -n rico -f configs/environment.yaml
conda activate rico
```
### Run model training
```
python src/Model.py
```
Output files are saved in the top-level directory.
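### Output files
The training script writes (file names taken from `src/Model.py`):
```
2week_training_roc.png    training ROC curve
2week_testing_roc.png     testing ROC curve
2week_dist.png            score distributions and PSI for train vs. test
2week_lasso_card_df.csv   fitted scorecard
scores_lasso_2week.csv    total score for every patient in the input file
```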
name: rico
channels:
  - conda-forge
  - defaults
dependencies:
  - pip:
      - matplotlib==3.3.4
      - researchpy==0.2.3
      - scorecardpy==0.1.9.2
      - xverse==1.0.5
      - scikit-learn==0.23.2
      - pandas==1.2.1
      - numpy==1.19.2
########### 6/11/20 ########################
Project - To develop a model that takes in symptoms/conditions and COVID test results and predicts a risk score for patients. We might also develop an app that gives at-risk patients something like a credit score.
Data - N3C data downloaded from i2b2.
################## Data processing ######################################################################
First, filter to patients who have been tested for coronavirus.
Files involved -
measurement.csv - tests and results
condition_occurrence.csv - conditions
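A minimal sketch of this first filtering step (assuming OMOP-style upper-case
column names, as used elsewhere in the repo; the actual test-selection
criteria are not recorded in these notes):

    import pandas as pd

    measurement = pd.read_csv('measurement.csv')
    # placeholder filter: keep measurements whose source value mentions COVID
    covid_tests = measurement[
        measurement['MEASUREMENT_SOURCE_VALUE'].str.contains('COVID', case=False, na=False)
    ]
    tested_patients = covid_tests['PERSON_ID'].unique()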
#!/usr/bin/env python
#libraries
import pandas as pd
import numpy as np
import xverse
from xverse.transformer import WOE
import scorecardpy as sc
import sklearn
import sklearn.metrics  # imported explicitly; a bare "import sklearn" does not guarantee submodule access
from sklearn.model_selection import train_test_split, StratifiedKFold
import statsmodels.api as sm
import matplotlib
matplotlib.use('Agg')  # headless backend so the script can save figures without a display
from matplotlib import pyplot
from joblib import dump, load
# Functions for computing AUC CI using Delong's method
"""
AUC DeLong CI
Created on Tue Nov 6 10:06:52 2018
@author: yandexdataschool
Original Code found in:
https://github.com/yandexdataschool/roc_comparison
updated: Raul Sanchez-Vazquez
"""
import scipy.stats
from scipy import stats
# AUC comparison adapted from
# https://github.com/Netflix/vmaf/
def compute_midrank(x):
"""Computes midranks.
Args:
x - a 1D numpy array
Returns:
array of midranks
"""
J = np.argsort(x)
Z = x[J]
N = len(x)
    T = np.zeros(N, dtype=float)  # plain float: np.float is deprecated in newer numpy
i = 0
while i < N:
j = i
while j < N and Z[j] == Z[i]:
j += 1
T[i:j] = 0.5*(i + j - 1)
i = j
    T2 = np.empty(N, dtype=float)
# Note(kazeevn) +1 is due to Python using 0-based indexing
# instead of 1-based in the AUC formula in the paper
T2[J] = T + 1
return T2
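# Example (hypothetical input): compute_midrank(np.array([0.2, 0.8, 0.8, 0.5]))
# returns array([1. , 3.5, 3.5, 2. ]): tied scores share the average of the
# positions they occupy, which is what the AUC estimator below relies on.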
def compute_midrank_weight(x, sample_weight):
    """Computes weighted midranks.
    Args:
       x - a 1D numpy array
       sample_weight - a 1D numpy array of sample weights
    Returns:
       array of weighted midranks
    """
J = np.argsort(x)
Z = x[J]
cumulative_weight = np.cumsum(sample_weight[J])
N = len(x)
    T = np.zeros(N, dtype=float)
i = 0
while i < N:
j = i
while j < N and Z[j] == Z[i]:
j += 1
T[i:j] = cumulative_weight[i:j].mean()
i = j
    T2 = np.empty(N, dtype=float)
T2[J] = T
return T2
def fastDeLong(predictions_sorted_transposed, label_1_count, sample_weight):
if sample_weight is None:
return fastDeLong_no_weights(predictions_sorted_transposed, label_1_count)
else:
return fastDeLong_weights(predictions_sorted_transposed, label_1_count, sample_weight)
def fastDeLong_weights(predictions_sorted_transposed, label_1_count, sample_weight):
"""
The fast version of DeLong's method for computing the covariance of
unadjusted AUC.
Args:
predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
       sorted such that the examples with label "1" come first
Returns:
(AUC value, DeLong covariance)
Reference:
@article{sun2014fast,
title={Fast Implementation of DeLong's Algorithm for
    Comparing the Areas Under Correlated Receiver Operating Characteristic Curves},
author={Xu Sun and Weichao Xu},
journal={IEEE Signal Processing Letters},
volume={21},
number={11},
pages={1389--1393},
year={2014},
publisher={IEEE}
}
"""
# Short variables are named as they are in the paper
m = label_1_count
n = predictions_sorted_transposed.shape[1] - m
positive_examples = predictions_sorted_transposed[:, :m]
negative_examples = predictions_sorted_transposed[:, m:]
k = predictions_sorted_transposed.shape[0]
    tx = np.empty([k, m], dtype=float)
    ty = np.empty([k, n], dtype=float)
    tz = np.empty([k, m + n], dtype=float)
for r in range(k):
tx[r, :] = compute_midrank_weight(positive_examples[r, :], sample_weight[:m])
ty[r, :] = compute_midrank_weight(negative_examples[r, :], sample_weight[m:])
tz[r, :] = compute_midrank_weight(predictions_sorted_transposed[r, :], sample_weight)
total_positive_weights = sample_weight[:m].sum()
total_negative_weights = sample_weight[m:].sum()
pair_weights = np.dot(sample_weight[:m, np.newaxis], sample_weight[np.newaxis, m:])
total_pair_weights = pair_weights.sum()
aucs = (sample_weight[:m]*(tz[:, :m] - tx)).sum(axis=1) / total_pair_weights
v01 = (tz[:, :m] - tx[:, :]) / total_negative_weights
v10 = 1. - (tz[:, m:] - ty[:, :]) / total_positive_weights
sx = np.cov(v01)
sy = np.cov(v10)
delongcov = sx / m + sy / n
return aucs, delongcov
def fastDeLong_no_weights(predictions_sorted_transposed, label_1_count):
"""
The fast version of DeLong's method for computing the covariance of
unadjusted AUC.
Args:
predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
       sorted such that the examples with label "1" come first
Returns:
(AUC value, DeLong covariance)
Reference:
@article{sun2014fast,
title={Fast Implementation of DeLong's Algorithm for
    Comparing the Areas Under Correlated Receiver Operating
Characteristic Curves},
author={Xu Sun and Weichao Xu},
journal={IEEE Signal Processing Letters},
volume={21},
number={11},
pages={1389--1393},
year={2014},
publisher={IEEE}
}
"""
# Short variables are named as they are in the paper
m = label_1_count
n = predictions_sorted_transposed.shape[1] - m
positive_examples = predictions_sorted_transposed[:, :m]
negative_examples = predictions_sorted_transposed[:, m:]
k = predictions_sorted_transposed.shape[0]
    tx = np.empty([k, m], dtype=float)
    ty = np.empty([k, n], dtype=float)
    tz = np.empty([k, m + n], dtype=float)
for r in range(k):
tx[r, :] = compute_midrank(positive_examples[r, :])
ty[r, :] = compute_midrank(negative_examples[r, :])
tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :])
aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
v01 = (tz[:, :m] - tx[:, :]) / n
v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m
sx = np.cov(v01)
sy = np.cov(v10)
delongcov = sx / m + sy / n
return aucs, delongcov
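# The AUC above is the Mann-Whitney U statistic normalized by m*n, computed
# from midranks; v01 and v10 are the DeLong structural components whose
# empirical covariance yields the AUC variance.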
def calc_pvalue(aucs, sigma):
"""Computes log(10) of p-values.
Args:
aucs: 1D array of AUCs
sigma: AUC DeLong covariances
Returns:
log10(pvalue)
"""
l = np.array([[1, -1]])
z = np.abs(np.diff(aucs)) / np.sqrt(np.dot(np.dot(l, sigma), l.T))
return np.log10(2) + scipy.stats.norm.logsf(z, loc=0, scale=1) / np.log(10)
def compute_ground_truth_statistics(ground_truth, sample_weight):
assert np.array_equal(np.unique(ground_truth), [0, 1])
order = (-ground_truth).argsort()
label_1_count = int(ground_truth.sum())
if sample_weight is None:
ordered_sample_weight = None
else:
ordered_sample_weight = sample_weight[order]
return order, label_1_count, ordered_sample_weight
def delong_roc_variance(ground_truth, predictions, sample_weight=None):
"""
Computes ROC AUC variance for a single set of predictions
Args:
ground_truth: np.array of 0 and 1
predictions: np.array of floats of the probability of being class 1
"""
order, label_1_count, ordered_sample_weight = compute_ground_truth_statistics(
ground_truth, sample_weight)
predictions_sorted_transposed = predictions[np.newaxis, order]
aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count, ordered_sample_weight)
assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers"
return aucs[0], delongcov
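# Example usage on synthetic data (illustrative only):
#   auc, var = delong_roc_variance(np.array([0, 0, 1, 1]),
#                                  np.array([0.1, 0.4, 0.35, 0.8]))
# gives auc == 0.75, since 3 of the 4 positive/negative pairs are ranked correctly.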
# Data setup
# Read, filter based on missingness and identical limits,
# train/test split, and perform WoE transform.
# load data
encoded = pd.read_csv('encoded-2-week-filter.csv')
encoded = encoded.drop(['PERSON_ID'],axis=1)
# filter variable via missing rate, iv, identical value rate
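# Note: the f_* flags forced in via var_kp below appear to be ICD-10 code
# groups (e.g. R05 cough, R50 fever, E11 type 2 diabetes, I10 hypertension);
# this is an assumption about the upstream encoding, not documented here.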
encoded_f = sc.var_filter(encoded
, y="class"
, positive='negative'
, identical_limit = 0.95
, iv_limit = 0
, missing_limit=0.95
, return_rm_reason=False # True would also return the removal reasons as a second df
, var_kp=['f_R06'
, 'f_R05'
, 'f_R50'
, 'f_R53'
, 'f_M79'
, 'f_R09'
, 'f_R51'
, 'f_J44'
, 'f_E11'
, 'f_I25'
, 'f_I10'
]
, var_rm = [
'f_BMI-unknown'
, 'f_Unknown'
]
)
# breaking dt into train and test
train, test = sc.split_df(encoded_f, 'class').values()
# woe binning ------
bins = sc.woebin(encoded_f, y="class")
# converting train and test into woe values
train_woe = sc.woebin_ply(train, bins)
test_woe = sc.woebin_ply(test, bins)
# get xs and ys
y_train = train_woe.loc[:,'class']
X_train = train_woe.loc[:,train_woe.columns != 'class']
y_test = test_woe.loc[:,'class']
X_test = test_woe.loc[:,test_woe.columns != 'class']
# Lasso-based regression
# Determine a lambda for Lasso (l1) regularization using
# 10-fold cross validation, get predictions from best model, score, and make scorecard
# logistic regression ------
# lasso
from sklearn.linear_model import LogisticRegressionCV
lasso_cv = LogisticRegressionCV(penalty='l1'
, Cs = 100
, solver='saga'
, cv = StratifiedKFold(10)
, n_jobs=-1
, max_iter = 10000
, scoring = 'neg_log_loss'
, class_weight = 'balanced'
)
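# saga is one of the few sklearn solvers that supports the l1 penalty, and
# Cs=100 searches 100 values of the inverse regularization strength, so the
# cross-validation effectively picks the LASSO lambda.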
lasso_cv.fit(X_train, y_train)
# plot training ROC
sklearn.metrics.plot_roc_curve(lasso_cv, X_train, y_train)
pyplot.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
pyplot.title('2-Week LASSO Training ROC')
axes = pyplot.gca()
axes.set_facecolor("white")
axes.set_clip_on(False)
pyplot.savefig('2week_training_roc.png')
# plot testing ROC
sklearn.metrics.plot_roc_curve(lasso_cv, X_test, y_test)
pyplot.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
pyplot.title('2-Week LASSO Testing ROC')
axes = pyplot.gca()
axes.set_facecolor("white")
axes.set_clip_on(False)
pyplot.savefig('2week_testing_roc.png')
# predicted probability
train_pred = lasso_cv.predict_proba(X_train)[:,1]
train_pred_class = lasso_cv.predict(X_train)
test_pred = lasso_cv.predict_proba(X_test)[:,1]
test_pred_class = lasso_cv.predict(X_test)
# Make scorecard
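# sc.scorecard combines the WoE bins with the fitted logistic coefficients
# into a points-based card: each bin of each kept variable maps to points.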
card = sc.scorecard(bins, lasso_cv, X_train.columns)
# credit score
train_score = sc.scorecard_ply(train, card, print_step=0)
test_score = sc.scorecard_ply(test, card, print_step=0)
# psi
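# PSI (population stability index) compares the score distribution between
# train and test; values below roughly 0.1 are conventionally read as stable.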
pyplot.rcParams["font.size"] = "18"
fig = sc.perf_psi(
score = {'train':train_score, 'test':test_score},
label = {'train':y_train, 'test':y_test},
x_tick_break=50
)
fig['pic']['score'].set_size_inches(18.5, 10.5)
fig['pic']['score'].savefig('2week_dist.png')
card_df = pd.concat(card)
card_df.to_csv('2week_lasso_card_df.csv')
scores_lasso_2week = sc.scorecard_ply(encoded, card, only_total_score=True, print_step=0, replace_blank_na=True)
scores_lasso_2week.to_csv('scores_lasso_2week.csv')
# Training Metrics and AUC CI
print("Training Metrics")
# calculate accuracy
acc = sklearn.metrics.accuracy_score(y_train, train_pred_class)
print('Accuracy: %.3f' % acc)
auc_score = sklearn.metrics.roc_auc_score(y_train, train_pred)
print('AUC: %.3f' % auc_score)
f_score = sklearn.metrics.f1_score(y_train, train_pred_class)
print('FS: %.3f' % f_score)
# delong ci
delong_alpha = 0.95
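# two-sided normal-approximation CI: the 2.5% and 97.5% quantiles of a normal
# centered at the AUC estimate, with the DeLong standard deviation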
auc, auc_cov = delong_roc_variance(
np.ravel(y_train),
np.ravel(train_pred))
auc_std = np.sqrt(auc_cov)
lower_upper_q = np.abs(np.array([0, 1]) - (1 - delong_alpha) / 2)
ci = stats.norm.ppf(
lower_upper_q,
loc=auc_score,
scale=auc_std)
ci[ci > 1] = 1
print('AUC COV:', round(auc_cov,2))
print('95% AUC CI:', np.round(ci,2))
# Testing Metrics and AUC CI
print("Testing Metrics")
# calculate accuracy
acc = sklearn.metrics.accuracy_score(y_test, test_pred_class)
print('Accuracy: %.3f' % acc)
auc_score = sklearn.metrics.roc_auc_score(y_test, test_pred)
print('AUC: %.3f' % auc_score)
f_score = sklearn.metrics.f1_score(y_test, test_pred_class)
print('FS: %.3f' % f_score)
# delong ci
delong_alpha = 0.95
auc, auc_cov = delong_roc_variance(
np.ravel(y_test),
np.ravel(test_pred))
auc_std = np.sqrt(auc_cov)
lower_upper_q = np.abs(np.array([0, 1]) - (1 - delong_alpha) / 2)
ci = stats.norm.ppf(
lower_upper_q,
loc=auc_score,
scale=auc_std)
ci[ci > 1] = 1
print('AUC COV:', round(auc_cov,2))
print('95% AUC CI:', np.round(ci,2))