diff --git a/Jenkinsfile b/Jenkinsfile
new file mode 100644
index 0000000000000000000000000000000000000000..dab72cc938eef19c568977d84f3f9c484d5e1245
--- /dev/null
+++ b/Jenkinsfile
@@ -0,0 +1,54 @@
+pipeline {  // CI: lint, then unit-test; every result is posted to GitLab as a commit status
+  agent any
+  options {
+    timestamps()
+    ansiColor('xterm')
+  }
+  environment {  // both values come from Jenkins credentials and reach the steps as environment variables
+    GITLAB_API_TOKEN = credentials('GitLabToken')
+    BASE_GITLAB_URL = credentials('BaseGitlabUrl')
+  }
+  stages {
+    stage('Static Analysis') {
+      agent {  // image name is double-quoted: Groovy only expands ${...} inside double-quoted strings
+        docker { image "${BASE_GITLAB_URL}/center-for-computational-genomics-and-data-science/utility-images/static-analysis:v1.1" }
+      }
+      steps {
+        sh '/bin/linting.sh'
+      }
+      post {  // sh strings are single-quoted so the shell, not Groovy, expands the token (no secret interpolation)
+        success {
+          sh 'curl --request POST --header "PRIVATE-TOKEN: ${GITLAB_API_TOKEN}" "https://gitlab.rc.uab.edu/api/v4/projects/1585/statuses/${GIT_COMMIT}?state=success&name=jenkins_static_analysis"'
+        }
+        failure {  // NOTE(review): failures are reported as "canceled"; GitLab also accepts "failed" - confirm intent
+          sh 'curl --request POST --header "PRIVATE-TOKEN: ${GITLAB_API_TOKEN}" "https://gitlab.rc.uab.edu/api/v4/projects/1585/statuses/${GIT_COMMIT}?state=canceled&name=jenkins_static_analysis"'
+        }
+      }
+    }
+    stage('Unit Test') {
+      agent {
+        docker { image 'continuumio/miniconda3:4.9.2' }
+      }
+      steps {  // NOTE(review): no step activates the conda environment created below before python runs - confirm
+        sh 'conda env create --file configs/environment.yaml'
+        sh 'python -m unittest -v testing/unit_test.py'
+      }
+      post {  // same reporting pattern as the Static Analysis stage, under the name jenkins_unit_tests
+        success {
+          sh 'curl --request POST --header "PRIVATE-TOKEN: ${GITLAB_API_TOKEN}" "https://gitlab.rc.uab.edu/api/v4/projects/1585/statuses/${GIT_COMMIT}?state=success&name=jenkins_unit_tests"'
+        }
+        failure {
+          sh 'curl --request POST --header "PRIVATE-TOKEN: ${GITLAB_API_TOKEN}" "https://gitlab.rc.uab.edu/api/v4/projects/1585/statuses/${GIT_COMMIT}?state=canceled&name=jenkins_unit_tests"'
+        }
+      }
+    }
+  }
+  post {  // overall build result, reported under the status name "jenkins"
+    success {
+      sh 'curl --request POST --header "PRIVATE-TOKEN: ${GITLAB_API_TOKEN}" "https://gitlab.rc.uab.edu/api/v4/projects/1585/statuses/${GIT_COMMIT}?state=success&name=jenkins"'
+    }
+    failure {
+      sh 'curl --request POST --header "PRIVATE-TOKEN: ${GITLAB_API_TOKEN}" "https://gitlab.rc.uab.edu/api/v4/projects/1585/statuses/${GIT_COMMIT}?state=canceled&name=jenkins"'
+    }
+  }
+}
diff --git a/License.md b/License.md
index b709f88c8be63347fbd203d002876dfd86408711..58985c4916048746970da81e8016dd125fd6fccc 100644
--- a/License.md
+++ b/License.md
@@ -1,9 +1,16 @@
-The MIT License (MIT)
+# The MIT License (MIT)
 
 Copyright (c) 2021 Center for Computational Genomics and Data Science
 
-Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
+rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
+persons to whom the Software is furnished to do so, subject to the following conditions:
 
-The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
+Software.
 
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
\ No newline at end of file
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
index 47e3f90180d67c4f697ab103b8aa38c96d23bdcf..b4a1d69f053e838a60248eb3048a8f6fc6960e0b 100644
--- a/README.md
+++ b/README.md
@@ -1,29 +1,34 @@
--   [COVID-19_RISK_PREDICTOR](#COVID-19_RISK_PREDICTOR)
-    -   [Data availability](#Data-availability)
-    -   [Usage](#Usage)
-        -   [Installation](#Installation)
-        -   [Requirements](#Requirements)
-        -   [Activate conda environment](#Activate-conda-environment)
-        -   [Run parser](#Run-parser)
-        -   [Run model training](#Run-model-training)
-        -   [Build Streamlit app](#Build-Streamlit-app)
-    -   [Contact information](#Contact-information)
-
-
-
-
 # COVID-19_RISK_PREDICTOR
+
 ***!!! For research purposes only !!!***
 
-**Aim:** To develop a model that takes in demographics, living style and symptoms/conditions to predict risk of COVID-19 infection for patients.
+- [COVID-19_RISK_PREDICTOR](#covid-19_risk_predictor)
+    - [Data availability](#data-availability)
+    - [Usage](#usage)
+        - [Installation](#installation)
+        - [Requirements](#requirements)
+        - [Activate conda environment](#activate-conda-environment)
+        - [Run parser](#run-parser)
+        - [Run model training](#run-model-training)
+        - [Build Streamlit app](#build-streamlit-app)
+        - [Unit Testing](#unit-testing)
+    - [Contact information](#contact-information)
+
+**Aim:** To develop a model that takes in demographics, living style and symptoms/conditions to predict risk of COVID-19
+infection for patients.
 
 ## Data availability
-Data was made available through the UAB Biomedical Research Information Technology Enhancement (U-BRITE) framework. Access to the level-2 i2b2 data was granted upon self-service pursuant to an IRB exemption. [link](https://www.uab.edu/ccts/research-commons/berd/55-research-commons/informatics/325-i2b2)
+
+Data was made available through the UAB Biomedical Research Information Technology Enhancement (U-BRITE) framework.
+Access to the level-2 i2b2 data was granted upon self-service pursuant to an IRB exemption.
+[link](https://www.uab.edu/ccts/research-commons/berd/55-research-commons/informatics/325-i2b2)
 
 ### Directory structure used to parse data from positive and negative cohorts
-Dataset used was transformed to adhere to the [OMOP Common Data Model Version 5.3.1](https://ohdsi.github.io/CommonDataModel/cdm531.html) to enable systemic analyses of EHR data from disparate sources.
 
-```
+Dataset used was transformed to adhere to the [OMOP Common Data Model Version 5.3.1](https://ohdsi.github.io/CommonDataModel/cdm531.html)
+to enable systematic analyses of EHR data from disparate sources.
+
+```directory
 Cohorts/
 ├── positive               <--- positive cohort directory
 │   ├── measurement.csv - test and results
@@ -38,10 +43,10 @@ Cohorts/
 └── README.md
 ```
 
-
 ## Usage
 
 ### Installation
+
 Installation simply requires fetching the source code. Following are required:
 
 - Git
@@ -81,32 +86,62 @@ conda activate rico
 ```
 
 ### Run parser
-```
+
+```sh
 python src/filter_dataset.py --pos Cohorts/positive/ --neg Cohorts/negative/
 ```
 
 For help, use the `-h` help argument
-```
+
+```sh
 python src/filter_dataset.py -h
 ```
 
 parsed files are saved in `./results` directory.
 
 ### Run model training
-```
+
+```sh
 python src/Model.py --input results/encoded-100-week-filter.csv
 ```
 
 output files are saved in `./results` directory.
 
 ### Build Streamlit app
-As an example, we created a streamlit app with the results from our model. Please refer to
+
+To demonstrate the application of these models, one of the four was chosen, and a sample Streamlit app was created and included in the project. Please refer to
 `src/streamlit/RICO.py`
 
+**Note** - This Streamlit app demonstrates one of the models. It is not required by the pipeline; it only displays the calculation and its interpretation. The questionnaire from the models can be used manually without it. Hence, the Streamlit app is not tested and should be used at your own risk, either for demo purposes or as a guide for building on this work.
+
+### Unit Testing
+
+To test the functions in `filter_dataset.py`, use the following command:
+
+```sh
+python -m unittest -v testing/unit_test.py
+```
+
+To measure test coverage, use the following commands:
+
+```sh
+# test the coverage
+coverage run -m unittest -v testing/unit_test.py
+
+# To get a coverage report
+coverage report
+
+# To get annotated HTML listings
+coverage html
+```
+
+**Note** - Functions in `Model.py` are adapted from [this GitHub repo](https://github.com/yandexdataschool/roc_comparison),
+which already includes unit tests.
 
 ## Contact information
-For issues, please send an email with clear description to 
+
+For issues, please send an email with a clear description to
 
 Tarun Mamidi    -   tmamidi@uab.edu
 
-Ryan Melvin     -   rmelvin@uabmc.edu
\ No newline at end of file
+Ryan Melvin     -   rmelvin@uabmc.edu
diff --git a/configs/environment.yaml b/configs/environment.yaml
index b794c9d4816a94dc44f237c784b59fc351c87c5f..a6e49d83c926f0aed0c84bf3eb8eac84e20e3d1c 100644
--- a/configs/environment.yaml
+++ b/configs/environment.yaml
@@ -10,8 +10,10 @@ dependencies:
   - pyyaml=5.4.1
   - matplotlib=3.3.4
   - scikit-learn=0.24.1
-  - pip
+  - black=21.5b0
+  - parameterized=0.8.1
+  - pip=21.1.1
   - pip:
-    - scorecardpy==0.1.9.2
-    - xverse==1.0.5
-
+      - scorecardpy==0.1.9.2
+      - xverse==1.0.5
+      - coverage==5.5
diff --git a/src/Model.py b/src/Model.py
index 3c7921a45270b3a4ea177d9b311d25620bc50f2c..1854c3551ebbb132645db849d660334d661aaa17 100644
--- a/src/Model.py
+++ b/src/Model.py
@@ -1,4 +1,4 @@
-#libraries
+# libraries
 import pandas as pd
 import numpy as np
 import xverse
@@ -8,13 +8,14 @@ import sklearn
 from sklearn.model_selection import train_test_split, StratifiedKFold
 import statsmodels.api as sm
 from matplotlib import pyplot
+
 #%matplotlib inline
 from joblib import dump, load
 import argparse
 from scipy import stats
 
 # Functions for computing AUC CI using Delong's method
- #!/usr/bin/python
+#!/usr/bin/python
 
 """
 AUC DeLong CI
@@ -47,7 +48,7 @@ def compute_midrank(x):
         j = i
         while j < N and Z[j] == Z[i]:
             j += 1
-        T[i:j] = 0.5*(i + j - 1)
+        T[i:j] = 0.5 * (i + j - 1)
         i = j
     T2 = np.empty(N, dtype=np.float)
     # Note(kazeevn) +1 is due to Python using 0-based indexing
@@ -127,9 +128,9 @@ def fastDeLong_weights(predictions_sorted_transposed, label_1_count, sample_weig
     total_negative_weights = sample_weight[m:].sum()
     pair_weights = np.dot(sample_weight[:m, np.newaxis], sample_weight[np.newaxis, m:])
     total_pair_weights = pair_weights.sum()
-    aucs = (sample_weight[:m]*(tz[:, :m] - tx)).sum(axis=1) / total_pair_weights
+    aucs = (sample_weight[:m] * (tz[:, :m] - tx)).sum(axis=1) / total_pair_weights
     v01 = (tz[:, :m] - tx[:, :]) / total_negative_weights
-    v10 = 1. - (tz[:, m:] - ty[:, :]) / total_positive_weights
+    v10 = 1.0 - (tz[:, m:] - ty[:, :]) / total_positive_weights
     sx = np.cov(v01)
     sy = np.cov(v10)
     delongcov = sx / m + sy / n
@@ -215,192 +216,183 @@ def delong_roc_variance(ground_truth, predictions, sample_weight=None):
        predictions: np.array of floats of the probability of being class 1
     """
     order, label_1_count, ordered_sample_weight = compute_ground_truth_statistics(
-        ground_truth, sample_weight)
+        ground_truth, sample_weight
+    )
     predictions_sorted_transposed = predictions[np.newaxis, order]
-    aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count, ordered_sample_weight)
+    aucs, delongcov = fastDeLong(
+        predictions_sorted_transposed, label_1_count, ordered_sample_weight
+    )
     assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers"
     return aucs[0], delongcov
 
 
 if __name__ == "__main__":
     # Data setup
-    # Read, filter based on missingness and identical limits, 
+    # Read, filter based on missingness and identical limits,
     # train/test split, and perform WoE transform.
     parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--input", 
-        type=str,
-        required=True,
-        help="input encoded file")
+    parser.add_argument("--input", type=str, required=True, help="input encoded file")
 
     args = parser.parse_args()
     # load data
     encoded = pd.read_csv(args.input)
-    
-    encoded = encoded.drop(['PERSON_ID'],axis=1)
-    
+
+    encoded = encoded.drop(["PERSON_ID"], axis=1)
+
     # filter variable via missing rate, iv, identical value rate
-    encoded_f = sc.var_filter(encoded
-                            , y="class"
-                            , positive='negative'
-                            , identical_limit = 0.95
-                            , iv_limit = 0
-                            , missing_limit=0.95
-                            , return_rm_reason=False # makes output a dictionary referencing 2 dfs
-                            , var_kp=['f_R06'
-                                       , 'f_R05'
-                                       , 'f_R50'
-                                       , 'f_R53'
-                                       , 'f_M79'
-                                       , 'f_R09'
-                                       , 'f_R51'
-                                       , 'f_J44'
-                                       , 'f_E11'
-                                       , 'f_I25'
-                                       , 'f_I10'
-                                     ]
-                            , var_rm = [
-                                'f_BMI-unknown'
-                                , 'f_Unknown'
-                            ]
-                             )
-    
+    encoded_f = sc.var_filter(
+        encoded,
+        y="class",
+        positive="negative",
+        identical_limit=0.95,
+        iv_limit=0,
+        missing_limit=0.95,
+        return_rm_reason=False,  # makes output a dictionary referencing 2 dfs
+        var_kp=[
+            "f_R06",
+            "f_R05",
+            "f_R50",
+            "f_R53",
+            "f_M79",
+            "f_R09",
+            "f_R51",
+            "f_J44",
+            "f_E11",
+            "f_I25",
+            "f_I10",
+        ],
+        var_rm=["f_BMI-unknown", "f_Unknown"],
+    )
+
     # breaking dt into train and test
-    train, test = sc.split_df(encoded_f, 'class').values()
-    
+    train, test = sc.split_df(encoded_f, "class").values()
+
     # woe binning ------
     bins = sc.woebin(encoded_f, y="class")
-    
+
     # converting train and test into woe values
     train_woe = sc.woebin_ply(train, bins)
     test_woe = sc.woebin_ply(test, bins)
-    
+
     # get xs and ys
-    y_train = train_woe.loc[:,'class']
-    X_train = train_woe.loc[:,train_woe.columns != 'class']
-    y_test = test_woe.loc[:,'class']
-    X_test = test_woe.loc[:,train_woe.columns != 'class']
-    
+    y_train = train_woe.loc[:, "class"]
+    X_train = train_woe.loc[:, train_woe.columns != "class"]
+    y_test = test_woe.loc[:, "class"]
+    X_test = test_woe.loc[:, train_woe.columns != "class"]
+
     # Lasso-based regression
-    # Determine a lambda for Lasso (l1) regularization using 
+    # Determine a lambda for Lasso (l1) regularization using
     # 10-fold cross validation, get predictions from best model, score, and make scorecard
-    
+
     # logistic regression ------
     # lasso
     from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
-    lasso_cv = LogisticRegressionCV(penalty='l1'
-                              , Cs = 100
-                              , solver='saga'
-                              , cv = StratifiedKFold(10)
-                              , n_jobs=-1
-                              , max_iter = 10000
-                              , scoring = 'neg_log_loss'
-                              , class_weight = 'balanced'
-                             )
+
+    lasso_cv = LogisticRegressionCV(
+        penalty="l1",
+        Cs=100,
+        solver="saga",
+        cv=StratifiedKFold(10),
+        n_jobs=-1,
+        max_iter=10000,
+        scoring="neg_log_loss",
+        class_weight="balanced",
+    )
     lasso_cv.fit(X_train, y_train)
-    
+
     # plot training ROC
-    sklearn.metrics.plot_roc_curve(lasso_cv, X_train, y_train)  
-    pyplot.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
-    pyplot.title('LASSO Training ROC')
+    sklearn.metrics.plot_roc_curve(lasso_cv, X_train, y_train)
+    pyplot.plot([0, 1], [0, 1], color="black", lw=2, linestyle="--")
+    pyplot.title("LASSO Training ROC")
     axes = pyplot.gca()
     axes.set_facecolor("white")
     axes.set_clip_on(False)
-    pyplot.savefig('results/training_roc.png')
-    
+    pyplot.savefig("results/training_roc.png")
+
     # plot testing ROC
-    sklearn.metrics.plot_roc_curve(lasso_cv, X_test, y_test)  
-    pyplot.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
-    pyplot.title('LASSO Testing ROC')
+    sklearn.metrics.plot_roc_curve(lasso_cv, X_test, y_test)
+    pyplot.plot([0, 1], [0, 1], color="black", lw=2, linestyle="--")
+    pyplot.title("LASSO Testing ROC")
     axes = pyplot.gca()
     axes.set_facecolor("white")
     axes.set_clip_on(False)
-    pyplot.savefig('results/testing_roc.png')
-    
+    pyplot.savefig("results/testing_roc.png")
+
     # predicted proability
-    train_pred = lasso_cv.predict_proba(X_train)[:,1]
+    train_pred = lasso_cv.predict_proba(X_train)[:, 1]
     train_pred_class = lasso_cv.predict(X_train)
-    test_pred = lasso_cv.predict_proba(X_test)[:,1]
+    test_pred = lasso_cv.predict_proba(X_test)[:, 1]
     test_pred_class = lasso_cv.predict(X_test)
-    
+
     # Make scorecard
     card = sc.scorecard(bins, lasso_cv, X_train.columns)
     # credit score
     train_score = sc.scorecard_ply(train, card, print_step=0)
     test_score = sc.scorecard_ply(test, card, print_step=0)
-    
-    
+
     # psi
     pyplot.rcParams["font.size"] = "18"
     fig = sc.perf_psi(
-      score = {'train':train_score, 'test':test_score},
-      label = {'train':y_train, 'test':y_test},
-      x_tick_break=50
+        score={"train": train_score, "test": test_score},
+        label={"train": y_train, "test": y_test},
+        x_tick_break=50,
     )
-    fig['pic']['score'].set_size_inches(18.5, 10.5)
-    
-    fig['pic']['score'].savefig('results/dist.png')
-    
+    fig["pic"]["score"].set_size_inches(18.5, 10.5)
+
+    fig["pic"]["score"].savefig("results/dist.png")
+
     card_df = pd.concat(card)
-    card_df.to_csv('results/lasso_card_df.csv')
-    
-    scores_lasso_2week = sc.scorecard_ply(encoded, card, only_total_score=True, print_step=0, replace_blank_na=True)
-    scores_lasso_2week.to_csv('results/scores_lasso.csv')
-    
+    card_df.to_csv("results/lasso_card_df.csv")
+
+    scores_lasso_2week = sc.scorecard_ply(
+        encoded, card, only_total_score=True, print_step=0, replace_blank_na=True
+    )
+    scores_lasso_2week.to_csv("results/scores_lasso.csv")
+
     # Training Metrics and AUC CI
     print("Training Metrics")
     # calculate accuracy
     acc = sklearn.metrics.accuracy_score(y_train, train_pred_class)
-    print('Accuracy: %.3f' % acc)
+    print("Accuracy: %.3f" % acc)
     auc_score = sklearn.metrics.roc_auc_score(y_train, train_pred)
-    print('AUC: %.3f' % auc_score)
+    print("AUC: %.3f" % auc_score)
     f_score = sklearn.metrics.f1_score(y_train, train_pred_class)
-    print('FS: %.3f' % f_score)
-    
+    print("FS: %.3f" % f_score)
+
     # delong ci
     delong_alpha = 0.95
-    auc, auc_cov = delong_roc_variance(
-    np.ravel(y_train),
-    np.ravel(train_pred))
-    
+    auc, auc_cov = delong_roc_variance(np.ravel(y_train), np.ravel(train_pred))
+
     auc_std = np.sqrt(auc_cov)
     lower_upper_q = np.abs(np.array([0, 1]) - (1 - delong_alpha) / 2)
-    
-    ci = stats.norm.ppf(
-        lower_upper_q,
-        loc=auc_score,
-        scale=auc_std)
-    
+
+    ci = stats.norm.ppf(lower_upper_q, loc=auc_score, scale=auc_std)
+
     ci[ci > 1] = 1
-    
-    print('AUC COV:', round(auc_cov,2))
-    print('95% AUC CI:', np.round(ci,2))
-    
+
+    print("AUC COV:", round(auc_cov, 2))
+    print("95% AUC CI:", np.round(ci, 2))
+
     # Testing Metrics and AUC CI
     print("Testing Metrics")
     # calculate accuracy
     acc = sklearn.metrics.accuracy_score(y_test, test_pred_class)
-    print('Accuracy: %.3f' % acc)
+    print("Accuracy: %.3f" % acc)
     auc_score = sklearn.metrics.roc_auc_score(y_test, test_pred)
-    print('AUC: %.3f' % auc_score)
+    print("AUC: %.3f" % auc_score)
     f_score = sklearn.metrics.f1_score(y_test, test_pred_class)
-    print('FS: %.3f' % f_score)
-    
+    print("FS: %.3f" % f_score)
+
     # delong ci
     delong_alpha = 0.95
-    auc, auc_cov = delong_roc_variance(
-    np.ravel(y_test),
-    np.ravel(test_pred))
-    
+    auc, auc_cov = delong_roc_variance(np.ravel(y_test), np.ravel(test_pred))
+
     auc_std = np.sqrt(auc_cov)
     lower_upper_q = np.abs(np.array([0, 1]) - (1 - delong_alpha) / 2)
-    
-    ci = stats.norm.ppf(
-        lower_upper_q,
-        loc=auc_score,
-        scale=auc_std)
-    
+
+    ci = stats.norm.ppf(lower_upper_q, loc=auc_score, scale=auc_std)
+
     ci[ci > 1] = 1
-    
-    print('AUC COV:', round(auc_cov,2))
-    print('95% AUC CI:', np.round(ci,2))
\ No newline at end of file
+
+    print("AUC COV:", round(auc_cov, 2))
+    print("95% AUC CI:", np.round(ci, 2))
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/filter_dataset.py b/src/filter_dataset.py
index 9caaed38d3b42089b770e29b0e30d3222849b034..47887bdf97dcc2c403b250e9cd765e8e9fed4b20 100644
--- a/src/filter_dataset.py
+++ b/src/filter_dataset.py
@@ -1,12 +1,13 @@
-import pandas as pd
-pd.set_option('display.max_rows', None)
 import re
-import yaml
-import numpy as np
 import argparse
 import os
+import yaml
+import numpy as np
+import pandas as pd
+
+pd.set_option("display.max_rows", None)
 
-#Load config file
+# Load config file
 def get_col_configs(config_f):
     with open(config_f) as fh:
         config_dict = yaml.safe_load(fh)
@@ -14,296 +15,377 @@ def get_col_configs(config_f):
     # print(config_dict)
     return config_dict
 
-#Extracts necessary columns from each table accroding to the config file
-def extract_col(config_dict,df,file):
-    #print('Extracting columns according to config file !....')
+
+# Extracts necessary columns from each table according to the config file
+def extract_col(config_dict, df, file):
+    # print('Extracting columns according to config file !....')
     df = df[config_dict[file]]
     return df
 
-#Parse column from measurements table to extract info
+
+# Parse column from measurements table to extract info
 def parse_values(x):
-    if 'Never' in x  and 'Tobacco' in x:
-       return 'never_smoker'
-    elif 'Former' in x  and 'Tobacco' in x:
-       return 'former_smoker'
-    elif ('Current' in x or 'current' in x or 'Light' in x)  and 'Tobacco' in x:
-       return 'current_smoker'
-    elif 'Unknown' in x  and 'Tobacco' in x:
-       return 'unknown_smoker'
-    elif 'Former' in x  and 'Alcohol' in x:
-       return 'former_alcohol'
-    elif 'Current' in x  and 'Alcohol' in x:
-       return 'current_alcohol'
-    elif 'Within' in x  and 'Alcohol' in x:
-       return 'current_alcohol'
-    elif 'No' in x  and 'Alcohol' in x:
-       return 'No_alcohol'
-    elif 'Unknown' in x  and 'Alcohol' in x:
-       return 'unknown_alcohol'
-    elif 'Unknown' in x  and 'Substance' in x:
-       return 'unknown_Substance_use'
-    elif 'Former' in x  and 'Substance' in x:
-       return 'former_Substance_use'
-    elif 'Current' in x  and 'Substance' in x:
-       return 'current_Substance_use'
-    elif 'Past' in x  and 'Substance' in x:
-       return 'past_Substance_use'
-    elif 'Within' in x  and 'Substance' in x:
-       return 'current_Substance_use'
-    elif 'No' in x  and 'Substance' in x:
-       return 'No_Substance_use'
-    elif 'BMI' in x:
-        if '30+' in x:
-            return '30.0-34.9'
-        elif '50.0-59.9' in x:
-            return '50-59.9'
-        elif '40.0-44.9' in x:
-            return '40-44.9'
-        elif '19' in x:
-            return '19.9 or less'
-        elif '35-39.9' in x:
-            return '30-39.9'
-        elif 'Body mass index (BMI)' in x:
-            return x.split(') ')[1].split(',')[0]
-        elif '(BMI ' in x:
-            return x.split('BMI ')[1].split(')')[0]
-        elif 'BMI ' in x:
-            return x.split('BMI ')[1].split(',')[0]
+    if "Never" in x and "Tobacco" in x:
+        return "never_smoker"
+    elif "Former" in x and "Tobacco" in x:
+        return "former_smoker"
+    elif ("Current" in x or "current" in x or "Light" in x) and "Tobacco" in x:
+        return "current_smoker"
+    elif "Unknown" in x and "Tobacco" in x:
+        return "unknown_smoker"
+    elif "Former" in x and "Alcohol" in x:
+        return "former_alcohol"
+    elif "Current" in x and "Alcohol" in x:
+        return "current_alcohol"
+    elif "Within" in x and "Alcohol" in x:
+        return "current_alcohol"
+    elif "No" in x and "Alcohol" in x:
+        return "No_alcohol"
+    elif "Unknown" in x and "Alcohol" in x:
+        return "unknown_alcohol"
+    elif "Unknown" in x and "Substance" in x:
+        return "unknown_Substance_use"
+    elif "Former" in x and "Substance" in x:
+        return "former_Substance_use"
+    elif "Current" in x and "Substance" in x:
+        return "current_Substance_use"
+    elif "Past" in x and "Substance" in x:
+        return "past_Substance_use"
+    elif "Within" in x and "Substance" in x:
+        return "current_Substance_use"
+    elif "No" in x and "Substance" in x:
+        return "No_Substance_use"
+    elif "BMI" in x:
+        if "30+" in x:
+            return "30.0-34.9"
+        elif "50.0-59.9" in x:
+            return "50-59.9"
+        elif "40.0-44.9" in x:
+            return "40-44.9"
+        elif "19" in x:
+            return "19.9 or less"
+        elif "35-39.9" in x:
+            return "30-39.9"
+        elif "Body mass index (BMI)" in x:
+            return x.split(") ")[1].split(",")[0]
+        elif "(BMI " in x:
+            return x.split("BMI ")[1].split(")")[0]
+        elif "BMI " in x:
+            return x.split("BMI ")[1].split(",")[0]
     else:
-       return x
+        return x
 
-#Parse column from measurements table extract info
+
+# Parse column from measurements table extract info
 def weight_bins(x):
-    if '19' in x:
-        return 'BMI-19.9_or_less'
-    elif '20' in x  or '21' in x or '22' in x or '23' in x or '24' in x:
-       return 'BMI-20.0_24.9'
-    elif '25' in x  or '26' in x or '27' in x or '28' in x or '29' in x or '30' in x  or '31' in x or '32' in x or '33' in x or '34' in x or '35' in x  or '36' in x or '37' in x or '38' in x or '39' in x:
-       return 'BMI-25.0_39.9'
-    elif '40' in x  or '45' in x or '50' in x or '60' in x or '70' in x:
-       return 'BMI-40.0_or_greater'
+    if "19" in x:
+        return "BMI-19.9_or_less"
+    elif "20" in x or "21" in x or "22" in x or "23" in x or "24" in x:
+        return "BMI-20.0_24.9"
+    elif (
+        "25" in x
+        or "26" in x
+        or "27" in x
+        or "28" in x
+        or "29" in x
+        or "30" in x
+        or "31" in x
+        or "32" in x
+        or "33" in x
+        or "34" in x
+        or "35" in x
+        or "36" in x
+        or "37" in x
+        or "38" in x
+        or "39" in x
+    ):
+        return "BMI-25.0_39.9"
+    elif "40" in x or "45" in x or "50" in x or "60" in x or "70" in x:
+        return "BMI-40.0_or_greater"
     else:
-       return x
+        return x
+
 
 def icd(x):
-    #print(x)
-    if '.' in x:
-       return x.split('.')[0]
+    # print(x)
+    if "." in x:
+        return x.split(".")[0]
     else:
         return x
 
-def encode(pdf,pdf1,pdf2,ndf,ndf1,ndf2,nulls):
-    print('Merging cohorts and one-hot encoding...')
-    df = pd.concat([pdf,ndf], ignore_index=True)
-    df1 = pd.concat([pdf1,ndf1], ignore_index=True)
-    df2 = pd.concat([pdf2,ndf2], ignore_index=True)
-    df = pd.get_dummies(df,prefix='f').groupby('PERSON_ID').sum()
-    df1 = pd.get_dummies(df1,prefix='f').groupby('PERSON_ID').sum()
-    df2 = pd.get_dummies(df2,prefix='f').groupby('PERSON_ID').sum()
-    df = df.merge(df1, on='PERSON_ID', how='outer').merge(df2, on='PERSON_ID', how='outer')
+
+def encode(pdf, pdf1, pdf2, ndf, ndf1, ndf2, nulls):
+    print("Merging cohorts and one-hot encoding...")
+    df = pd.concat([pdf, ndf], ignore_index=True)
+    df1 = pd.concat([pdf1, ndf1], ignore_index=True)
+    df2 = pd.concat([pdf2, ndf2], ignore_index=True)
+    df = pd.get_dummies(df, prefix="f").groupby("PERSON_ID").sum()
+    df1 = pd.get_dummies(df1, prefix="f").groupby("PERSON_ID").sum()
+    df2 = pd.get_dummies(df2, prefix="f").groupby("PERSON_ID").sum()
+    df = df.merge(df1, on="PERSON_ID", how="outer").merge(df2, on="PERSON_ID", how="outer")
     del df1, df2
     df.isnull().sum(axis=0).to_csv(nulls)
-    cols = ['f_unknown_smoker', 'f_unknown_alcohol', 'f_unknown_Substance_use']
-    df[cols]=df[cols].fillna(1)
-    df['f_BMI-unknown'] = df['f_BMI-25.0_39.9'].apply(lambda x: 1 if pd.isna(x) else 0)
+    cols = ["f_unknown_smoker", "f_unknown_alcohol", "f_unknown_Substance_use"]
+    df[cols] = df[cols].fillna(1)
+    df["f_BMI-unknown"] = df["f_BMI-25.0_39.9"].apply(lambda x: 1 if pd.isna(x) else 0)
     df = df.fillna(0)
     df.reset_index(inplace=True)
-    #df['class'] = df.PERSON_ID.apply(classes)
+    # df['class'] = df.PERSON_ID.apply(classes)
+    return df
+
+
+def classes(df, pid, nid):  # add a "class" column: "positive" if PERSON_ID is in pid, "negative" if in nid
+    pos = df[df.PERSON_ID.isin(pid)].copy()  # copy, so the new column is written to an independent frame
+    pos.loc[:, "class"] = "positive"
+    neg = df[df.PERSON_ID.isin(nid)].copy()
+    neg.loc[:, "class"] = "negative"
+    df = pd.concat([pos, neg], sort=False)  # positives first, then negatives; rows in neither list are dropped
+    del pos, neg
+    df.reset_index(drop=True, inplace=True)  # discard the old index and renumber the combined rows
+    return df
 
-def classes(df,pid,nid):
-   pos = df[df.PERSON_ID.isin(pid)].copy()
-   pos.loc[:,'class'] = 'positive'
-   neg = df[df.PERSON_ID.isin(nid)].copy()
-   neg.loc[:,'class'] = 'negative'
-   df = pd.concat([pos,neg], sort=False)
-   del pos,neg
-   df.reset_index(drop=True, inplace=True)
-   return df
-
-
-def statistics(df2,stats):
-   current_alcohol = df2[df2.OBSERVATION=='current_alcohol'].shape[0]
-   former_alcohol = df2[df2.OBSERVATION=='former_alcohol'].shape[0]
-   No_alcohol = df2[df2.OBSERVATION=='No_alcohol'].shape[0]
-   unknown_alcohol = df2[df2.OBSERVATION=='unknown_alcohol'].shape[0]
-   current_smoker = df2[df2.OBSERVATION=='current_smoker'].shape[0]
-   former_smoker = df2[df2.OBSERVATION=='former_smoker'].shape[0]
-   never_smoker = df2[df2.OBSERVATION=='never_smoker'].shape[0]
-   unknown_smoker = df2[df2.OBSERVATION=='unknown_smoker'].shape[0]
-   current_Substance_use = df2[df2.OBSERVATION=='current_Substance_use'].shape[0]
-   unknown_Substance_use = df2[df2.OBSERVATION=='unknown_Substance_use'].shape[0]
-   No_Substance_use = df2[df2.OBSERVATION=='No_Substance_use'].shape[0]
-   former_Substance_use = df2[df2.OBSERVATION=='former_Substance_use'].shape[0]
-   lean = df2[df2.OBSERVATION=='BMI-19.9_or_less'].shape[0]
-   normal = df2[df2.OBSERVATION=='BMI-20.0_24.9'].shape[0]
-   overweight = df2[df2.OBSERVATION=='BMI-25.0_39.9'].shape[0]
-   obese = df2[df2.OBSERVATION=='BMI-40.0_or_greater'].shape[0]
-   print(f'current_alcohol: {current_alcohol}\nformer_alcohol: {former_alcohol}\nNo_alcohol: {No_alcohol}\nunknown_alcohol: {unknown_alcohol}', file=open(stats, "a"))
-   print(f'current_smoker: {current_smoker}\nformer_smoker: {former_smoker}\nnever_smoker: {never_smoker}\nunknown_smoker: {unknown_smoker}', file=open(stats, "a"))
-   print(f'current_Substance_use: {current_Substance_use}\nunknown_Substance_use: {unknown_Substance_use}\nNo_Substance_use: {No_Substance_use}\nformer_Substance_use: {former_Substance_use}', file=open(stats, "a"))
-   print(f'lean(BMI-19.9_or_less): {lean}\nnormal(BMI-20.0_24.9): {normal}\noverweight(BMI-25.0_39.9): {overweight}\nobese(BMI-40.0_or_greater): {obese}', file=open(stats, "a"))
-   return None
-
-def statistics1(df1,patients,stats):
-   print(f'Total patients: {patients}\nMean age: {df1.AGE.mean()} {df1.AGE.min(),df1.AGE.max()}', file=open(stats, "a"))
-   male = df1[df1.GENDER_SOURCE_VALUE=='M'].shape[0]
-   female = df1[df1.GENDER_SOURCE_VALUE=='F'].shape[0]
-   unknown = df1[df1.GENDER_SOURCE_VALUE=='U'].shape[0]
-   white = df1[df1.RACE_SOURCE_VALUE=='White'].shape[0]
-   black = df1[df1.RACE_SOURCE_VALUE=='Black or African American'].shape[0]
-   asian = df1[df1.RACE_SOURCE_VALUE=='Asian'].shape[0]
-   latino = df1[df1.RACE_SOURCE_VALUE=='Hispanic or Latino'].shape[0]
-   other = patients - white - black - asian - latino
-   print(f'Male: {male}\nFemale: {female}\nUnknown: {unknown}', file=open(stats, "a"))
-   print(f'White: {white}\nBlack: {black}\nAsian: {asian}\nHispanic or Latino: {latino}', file=open(stats, "a"))
-   return None
-
-
-def main(config_f, cohort, omop,stats,fil):
-   config_dict = get_col_configs(config_f)
-   #Read and parse PERSON table
-   df1 = pd.read_csv(cohort + 'person.csv', sep="|", header = 0)
-   df1 = extract_col(config_dict,df1,'person')
-   pid = df1.PERSON_ID.unique().tolist()
-   df1['AGE'] = 2020-df1['YEAR_OF_BIRTH']
-   bins = [0, 5, 18, 30, 40, 50, 65, 75, 85, np.inf]
-   df1['AGE_BINS'] = pd.cut(df1['AGE'], bins)
-   df1['AGE_BINS'] = 'AGE_' + df1['AGE_BINS'].astype(str)
-   df1['ETHNICITY_SOURCE_VALUE'] = df1['ETHNICITY_SOURCE_VALUE'].fillna('Unknown')
-   df1['ETHNICITY_SOURCE_VALUE'] = np.where((df1.ETHNICITY_SOURCE_VALUE == 'Not reported'), 'Unknown', df1.ETHNICITY_SOURCE_VALUE)
-   statistics1(df1,len(pid),stats)
-      
-   #Read and parse Condition table 
-   df = pd.read_csv(cohort + 'condition_occurrence.csv', sep="|", header = 0)
-   df = extract_col(config_dict,df,'condition')
-   print(f'Total conditions: {df.shape[0]}\nUnique conditions: {df.CONDITION_SOURCE_CONCEPT_ID.nunique()}\nAvg conditions per person: {df.shape[0]/len(pid)}\nAvg unique conditions per patient: {df.CONDITION_SOURCE_CONCEPT_ID.nunique()/len(pid)}', file=open(stats, "a"))
-   df=df[df.PERSON_ID.isin(pid)]
-   df['CONDITION_START_DATE'] = pd.to_datetime(df['CONDITION_START_DATE'])
-   df = pd.merge(df,omop, left_on='CONDITION_SOURCE_CONCEPT_ID', right_on='CONCEPT_ID', how='left')
-   df = df.dropna(axis=0, subset=['CODE'])#, thresh=2
-   df = df.drop(['CONDITION_SOURCE_VALUE','CONDITION_SOURCE_CONCEPT_ID','CONCEPT_ID'], axis=1)
-   
-   #Read and parse MEASUREMENT table
-   df2 = pd.read_csv(cohort + 'measurement.csv', sep="|", header = 0)
-   df2 = extract_col(config_dict,df2,'measurement')
-   print('Total number of tests performed: ', df2.shape[0], file=open(stats, "a"))
-   
-   #Extract dates from measurements table to get 1st diagnosed (for positive cohort) or 1st covid test performed
-   df2 = df2[df2.MEASUREMENT_SOURCE_VALUE.str.contains('cov',regex=True,na=False, flags = re.IGNORECASE)].sort_values(by=['PERSON_ID','MEASUREMENT_DATE'])
-   print(f'Total number of COVID tests performed: {df2.shape[0]}\nTests per patient: {df2.shape[0]/len(pid)}', file=open(stats, "a"))
-   df2 = df2[df2.PERSON_ID.duplicated()==False]
-   print('Patients with at least one Covid test: ',df2.shape[0], file=open(stats, "a"))
-   df2=df2[df2.PERSON_ID.isin(pid)]
-   df1 = pd.merge(df1,df2, on='PERSON_ID', how='outer')
-   df1 = df1.sort_values(by='PERSON_ID')
-   #This info is only needed for positive cohort
-   if 'Positive' in cohort:
-      codes = ['U07.1', 'U07.2', 'B97.29', 'Z86.16', 'J12.82', 'B94.8', 'B34.2']
-      df2 = df[df['CODE'].isin(codes)]
-      df2 = df2.sort_values(by=['PERSON_ID','CONDITION_START_DATE'], ascending=True)
-      df2 = df2[df2.PERSON_ID.duplicated()==False]
-      df1 = pd.merge(df1,df2, on='PERSON_ID', how='outer')
-      df1['MEASUREMENT_DATE'] = pd.to_datetime(df1['MEASUREMENT_DATE'])
-      df1['DIAGNOSIS'] = df1[['CONDITION_START_DATE','MEASUREMENT_DATE']].min(axis=1)
-      df1 = df1.drop(['YEAR_OF_BIRTH','MEASUREMENT_DATE', 'MEASUREMENT_SOURCE_VALUE','VALUE_SOURCE_VALUE', 'CONDITION_START_DATE', 'CODE'], axis=1)
-   else:
-      df1['DIAGNOSIS'] = pd.to_datetime(df1['MEASUREMENT_DATE'])
-      df1 = df1.drop(['YEAR_OF_BIRTH','MEASUREMENT_DATE', 'MEASUREMENT_SOURCE_VALUE','VALUE_SOURCE_VALUE'], axis=1)
-   
-   df = pd.merge(df,df1[['PERSON_ID','DIAGNOSIS']], on='PERSON_ID', how='outer')
-   df = df.dropna(axis=0, subset=['DIAGNOSIS'])
-   #df.drop(df[df['DIAGNOSIS'] < df['CONDITION_START_DATE']].index, inplace=True)
-   df = df.drop(df[(df['DIAGNOSIS'] < df['CONDITION_START_DATE']) | ((df['DIAGNOSIS']- pd.to_timedelta(fil, unit='W')) > df['CONDITION_START_DATE'])].index)
-   
-   #Get patients that only have conditions
-   df = df[['PERSON_ID','CODE']]
-   df['CODE'] = df['CODE'].apply(icd)
-   df.drop_duplicates(inplace=True)
-   pid = df.PERSON_ID.unique().tolist()
-   df1 = df1[['PERSON_ID','GENDER_SOURCE_VALUE','RACE_SOURCE_VALUE','ETHNICITY_SOURCE_VALUE','AGE_BINS']]
-   df1=df1[df1.PERSON_ID.isin(pid)]
-
-   #Read and parse OBSERVATION table
-   df2 = pd.read_csv(cohort + 'observation.csv', sep="|", header = 0)
-   df2 = extract_col(config_dict,df2,'observation')
-   options = ['SHX Alcohol use','SHX Substance abuse use','SHX Tobacco use']
-   df2 = df2[df2['OBSERVATION_SOURCE_VALUE'].isin(options) | df2['VALUE_AS_STRING'].str.contains('BMI',regex=True,na=False, flags = re.IGNORECASE) ].sort_values(by=['PERSON_ID','OBSERVATION_DATE','QUALIFIER_SOURCE_VALUE'])
-   df2 = df2.groupby(['PERSON_ID', 'QUALIFIER_SOURCE_VALUE'], as_index=False).last()
-   df2['dummy'] = df2[['VALUE_AS_STRING', 'OBSERVATION_SOURCE_VALUE']].agg('-'.join, axis=1)
-   df2['OBSERVATION'] = df2['dummy'].apply(parse_values)
-   df2['OBSERVATION'] = df2['OBSERVATION'].apply(weight_bins)
-   df2 = df2[['PERSON_ID','OBSERVATION']]
-   statistics(df2,stats)
-   df2=df2[df2.PERSON_ID.isin(pid)]
-   print('Patients after filtering: ', len(pid), file=open(stats, "a"))
-   print(f'Total conditions after filtering: ',df.shape[0], file=open(stats, "a"))
-   return df,df1,df2
+
+def statistics(df2, stats):
+    """Append habit and BMI category counts from ``df2.OBSERVATION`` to the stats file.
+
+    Args:
+        df2: DataFrame whose ``OBSERVATION`` column holds the category labels counted here.
+        stats: Path of the file to append to; one ``label: count`` line is written per category.
+    """
+    habits = [
+        "current_alcohol",
+        "former_alcohol",
+        "No_alcohol",
+        "unknown_alcohol",
+        "current_smoker",
+        "former_smoker",
+        "never_smoker",
+        "unknown_smoker",
+        "current_Substance_use",
+        "unknown_Substance_use",
+        "No_Substance_use",
+        "former_Substance_use",
+    ]
+    # BMI bins are reported under a descriptive name followed by the bin label itself.
+    bmi_names = {
+        "BMI-19.9_or_less": "lean",
+        "BMI-20.0_24.9": "normal",
+        "BMI-25.0_39.9": "overweight",
+        "BMI-40.0_or_greater": "obese",
+    }
+    lines = [f"{value}: {(df2.OBSERVATION == value).sum()}" for value in habits]
+    lines += [f"{n}({b}): {(df2.OBSERVATION == b).sum()}" for b, n in bmi_names.items()]
+    # Open the file once in a context manager so the handle is closed instead of leaked.
+    with open(stats, "a") as handle:
+        handle.write("\n".join(lines) + "\n")
+    return None
+
+
+def statistics1(df1, patients, stats):
+    """Append the patient total, age summary and gender/race counts of ``df1`` to ``stats``.
+
+    ``patients`` is the cohort size; "Other" is what remains after the four named races.
+    """
+    sex, race = df1.GENDER_SOURCE_VALUE, df1.RACE_SOURCE_VALUE
+    white, black = (race == "White").sum(), (race == "Black or African American").sum()
+    asian, latino = (race == "Asian").sum(), (race == "Hispanic or Latino").sum()
+    lines = [
+        f"Total patients: {patients}",
+        # Format min and max one by one: a tuple would print each element's repr().
+        f"Mean age: {df1.AGE.mean()} ({df1.AGE.min()}, {df1.AGE.max()})",
+        f"Male: {(sex == 'M').sum()}\nFemale: {(sex == 'F').sum()}",
+        f"Unknown: {(sex == 'U').sum()}",
+        f"White: {white}\nBlack: {black}\nAsian: {asian}\nHispanic or Latino: {latino}",
+        f"Other: {patients - white - black - asian - latino}",
+    ]
+    with open(stats, "a") as handle:  # context manager closes the handle (was leaked before)
+        handle.write("\n".join(lines) + "\n")
+
+
+def main(config_f, cohort, omop, stats, fil):
+    """Parse one cohort's tables into condition, demographic and observation frames.
+
+    Args:
+        config_f: Path of the column config file passed to ``get_col_configs``.
+        cohort: Path prefix of the cohort files; table file names are appended to it as-is.
+            If it contains "Positive", diagnosis codes are also used to date the diagnosis.
+        omop: Concept table that is left-joined to the conditions on ``CONCEPT_ID``.
+        stats: Path of the stats file; summary lines are appended to it while parsing.
+        fil: Number of weeks before the diagnosis date within which conditions are kept.
+
+    Returns:
+        Tuple ``(df, df1, df2)``: de-duplicated PERSON_ID/CODE pairs, the demographic columns
+        of the patients left in ``df``, and those patients' PERSON_ID/OBSERVATION rows.
+    """
+
+    def log(*parts):
+        # Reopen and close per record: no handle is leaked and order matches statistics*().
+        with open(stats, "a") as handle:
+            print(*parts, file=handle)
+
+    config_dict = get_col_configs(config_f)
+    # Read and parse PERSON table
+    df1 = pd.read_csv(cohort + "person.csv", sep="|", header=0)
+    df1 = extract_col(config_dict, df1, "person")
+    pid = df1.PERSON_ID.unique().tolist()
+    df1["AGE"] = 2020 - df1["YEAR_OF_BIRTH"]
+    bins = [0, 5, 18, 30, 40, 50, 65, 75, 85, np.inf]
+    df1["AGE_BINS"] = pd.cut(df1["AGE"], bins)
+    df1["AGE_BINS"] = "AGE_" + df1["AGE_BINS"].astype(str)
+    df1["ETHNICITY_SOURCE_VALUE"] = df1["ETHNICITY_SOURCE_VALUE"].fillna("Unknown")
+    df1["ETHNICITY_SOURCE_VALUE"] = np.where(
+        (df1.ETHNICITY_SOURCE_VALUE == "Not reported"), "Unknown", df1.ETHNICITY_SOURCE_VALUE,
+    )
+    statistics1(df1, len(pid), stats)
+
+    # Read and parse Condition table
+    df = pd.read_csv(cohort + "condition_occurrence.csv", sep="|", header=0)
+    df = extract_col(config_dict, df, "condition")
+    log(
+        f"Total conditions: {df.shape[0]}\nUnique conditions: {df.CONDITION_SOURCE_CONCEPT_ID.nunique()}\nAvg conditions per person: {df.shape[0]/len(pid)}\nAvg unique conditions per patient: {df.CONDITION_SOURCE_CONCEPT_ID.nunique()/len(pid)}"
+    )
+    df = df[df.PERSON_ID.isin(pid)]
+    df["CONDITION_START_DATE"] = pd.to_datetime(df["CONDITION_START_DATE"])
+    df = pd.merge(
+        df, omop, left_on="CONDITION_SOURCE_CONCEPT_ID", right_on="CONCEPT_ID", how="left",
+    )
+    df = df.dropna(axis=0, subset=["CODE"])
+    df = df.drop(["CONDITION_SOURCE_VALUE", "CONDITION_SOURCE_CONCEPT_ID", "CONCEPT_ID"], axis=1)
+
+    # Read and parse MEASUREMENT table
+    df2 = pd.read_csv(cohort + "measurement.csv", sep="|", header=0)
+    df2 = extract_col(config_dict, df2, "measurement")
+    log("Total number of tests performed: ", df2.shape[0])
+
+    # Extract dates from measurements table to get 1st diagnosed (for positive cohort) or 1st covid test performed
+    df2 = df2[
+        df2.MEASUREMENT_SOURCE_VALUE.str.contains("cov", regex=True, na=False, flags=re.IGNORECASE)
+    ].sort_values(by=["PERSON_ID", "MEASUREMENT_DATE"])
+    log(
+        f"Total number of COVID tests performed: {df2.shape[0]}\nTests per patient: {df2.shape[0]/len(pid)}"
+    )
+    # Sorted by MEASUREMENT_DATE above, so this keeps each person's first row in that order
+    df2 = df2[~df2.PERSON_ID.duplicated()]
+    log("Patients with at least one Covid test: ", df2.shape[0])
+    df2 = df2[df2.PERSON_ID.isin(pid)]
+    df1 = pd.merge(df1, df2, on="PERSON_ID", how="outer")
+    df1 = df1.sort_values(by="PERSON_ID")
+    # Columns only needed to derive DIAGNOSIS; dropped again in both branches below
+    measurement_cols = ["MEASUREMENT_DATE", "MEASUREMENT_SOURCE_VALUE", "VALUE_SOURCE_VALUE"]
+    # This info is only needed for positive cohort
+    if "Positive" in cohort:
+        codes = ["U07.1", "U07.2", "B97.29", "Z86.16", "J12.82", "B94.8", "B34.2"]
+        df2 = df[df["CODE"].isin(codes)]
+        df2 = df2.sort_values(by=["PERSON_ID", "CONDITION_START_DATE"], ascending=True)
+        df2 = df2[~df2.PERSON_ID.duplicated()]
+        df1 = pd.merge(df1, df2, on="PERSON_ID", how="outer")
+        df1["MEASUREMENT_DATE"] = pd.to_datetime(df1["MEASUREMENT_DATE"])
+        df1["DIAGNOSIS"] = df1[["CONDITION_START_DATE", "MEASUREMENT_DATE"]].min(axis=1)
+        df1 = df1.drop(
+            ["YEAR_OF_BIRTH", *measurement_cols, "CONDITION_START_DATE", "CODE"], axis=1
+        )
+    else:
+        df1["DIAGNOSIS"] = pd.to_datetime(df1["MEASUREMENT_DATE"])
+        df1 = df1.drop(["YEAR_OF_BIRTH", *measurement_cols], axis=1)
+
+    df = pd.merge(df, df1[["PERSON_ID", "DIAGNOSIS"]], on="PERSON_ID", how="outer")
+    df = df.dropna(axis=0, subset=["DIAGNOSIS"])
+    # Drop conditions that start after the diagnosis or more than `fil` weeks before it
+    df = df.drop(
+        df[
+            (df["DIAGNOSIS"] < df["CONDITION_START_DATE"])
+            | ((df["DIAGNOSIS"] - pd.to_timedelta(fil, unit="W")) > df["CONDITION_START_DATE"])
+        ].index
+    )
+
+    # Get patients that only have conditions
+    df = df[["PERSON_ID", "CODE"]]
+    df["CODE"] = df["CODE"].apply(icd)
+    df.drop_duplicates(inplace=True)
+    pid = df.PERSON_ID.unique().tolist()
+    demographic_cols = ["PERSON_ID", "GENDER_SOURCE_VALUE", "RACE_SOURCE_VALUE"]
+    demographic_cols += ["ETHNICITY_SOURCE_VALUE", "AGE_BINS"]
+    df1 = df1[demographic_cols]
+    df1 = df1[df1.PERSON_ID.isin(pid)]
+
+    # Read and parse OBSERVATION table
+    df2 = pd.read_csv(cohort + "observation.csv", sep="|", header=0)
+    df2 = extract_col(config_dict, df2, "observation")
+    options = ["SHX Alcohol use", "SHX Substance abuse use", "SHX Tobacco use"]
+    df2 = df2[
+        df2["OBSERVATION_SOURCE_VALUE"].isin(options)
+        | df2["VALUE_AS_STRING"].str.contains("BMI", regex=True, na=False, flags=re.IGNORECASE)
+    ].sort_values(by=["PERSON_ID", "OBSERVATION_DATE", "QUALIFIER_SOURCE_VALUE"])
+    # Rows are sorted by date, so last() keeps the latest values per person and qualifier
+    df2 = df2.groupby(["PERSON_ID", "QUALIFIER_SOURCE_VALUE"], as_index=False).last()
+    df2["dummy"] = df2[["VALUE_AS_STRING", "OBSERVATION_SOURCE_VALUE"]].agg("-".join, axis=1)
+    df2["OBSERVATION"] = df2["dummy"].apply(parse_values)
+    df2["OBSERVATION"] = df2["OBSERVATION"].apply(weight_bins)
+    df2 = df2[["PERSON_ID", "OBSERVATION"]]
+    statistics(df2, stats)
+    df2 = df2[df2.PERSON_ID.isin(pid)]
+    log("Patients after filtering: ", len(pid))
+    log("Total conditions after filtering: ", df.shape[0])
+    return df, df1, df2
+
 
 if __name__ == "__main__":
-   parser = argparse.ArgumentParser()
-   parser.add_argument(
-       "--pos", 
-       type=str,
-       required=True,
-       help="Path to directory with positive cohort")
-   parser.add_argument(
-       "--neg", 
-       type=str,
-       required=True,
-       help="Path to directory with negative cohort")
-   parser.add_argument(
-       "--config", 
-       type=str,
-       default='./configs/columns.yaml',
-       help="Path to config file")
-   parser.add_argument(
-       "--outdir",
-       type=str,
-       default='./results/',
-       help="Path to directory to write parsed files")
-   parser.add_argument(
-       "--filter",
-       type=int,
-       default=100,
-       help="Number of weeks of data to filter before COVID-19 diagnosis. Give a large number (eg. 100) if you don't want any filter.")
-   args = parser.parse_args()
-   
-   #Path to config file
-   config_f = args.config
-
-   #Check for output directory and create if it doesn't exist
-   if not os.path.exists(args.outdir):
-            os.makedirs(args.outdir)
-
-   #Path to stats file
-   stats = args.outdir+"stats-"+str(args.filter)+"-week-filter.csv"
-   
-   #Read OMOP database
-   omop = pd.read_csv('ICD10.tsv', sep='\t')
-
-   #Output encoded file
-   output = args.outdir+'encoded-'+str(args.filter)+'-week-filter.csv' 
-
-   #Output nulls
-   nulls = args.outdir+'Nulls-'+str(args.filter)+'-week-filter.csv'
-   
-   #Extract tables from POSITIVE cohort
-   print('Parsing through POSITIVE cohort...')
-   print('Parsing through POSITIVE cohort...', file=open(stats, "w"))
-   positive = args.pos
-   pdf,pdf1,pdf2 = main(config_f,positive,omop,stats,args.filter)
-   pid = pdf.PERSON_ID.unique().tolist()
-   
-   #Extract info from NEGATIVE cohort
-   print('\nParsing through NEGATIVE cohort...')
-   print('\nParsing through NEGATIVE cohort...', file=open(stats, "a"))
-   negative = args.neg
-   ndf,ndf1,ndf2 = main(config_f,negative,omop,stats,args.filter)
-   nid = ndf.PERSON_ID.unique().tolist()
-   
-   encoded = encode(pdf,pdf1,pdf2,ndf,ndf1,ndf2,nulls)
-   del pdf,pdf1,pdf2,ndf,ndf1,ndf2, omop
-   encoded = classes(encoded,pid,nid)
-   encoded.to_csv(output, index=False)
-   del encoded,pid,nid
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--pos", type=str, required=True, help="Path to directory with positive cohort"
+    )
+    parser.add_argument(
+        "--neg", type=str, required=True, help="Path to directory with negative cohort"
+    )
+    parser.add_argument(
+        "--config", type=str, default="./configs/columns.yaml", help="Path to config file",
+    )
+    parser.add_argument(
+        "--outdir", type=str, default="./results/", help="Path to directory to write parsed files",
+    )
+    parser.add_argument(
+        "--filter",
+        type=int,
+        default=100,
+        help="Number of weeks of data to filter before COVID-19 diagnosis. Give a large number (eg. 100) if you don't want any filter.",
+    )
+    args = parser.parse_args()
+
+    # Path to config file
+    config_f = args.config
+
+    # Create the output directory if needed; exist_ok avoids the check-then-create race
+    os.makedirs(args.outdir, exist_ok=True)
+
+    # Output files are tagged with the week filter; os.path.join makes this work whether or
+    # not --outdir ends with a path separator
+    suffix = str(args.filter) + "-week-filter.csv"
+    stats = os.path.join(args.outdir, "stats-" + suffix)  # summary statistics
+    output = os.path.join(args.outdir, "encoded-" + suffix)  # encoded dataset
+    nulls = os.path.join(args.outdir, "Nulls-" + suffix)  # null counts per column
+
+    # Read OMOP database
+    omop = pd.read_csv("ICD10.tsv", sep="\t")
+
+    # Extract tables from POSITIVE cohort
+    print("Parsing through POSITIVE cohort...")
+    # "w" starts a fresh stats file; the with-block closes the handle before main() appends
+    with open(stats, "w") as handle:
+        print("Parsing through POSITIVE cohort...", file=handle)
+    pdf, pdf1, pdf2 = main(config_f, args.pos, omop, stats, args.filter)
+    # Patients that still have conditions after filtering; used to label the encoded rows
+    pid = pdf.PERSON_ID.unique().tolist()
+
+    # Extract info from NEGATIVE cohort
+    print("\nParsing through NEGATIVE cohort...")
+    with open(stats, "a") as handle:
+        print("\nParsing through NEGATIVE cohort...", file=handle)
+    ndf, ndf1, ndf2 = main(config_f, args.neg, omop, stats, args.filter)
+    nid = ndf.PERSON_ID.unique().tolist()
+
+    # Build the encoded table from both cohorts, label the rows and write it out
+    encoded = encode(pdf, pdf1, pdf2, ndf, ndf1, ndf2, nulls)
+    # Release the per-cohort frames before the encoded frame is labelled and written
+    del pdf, pdf1, pdf2, ndf, ndf1, ndf2, omop
+    encoded = classes(encoded, pid, nid)
+    encoded.to_csv(output, index=False)
diff --git a/src/streamlit/RICO.py b/src/streamlit/RICO.py
index d0b5b460ed5ef85eabffa66c195a8a32cd23c705..2456d22347c9ccea927590a128043d5ed9068e8e 100644
--- a/src/streamlit/RICO.py
+++ b/src/streamlit/RICO.py
@@ -2,247 +2,255 @@ import streamlit as st
 import plotly.graph_objects as go
 
 st.title("COVID-19 Risk Predictor")
-st.markdown("<h3 style='text-align: right;'>for research purposes only</h3>", unsafe_allow_html=True)
-''' 
+st.markdown(
+    "<h3 style='text-align: right;'>for research purposes only</h3>", unsafe_allow_html=True,
+)
+"""
 
- 
-'''
+
+"""
 pd.options.display.max_colwidth = 500
 
+
 def imc_chart(imc):
-    if (imc>=213):
-        color="red"
-        '## Alert: Please take a COVID test immediately.'
-       # '### You are >20% likely.'
-    elif (imc>=170 and imc<213):
-        color="orange"
-        '## Alert: Please consult a doctor to take COVID test'
-    elif (imc>=0 and imc<170):
+    if imc >= 213:
+        color = "red"
+        "## Alert: Please take a COVID test immediately."
+    # '### You are >20% likely.'
+    elif imc >= 170 and imc < 213:
+        color = "orange"
+        "## Alert: Please consult a doctor to take COVID test"
+    elif imc >= 0 and imc < 170:
         color = "lightgreen"
-        '## Alert: Please consult a doctor to take COVID test'
-    elif (imc<0):
-        color="green"
-        '## Alert: Please consult a doctor if you have symptoms'
-    fig = go.Figure(go.Indicator(
-        mode = "gauge+number+delta",
-        domain = {'x': [0, 1], 'y': [0, 1]},
-        value = imc,
-        title = {'text': "Patient Risk Score"},
-        delta = {'reference': 213, 'increasing': {'color': "RebeccaPurple"}},
-        gauge = {
-            'axis': {'range': [-170, 350], 'tickwidth': 1, 'tickcolor': "darkblue"},
-            'bar': {'color': color},
-            'steps' : [
-                {'range': [-170, 350], 'color': "white"}],
-            'threshold' : {'line': {'color': 'red', 'width': 8}, 
-            'thickness': 0.75, 'value': 213}}))
-
-    
+        "## Alert: Please consult a doctor to take COVID test"
+    elif imc < 0:
+        color = "green"
+        "## Alert: Please consult a doctor if you have symptoms"
+    fig = go.Figure(
+        go.Indicator(
+            mode="gauge+number+delta",
+            domain={"x": [0, 1], "y": [0, 1]},
+            value=imc,
+            title={"text": "Patient Risk Score"},
+            delta={"reference": 213, "increasing": {"color": "RebeccaPurple"}},
+            gauge={
+                "axis": {"range": [-170, 350], "tickwidth": 1, "tickcolor": "darkblue"},
+                "bar": {"color": color},
+                "steps": [{"range": [-170, 350], "color": "white"}],
+                "threshold": {
+                    "line": {"color": "red", "width": 8},
+                    "thickness": 0.75,
+                    "value": 213,
+                },
+            },
+        )
+    )
+
     return fig
 
 
-age = st.sidebar.selectbox(
-     'Please select your age:',
-     ('','<20', '20-39','40-54','>55'))
+age = st.sidebar.selectbox("Please select your age:", ("", "<20", "20-39", "40-54", ">55"))
 
-if age =='':
+if age == "":
     age = 0
-elif age =='<20':
+elif age == "<20":
     age = 29
-elif age =='20-39':
+elif age == "20-39":
     age = -15
-elif age =='40-54':
+elif age == "40-54":
     age = 25
-elif age =='>55':
+elif age == ">55":
     age = -6
 
 
 race = st.sidebar.selectbox(
-     'What was or would be your race on the 2020 census?',
-     ('','Decline to answer', 'Asian','White','Black or African American','Hispanic or Latino','Other or Multiple'))
-if race =='':
+    "What was or would be your race on the 2020 census?",
+    (
+        "",
+        "Decline to answer",
+        "Asian",
+        "White",
+        "Black or African American",
+        "Hispanic or Latino",
+        "Other or Multiple",
+    ),
+)
+if race == "":
     race = 0
-elif race =='Decline to answer':
+elif race == "Decline to answer":
     race = -34
-elif race =='Asian':
+elif race == "Asian":
     race = 74
-elif race =='White':
+elif race == "White":
     race = -27
-elif race =='Black or African American':
+elif race == "Black or African American":
     race = 26
-elif race =='Hispanic or Latino':
+elif race == "Hispanic or Latino":
     race = 18
-elif race =='Other or Multiple':
+elif race == "Other or Multiple":
     race = 35
 
 
-
-
-cough = st.sidebar.selectbox(
-     'Do you have a cough?',
-     ('','Yes', 'No'))
-if cough == '':
+cough = st.sidebar.selectbox("Do you have a cough?", ("", "Yes", "No"))
+if cough == "":
     cough = 0
-elif cough == 'Yes':
+elif cough == "Yes":
     cough = 79
-elif cough == 'No':
+elif cough == "No":
     cough = -37
 
-smoke = st.sidebar.selectbox(
-     'Do you smoke?',
-     ('','Yes', 'No'))
-if smoke == '':
+smoke = st.sidebar.selectbox("Do you smoke?", ("", "Yes", "No"))
+if smoke == "":
     smoke = 0
-elif smoke == 'Yes':
+elif smoke == "Yes":
     smoke = -64
-elif smoke == 'No':
+elif smoke == "No":
     smoke = 10
 
-drink = st.sidebar.selectbox(
-     'Do you drink?',
-     ('','Yes', 'No','Former'))
-if drink == '':
+drink = st.sidebar.selectbox("Do you drink?", ("", "Yes", "No", "Former"))
+if drink == "":
     drink = 0
-elif drink == 'Yes':
+elif drink == "Yes":
     drink = 3
-elif drink == 'No':
+elif drink == "No":
     drink = 0
-elif drink == 'Former':
+elif drink == "Former":
     drink = -32
 
-fever = st.sidebar.selectbox(
-     'Do you have fever?',
-     ('','Yes', 'No'))
-if fever == '':
+fever = st.sidebar.selectbox("Do you have fever?", ("", "Yes", "No"))
+if fever == "":
     fever = 0
-elif fever == 'Yes':
+elif fever == "Yes":
     fever = 33
-elif fever == 'No':
+elif fever == "No":
     fever = -2
 
-tired = st.sidebar.selectbox(
-     'Do you feel tired?',
-     ('','Yes', 'No'))
-if tired == '':
+tired = st.sidebar.selectbox("Do you feel tired?", ("", "Yes", "No"))
+if tired == "":
     tired = 0
-elif tired == 'Yes':
+elif tired == "Yes":
     tired = 20
-elif tired == 'No':
+elif tired == "No":
     tired = -1
 
-muscle = st.sidebar.selectbox(
-     'Do you feel muscle pain?',
-     ('','Yes', 'No'))
-if muscle == '':
+muscle = st.sidebar.selectbox("Do you feel muscle pain?", ("", "Yes", "No"))
+if muscle == "":
     muscle = 0
-elif muscle == 'Yes':
+elif muscle == "Yes":
     muscle = 25
-elif muscle == 'No':
+elif muscle == "No":
     muscle = -2
 
-mucus = st.sidebar.selectbox(
-     'Have you had increased mucus or phlegm?',
-     ('','Yes', 'No'))
-if mucus == '':
+mucus = st.sidebar.selectbox("Have you had increased mucus or phlegm?", ("", "Yes", "No"))
+if mucus == "":
     mucus = 0
-elif mucus == 'Yes':
+elif mucus == "Yes":
     mucus = 25
-elif mucus == 'No':
+elif mucus == "No":
     mucus = -2
 
-headache = st.sidebar.selectbox(
-     'Do you have a headache?',
-     ('','Yes', 'No'))
-if headache == '':
+headache = st.sidebar.selectbox("Do you have a headache?", ("", "Yes", "No"))
+if headache == "":
     headache = 0
-elif headache == 'Yes':
+elif headache == "Yes":
     headache = 119
-elif headache == 'No':
+elif headache == "No":
     headache = -5
 
-t2d = st.sidebar.selectbox(
-     'Do you have Type 2 diabetes?',
-     ('','Yes', 'No'))
-if t2d == '':
+t2d = st.sidebar.selectbox("Do you have Type 2 diabetes?", ("", "Yes", "No"))
+if t2d == "":
     t2d = 0
-elif t2d == 'Yes':
+elif t2d == "Yes":
     t2d = 12
-elif t2d == 'No':
+elif t2d == "No":
     t2d = -2
 
-pregnant = st.sidebar.selectbox(
-     'Are you pregnant?',
-     ('','Yes', 'No'))
-if pregnant == '':
+pregnant = st.sidebar.selectbox("Are you pregnant?", ("", "Yes", "No"))
+if pregnant == "":
     pregnant = 0
-elif pregnant == 'Yes':
+elif pregnant == "Yes":
     pregnant = -93
-elif pregnant == 'No':
+elif pregnant == "No":
     pregnant = 9
 
 kidney = st.sidebar.selectbox(
-     'Are you currently seeing a doctor for kidney issues?',
-     ('','Yes', 'No'))
-if kidney == '':
+    "Are you currently seeing a doctor for kidney issues?", ("", "Yes", "No")
+)
+if kidney == "":
     kidney = 0
-elif kidney == 'Yes':
+elif kidney == "Yes":
     kidney = -85
-elif kidney == 'No':
+elif kidney == "No":
     kidney = 6
 
-hyper = st.sidebar.selectbox(
-     'Have you been diagnosed with hypertension?',
-     ('','Yes', 'No'))
-if hyper == '':
+hyper = st.sidebar.selectbox("Have you been diagnosed with hypertension?", ("", "Yes", "No"))
+if hyper == "":
     hyper = 0
-elif hyper == 'Yes':
+elif hyper == "Yes":
     hyper = -40
-elif hyper == 'No':
+elif hyper == "No":
     hyper = 8
 
 heart = st.sidebar.selectbox(
-     'Have you been diagnosed with heart disease (other than hypertension)?',
-     ('','Yes', 'No'))
-if heart == '':
+    "Have you been diagnosed with heart disease (other than hypertension)?", ("", "Yes", "No"),
+)
+if heart == "":
     heart = 0
-elif heart == 'Yes':
+elif heart == "Yes":
     heart = -56
-elif heart == 'No':
+elif heart == "No":
     heart = 4
 
 
 anxiety = st.sidebar.selectbox(
-     'Have you been diagnosed with an anxiety disorder?',
-     ('','Yes', 'No'))
-if anxiety == '':
+    "Have you been diagnosed with an anxiety disorder?", ("", "Yes", "No")
+)
+if anxiety == "":
     anxiety = 0
-elif anxiety == 'Yes':
+elif anxiety == "Yes":
     anxiety = -64
-elif anxiety == 'No':
+elif anxiety == "No":
     anxiety = 4
 
-copd = st.sidebar.selectbox(
-     'Have you been diagnosed with COPD?',
-     ('','Yes', 'No'))
-if copd == '':
+copd = st.sidebar.selectbox("Have you been diagnosed with COPD?", ("", "Yes", "No"))
+if copd == "":
     copd = 0
-elif copd == 'Yes':
+elif copd == "Yes":
     copd = -101
-elif copd == 'No':
+elif copd == "No":
     copd = 3
 
- 
-total = 137 + cough + smoke + fever + tired +muscle + mucus +headache +t2d + pregnant + kidney + heart + anxiety + hyper + copd + drink + age + race
+
+total = (
+    137
+    + cough
+    + smoke
+    + fever
+    + tired
+    + muscle
+    + mucus
+    + headache
+    + t2d
+    + pregnant
+    + kidney
+    + heart
+    + anxiety
+    + hyper
+    + copd
+    + drink
+    + age
+    + race
+)
 st.write(imc_chart(total))
-'## ðŸ’Š Patient Risk Score:', total 
+"## ðŸ’Š Patient Risk Score:", total
 
-'''
+"""
 ---
 
 Risk score chart:
 ------------------
 ####    Base score = 137
- 
+
 
 Predictive safest score < 0
 
@@ -252,4 +260,4 @@ Predictive risk score = 170 - 213
 
 Predictive high risk = >213
 
-'''
\ No newline at end of file
+"""
diff --git a/testing/__init__.py b/testing/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/testing/unit_test.py b/testing/unit_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4fd0cd930e723d076d6bb699e157a9f3c2831f77
--- /dev/null
+++ b/testing/unit_test.py
@@ -0,0 +1,40 @@
+import unittest
+from parameterized import parameterized
+from src import filter_dataset
+
+# Parameterized test cases are inspired by https://github.com/wolever/parameterized
+
+
+class FilterDatasetTest(unittest.TestCase):
+    """Parameterized checks of filter_dataset's ICD, habit and BMI parsing helpers."""
+
+    @parameterized.expand(
+        [["diabetesIcdCode", "E08.22", "E08",], ["withoutdot", "E08", "E08"],]
+    )
+    def test_icd(self, name, icdCode, expectedCategory):
+        self.assertEqual(filter_dataset.icd(icdCode), expectedCategory)
+
+    # Habit and irregular BMI values of the observation table, as handled by parse_values
+    @parameterized.expand(
+        [
+            ["former_smoker", "Former smoker-HX Tobacco use", "former_smoker",],
+            ["No_Substance_use", "None-SHX Substance abuse use", "No_Substance_use"],
+            ["No_alcohol", "None-SHX Alcohol use", "No_alcohol"],
+            ["irreg_BMI_instance", "BMI-30+", "30.0-34.9"],
+            ["irreg_BMI_instance1", "Body mass index (BMI 50.0-59.9), adult-Z68.43", "50-59.9"],
+            ["irreg_BMI_instance2", "Body mass index (BMI 20.0_24.9), adult-Z68.24", "20.0_24.9"],
+        ]
+    )
+    def test_habits(self, name, habits, expectedhabits):
+        self.assertEqual(filter_dataset.parse_values(habits), expectedhabits)
+
+    # BMI values of the observation table, as binned by weight_bins
+    @parameterized.expand(
+        [
+            ["age_43", "Body mass index (BMI) 50.0-59.9, adult-Z68.43", "BMI-40.0_or_greater",],
+            ["age_38", "Body mass index (BMI) 38.0-38.9, adult-Z68.38", "BMI-25.0_39.9"],
+            ["age_24", "Body mass index (BMI) 24.0-24.9, adult-Z68.24", "BMI-20.0_24.9"],
+        ]
+    )
+    def test_BMI(self, name, BMI, expectedBMI):
+        self.assertEqual(filter_dataset.weight_bins(BMI), expectedBMI)