From fc971ad0d934a1d4503d5c093d687e1875d3fc4a Mon Sep 17 00:00:00 2001
From: Matthew K Defenderfer <mdefende@uab.edu>
Date: Tue, 17 Dec 2019 16:31:27 -0600
Subject: [PATCH] Change the workflow structure so cv is performed at all
 smoothing levels and final models are built from that. Reduced the number of
 combinations for the hyperparameter grid search drastically based on previous
 experience

---
 misc-scripts/cv-compare.R             |  54 ++--
 model-py/batch/create_GBDT_cv_jobs.m  |  29 +-
 model-py/batch/create_GBDT_cv_jobs.m~ |  30 ++
 model-py/batch/create_lm_model_jobs.m |   4 +-
 model-py/batch/create_model_jobs.m    |  27 +-
 model-py/batch/create_model_jobs.m~   |  24 ++
 model-py/batch/write_cv_script.m      |  10 +-
 model-py/gbdt-cv.py                   |   4 +-
 model-py/lm-betas.py                  |   4 +-
 model-py/notebooks/Untitled.ipynb     | 391 ++++++++++++++++++++++++++
 10 files changed, 524 insertions(+), 53 deletions(-)
 create mode 100644 model-py/batch/create_GBDT_cv_jobs.m~
 create mode 100644 model-py/batch/create_model_jobs.m~
 create mode 100644 model-py/notebooks/Untitled.ipynb

diff --git a/misc-scripts/cv-compare.R b/misc-scripts/cv-compare.R
index 3a66a89..bda6013 100644
--- a/misc-scripts/cv-compare.R
+++ b/misc-scripts/cv-compare.R
@@ -3,44 +3,58 @@
 # the GBDT error metrics, pick out the relevant columns from lm.df, and combine
 # them into a single cross validation matrix. From there, graphs comparing MAE
 # and RMSE across areas, smoothing kernels, and model types can be created
+library(sciplot)
+library(tidyverse)
 
-lmpath <- "/data/user/mdefende/Projects/ctmodel-ml/model-py/cv-out/linear-models/"
-lm.df <- list.files(path = lmpath, pattern = "*.csv", full.names = TRUE) %>%
-  sapply(read_csv, simplify=FALSE,n_max = 5) %>%
+
+lm.1200.path <- "/data/user/mdefende/Projects/ctmodel-ml/results/HCP1200/lm/cv/out"
+lm.1200 <- list.files(path = lm.1200.path, pattern = "*.csv", full.names = TRUE) %>%
+  sapply(read_csv, simplify=FALSE,n_max = 10) %>%
   bind_rows() %>%
-  mutate(region = as.factor(region),
-         sm = factor(sm,levels = c(0,2,5,10), labels = c(0,2,5,10))) %>%
-  rename(area = 'region') %>%
+  mutate(area = as.factor(area),
+         sm = factor(sm,levels = c(0,2,5,10), labels = c(0,2,5,10)),
+         method = 'lm') %>%
   mutate_if(is.numeric, abs) %>%
   mutate_at(vars(contains('mse')), sqrt) %>%
   rename('test_rmse' = 'test_mse', 'train_rmse' = 'train_mse') %>%
   group_by(area,sm) %>%
-  summarize_if(is.numeric, funs(mean, se)) %>%
+  summarize_if(is.numeric, list(mean = mean, se = se)) %>%
   ungroup() %>%
-  mutate(method = 'lm')
-
+  mutate(method = 'lm-1200')
 
-GBDTpath <- "/data/user/mdefende/Projects/ctmodel-ml/model-py/cv-out/best-GBDT"
-gbdt.df <- list.files(path = GBDTpath, pattern = "*.csv", full.names = TRUE) %>%
+gbdt.1200.path <- "/data/user/mdefende/Projects/ctmodel-ml/results/HCP1200/gbdt/cv/out/best-gbdt"
+gbdt.1200 <- list.files(path = gbdt.1200.path, pattern = "*.csv", full.names = TRUE) %>%
   sapply(read_csv, simplify=FALSE) %>%
   bind_rows() %>%
   mutate(area = as.factor(area),
-         sm = factor(sm,levels = c(0,2,5,10), labels = c(0,2,5,10)),
-         method = 'gbdt')
+         sm = factor(sm,levels = c(0,2,5,10), ordered = TRUE),
+         method = 'gbdt-1200')
+
+gbdt.786.path <- "/data/user/mdefende/Projects/ctmodel-ml/results/HCP786/gbdt/cv/out/best-gbdt"
+gbdt.786 <- list.files(path = gbdt.786.path, pattern = "*.csv", full.names = TRUE) %>%
+  sapply(read_csv, simplify=FALSE) %>%
+  bind_rows() %>%
+  mutate(area = str_replace_all(area,'_','-'),
+         area = as.factor(area),
+         sm = factor(sm,levels = c(0,2,5,10), ordered = TRUE),
+         method = 'gbdt-786')
 
 
-cv.df <- lm.df %>%
-  select(method,area,sm,test_mae_mean,test_rmse_mean) %>%
+cv.df <- lm.1200 %>%
+  dplyr::select(method,area,sm,test_mae_mean,test_rmse_mean) %>%
   rename(test_mae = test_mae_mean, test_rmse = test_rmse_mean) %>%
-  bind_rows(.,select(gbdt.df,-contains('train'))) %>%
-  mutate(method = factor(method))
+  bind_rows(.,dplyr::select(gbdt.1200,area,sm,contains('test'),method)) %>%
+  bind_rows(.,dplyr::select(gbdt.786,area,sm,contains('test'),method)) %>%
+  mutate(method = factor(method),
+         area = factor(area)) %>%
+  arrange(method,area,sm)
 
 
 # Variable to relabel facets
 method_labels <- c(BA45 = "Brodmann Area 45", 
                    dACC = "dorsal Anterior Cingulate Cortex", 
-                   V1_withRet = "V1 with Retinotopic Factors", 
-                   V1_noRet = "V1 with no Retinotopic Factors")
+                   V1-withRet = "V1 with Retinotopic Factors", 
+                   V1-noRet = "V1 with no Retinotopic Factors")
 
 
 cv.df %>%
@@ -48,7 +62,7 @@ cv.df %>%
     geom_point(size = 3) +
     geom_line(aes(group = method)) +
     facet_wrap(~area, labeller = labeller(area = method_labels)) +
-    scale_color_discrete(name = "Model Type", labels = c("GBDT", "Linear")) +
+    scale_color_discrete(name = "Model Type") +
     theme_bw() +
     ggtitle("Linear Model and GBDT Root Mean Square Error") +
     ylab("RMSE (mm)") +
diff --git a/model-py/batch/create_GBDT_cv_jobs.m b/model-py/batch/create_GBDT_cv_jobs.m
index 174086b..5a89247 100644
--- a/model-py/batch/create_GBDT_cv_jobs.m
+++ b/model-py/batch/create_GBDT_cv_jobs.m
@@ -1,12 +1,25 @@
-datafile = '/data/user/mdefende/Projects/ctmodel-ml/data/HCP1200-dACC.csv';
+% Add project to path
+addpath(genpath('/data/user/mdefende/Projects/ctmodel-ml'))
+
+% initial parameters
+projdir = '/data/user/mdefende/Projects/ctmodel-ml';
+dataset = 'HCP1200';
 area = 'dACC';
-eta = [0.01,0.05,0.1:0.1:0.5];
-max_depth = 3:10;
-nboost = 1000:1000:15000;
-sm = 5;
+do_cv = 0;
+make_model = 1;
+
+if make_model
+    jobdir = fullfile(projdir,'results',dataset,'gbdt','models','jobs',area);
+else
+    jobdir = fullfile(projdir,'results',dataset,'gbdt','cv','jobs',area);
+end
+
+eta = [0.01,0.05,0.1,0.2,0.3];
+max_depth = 6:10;
+nboost = 15000;
+sm = [0,2,10];
 
 A = combvec(eta,max_depth,nboost,sm)';
-jobdir = ['/data/user/mdefende/Projects/ctmodel-ml/Results/HCP1200/cv-jobs/' area];
 if ~exist(jobdir,'dir')
     mkdir(jobdir)
 end
@@ -14,9 +27,9 @@ end
 for ii = 1:size(A,1)
     jobname = [area '_eta' num2str(A(ii,1)) '_depth' num2str(A(ii,2)) '_nboost' num2str(A(ii,3)) '_sm' num2str(A(ii,4)) '.sh'];
     
-    write_cv_script(jobname,jobdir,area,A(ii,1),A(ii,2),A(ii,3),A(ii,4),0,datafile)
+    write_cv_script(jobname,jobdir,projdir,area,A(ii,1),A(ii,2),A(ii,3),A(ii,4),do_cv,make_model,dataset)
     
-    if ~exist(['/data/user/mdefende/Projects/ctmodel-ml/Results/HCP1200/cv-out/' area '/' jobname(1:end-3) '.csv'],'file')
+    if ~exist(fullfile(projdir,'results',dataset,'gbdt','cv','out',area,[jobname(1:end-3) '.csv']),'file')
         system(['sbatch ' fullfile(jobdir,jobname)]);
         %disp(jobname)
     end
diff --git a/model-py/batch/create_GBDT_cv_jobs.m~ b/model-py/batch/create_GBDT_cv_jobs.m~
new file mode 100644
index 0000000..bf445cf
--- /dev/null
+++ b/model-py/batch/create_GBDT_cv_jobs.m~
@@ -0,0 +1,30 @@
+% Add project to path
+addpath(genpath('/data/user/mdefende/Projects/ctmodel-ml'))
+
+% initial parameters
+projdir = '/data/user/mdefende/Projects/ctmodel-ml';
+dataset = 'HCP1200';
+area = 'V1-withRet';
+make_model = 1;
+
+eta = [0.01,0.05,0.1,0.2,0.3];
+max_depth = 6:10;
+nboost = 15000;
+sm = [0,2,10];
+
+A = combvec(eta,max_depth,nboost,sm)';
+jobdir = fullfile(projdir,'results',dataset,'cv','jobs');
+if ~exist(jobdir,'dir')
+    mkdir(jobdir)
+end
+
+for ii = 1:size(A,1)
+    jobname = [area '_eta' num2str(A(ii,1)) '_depth' num2str(A(ii,2)) '_nboost' num2str(A(ii,3)) '_sm' num2str(A(ii,4)) '.sh'];
+    
+    write_cv_script(jobname,jobdir,projdir,area,A(ii,1),A(ii,2),A(ii,3),A(ii,4),0,datafile)
+    
+    if ~exist(fullfile(projdir,'results',dataset,'gbdt','cv','out',area,[jobname(1:end-3) '.csv']),'file')
+        system(['sbatch ' fullfile(jobdir,jobname)]);
+        %disp(jobname)
+    end
+end
\ No newline at end of file
diff --git a/model-py/batch/create_lm_model_jobs.m b/model-py/batch/create_lm_model_jobs.m
index b954908..80c0d7b 100644
--- a/model-py/batch/create_lm_model_jobs.m
+++ b/model-py/batch/create_lm_model_jobs.m
@@ -15,8 +15,8 @@ for ii = 1:length(areas)
         fprintf(fid,'#!/bin/bash\n');
         fprintf(fid,'#SBATCH --partition=express\n');
         fprintf(fid,'#SBATCH --time=1:00:00\n');
-        fprintf(fid,'#SBATCH --ntasks=1\n');
-        fprintf(fid,'#SBATCH --mem-per-cpu=150G\n');
+        fprintf(fid,'#SBATCH -ks=1\n');
+        fprintf(fid,'#SBATCH --mem-per-cpu=200G\n');
         fprintf(fid,['#SBATCH --output=' fullfile(jobdir,jobname(1:end-3)) '.txt\n\n']);
         
         fprintf(fid,'module load Anaconda3/5.3.0\n');
diff --git a/model-py/batch/create_model_jobs.m b/model-py/batch/create_model_jobs.m
index d28eb37..4de631c 100644
--- a/model-py/batch/create_model_jobs.m
+++ b/model-py/batch/create_model_jobs.m
@@ -4,27 +4,22 @@ addpath(genpath('/data/user/mdefende/Projects/ctmodel-ml'))
 % initial parameters
 projdir = '/data/user/mdefende/Projects/ctmodel-ml';
 dataset = 'HCP1200';
-area = 'V1-noRet';
+area = 'all';
+do_cv = 0;
 make_model = 1;
 
 % Say where the cv outputs are
-cv_out = fullfile(projdir,'results',dataset,'cv','out');
+cv_out = fullfile(projdir,'results',dataset,'gbdt','cv','out','best-gbdt.csv');
+T = readtable(cv_out);
 
-% What cv parameter set are we going to keep
-eta = round([0.01,0.05,0.1:0.1:0.5],2); % There are problems with 0.3 when created by the : operator
-max_depth = 3:10;
-nboost = 1000:1000:15000;
-sm = 5;
-
-A = check_cv_new(fullfile(cv_out,area),eta,max_depth,nboost,sm);
-
-% Set the different smoothing kernels
-sm = [0,2,5,10];
+if ~strcmp(area,'all')
+    T(~strcmp(T.area,area),:) = [];
+end
 
-jobdir = fullfile(projdir,'results',dataset,'models','jobs');
+jobdir = fullfile(projdir,'results',dataset,'gbdt','models','jobs');
 
-for ii = 1:length(sm)
-    jobname = [area '_eta' num2str(A(2)) '_depth' num2str(A(3)) '_nboost' num2str(A(5) + 1) '_sm' num2str(sm(ii)) '.sh'];
-    write_cv_script(jobname,jobdir,projdir,area,A(2),A(3),A(5)+1,sm(ii),make_model,dataset)
+for ii = 1:height(T)
+    jobname = [T.area{ii} '_eta' num2str(T.eta(ii)) '_depth' num2str(T.depth(ii)) '_nboost' num2str(T.cv_index(ii) + 1) '_sm' num2str(T.sm(ii)) '.sh'];
+    write_cv_script(jobname,jobdir,projdir,T.area{ii},T.eta(ii),T.depth(ii),T.cv_index(ii)+1,T.sm(ii),do_cv,make_model,dataset)
     system(['sbatch ' fullfile(jobdir,jobname)]);
 end
diff --git a/model-py/batch/create_model_jobs.m~ b/model-py/batch/create_model_jobs.m~
new file mode 100644
index 0000000..4421c77
--- /dev/null
+++ b/model-py/batch/create_model_jobs.m~
@@ -0,0 +1,24 @@
+% Add project to path
+addpath(genpath('/data/user/mdefende/Projects/ctmodel-ml'))
+
+% initial parameters
+projdir = '/data/user/mdefende/Projects/ctmodel-ml';
+dataset = 'HCP1200';
+area = 'V1-noRet';
+do_cv = 0;
+make_model = 1;
+
+% Say where the cv outputs are
+cv_out = fullfile(projdir,'results',dataset,'gbdt','cv','out','best-gbdt.csv');
+T = readtable(cv_out);
+
+% Set the different smoothing kernels
+sm = [0,2,5,10];
+
+jobdir = fullfile(projdir,'results',dataset,'models','jobs');
+
+for ii = 1:length(sm)
+    jobname = [area '_eta' num2str(A(2)) '_depth' num2str(A(3)) '_nboost' num2str(A(5) + 1) '_sm' num2str(sm(ii)) '.sh'];
+    write_cv_script(jobname,jobdir,projdir,area,A(2),A(3),A(5)+1,sm(ii),make_model,dataset)
+    system(['sbatch ' fullfile(jobdir,jobname)]);
+end
diff --git a/model-py/batch/write_cv_script.m b/model-py/batch/write_cv_script.m
index 69cea3d..8cf0528 100644
--- a/model-py/batch/write_cv_script.m
+++ b/model-py/batch/write_cv_script.m
@@ -1,4 +1,4 @@
-function write_cv_script(jobname,jobdir,projdir,area,eta,maxdepth,nboost,sm,create_model,dataset)
+function write_cv_script(jobname,jobdir,projdir,area,eta,maxdepth,nboost,sm,do_cv,create_model,dataset)
 fid = fopen(fullfile(jobdir,jobname),'w');
 fprintf(fid,'#!/bin/bash\n');
 fprintf(fid,'#SBATCH --partition=pascalnodes\n');
@@ -14,10 +14,12 @@ fprintf(fid,'source activate ctmodel-ml\n\n');
 
 fprintf(fid,'cd /data/user/mdefende/Projects/ctmodel-ml/model-py\n\n');
 
-fprintf(fid,['python gbdt-cv.py ' num2str(eta) ' ' num2str(maxdepth) ' ' num2str(nboost) ' ' num2str(sm) ' ' area ' ' dataset  ' ' projdir '\n']);
-
+if do_cv
+    fprintf(fid,['python3.6 gbdt-cv.py ' num2str(eta) ' ' num2str(maxdepth) ' ' num2str(nboost) ' ' num2str(sm) ' ' area ' ' dataset  ' ' projdir '\n']);
+end
+    
 if create_model
-    fprintf(fid,['python create-GBDT-model.py ' num2str(eta) ' ' num2str(maxdepth) ' ' num2str(nboost) ' ' num2str(sm) ' ' area ' ' dataset  ' ' projdir '\n']);
+    fprintf(fid,['python3.6 create-GBDT-model.py ' num2str(eta) ' ' num2str(maxdepth) ' ' num2str(nboost) ' ' num2str(sm) ' ' area ' ' dataset  ' ' projdir '\n']);
 end
 
 fclose(fid);
diff --git a/model-py/gbdt-cv.py b/model-py/gbdt-cv.py
index d7f6429..379a59d 100644
--- a/model-py/gbdt-cv.py
+++ b/model-py/gbdt-cv.py
@@ -71,7 +71,9 @@ cv_results = xgb.cv(params = param, dtrain = dfull, num_boost_round = nboost_val
 idx = cv_results['test-mae-mean'].idxmin()
 
 # Write out the cv results to a csv file
-cv_dict = {'eta': eta_val,
+cv_dict = {'area': area,
+           'sm': sm,
+           'eta': eta_val,
            'depth': max_depth_val,
            'nboost': nboost_val,
            'cv_index': idx,
diff --git a/model-py/lm-betas.py b/model-py/lm-betas.py
index f4babe8..efe3831 100644
--- a/model-py/lm-betas.py
+++ b/model-py/lm-betas.py
@@ -86,7 +86,7 @@ del df_pred
 df_x = calculate_vif_(df_x,df_y)
 
 # combine back into single dataframe and apply z-transform
-df_z = pd.concat([df_x,df_y]).apply(stats.zscore)
+df_z = pd.concat([df_x,df_y],axis = 1).apply(stats.zscore, axis = 0)
 
 # create the formula 
 features = '(' + '+'.join(df_z.drop(df_z.filter(like = 'Thick'),axis = 1).columns) + ') ** 3'
@@ -100,7 +100,7 @@ results = mod.fit()
 print(results.summary())
 
 # grab the p-value and beta values for each of the terms
-pvals = reg.pvalues
+pvals = results.pvalues
 pvals.name = 'p.value'
 
 coefs = results.params
diff --git a/model-py/notebooks/Untitled.ipynb b/model-py/notebooks/Untitled.ipynb
new file mode 100644
index 0000000..ae5f35c
--- /dev/null
+++ b/model-py/notebooks/Untitled.ipynb
@@ -0,0 +1,391 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "3.6.6 |Anaconda, Inc.| (default, Jun 28 2018, 17:14:51) \n",
+      "[GCC 7.2.0]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# basic libraries to load\n",
+    "import os\n",
+    "import sys\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "# import model generation and analysis\n",
+    "from patsy import dmatrices\n",
+    "from statsmodels.stats.outliers_influence import variance_inflation_factor\n",
+    "import statsmodels.api as sm\n",
+    "import statsmodels.formula.api as smf\n",
+    "from scipy import stats\n",
+    "\n",
+    "# print the current python version\n",
+    "print(sys.version)\n",
+    "\n",
+    "# Define a function to repeatedly calculate variance inflation factor and drop \n",
+    "def calculate_vif_(df_x, df_y, thresh=10.0):\n",
+    "    # Set up the while loop. While we have something to drop, do ...    \n",
+    "    dropped = True\n",
+    "    while dropped:\n",
+    "        # get a list of the column names\n",
+    "        variables = list(df_x.columns)\n",
+    "\n",
+    "        # concatenate the column names into a string separated by '+'\n",
+    "        features = '+'.join(variables)\n",
+    "        \n",
+    "        # re-concatenate the dataframes for creating the design matrices\n",
+    "        df_tmp = pd.concat([df_x,df_y],axis = 1)\n",
+    "      \n",
+    "        y, X = dmatrices(df_y.columns[0] + ' ~ ' + features, df_tmp, return_type='dataframe')\n",
+    "        vif = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]\n",
+    "        vif = vif[1:len(vif)]\n",
+    "        maxloc = vif.index(max(vif))\n",
+    "\n",
+    "        thresh = 10        \n",
+    "        if max(vif) > thresh:\n",
+    "            print('dropping \\'' + variables[maxloc] +\n",
+    "                  '\\' at index: ' + str(maxloc) + ' with VIF:' + str(vif[maxloc]))\n",
+    "            df_x = df_x.drop(variables[maxloc],axis=1)\n",
+    "        else:\n",
+    "            dropped = False\n",
+    "\n",
+    "    print('Remaining variables:')\n",
+    "    print(df_x.columns)\n",
+    "    return df_x"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import parameters from command line\n",
+    "projdir = '/data/user/mdefende/Projects/ctmodel-ml'\n",
+    "dataset = 'HCP1200'\n",
+    "area = 'BA45'\n",
+    "sm = '0'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# First load data\n",
+    "if 'V1' in area:\n",
+    "    data_file = projdir + '/data/' + dataset + '-V1.csv'\n",
+    "else:\n",
+    "    data_file = projdir + '/data/' + dataset + '-' + area + '.csv'\n",
+    "\n",
+    "df = pd.read_csv(data_file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[Index(['Age', 'Sulc0', 'CurvK0', 'CurvH0', 'PialCurvK0', 'PialCurvH0', 'Area0',\n",
+      "       'MidArea0', 'PialArea0', 'LGI0'],\n",
+      "      dtype='object'), Index(['Thick0'], dtype='object')]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Do initial feature selection by the smoothing kernel\n",
+    "df_pred = df.filter(like=sm)\n",
+    "df_pred = df_pred[df_pred.columns.drop(list(df_pred.filter(like='Thick')))]\n",
+    "df_pred = df_pred[df_pred.columns.drop(list(df_pred.filter(like='Sulc')))]\n",
+    "\n",
+    "# If sm == 0, remove all columns with 10. Else remove column with name Sulc + sm\n",
+    "if sm == '0':\n",
+    "    df_pred = df_pred[df_pred.columns.drop(list(df_pred.filter(regex='10')))]\n",
+    "\n",
+    "# Grab the rest of the features (Age, Ecc, Pol, and unsmoothed Sulc) and concatenate them with what remains from df_pred. Only grab Ecc and Pol if withRet is designated in the area. Then grab the correct thickness column\n",
+    "if 'withRet' in area:\n",
+    "    df_x = pd.concat([df[['Age','Ecc','Pol','Sulc0']],df_pred], axis = 1)\n",
+    "else:\n",
+    "    df_x = pd.concat([df[['Age','Sulc0']],df_pred], axis = 1)\n",
+    "\n",
+    "df_y = df[['Thick' + sm]]\n",
+    "\n",
+    "print([df_x.columns,df_y.columns])\n",
+    "\n",
+    "# remove df_pred from memory\n",
+    "del df_pred"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "dropping 'MidArea0' at index: 7 with VIF:272945431961848.25\n",
+      "Remaining variables:\n",
+      "Index(['Age', 'Sulc0', 'CurvK0', 'CurvH0', 'PialCurvK0', 'PialCurvH0', 'Area0',\n",
+      "       'PialArea0', 'LGI0'],\n",
+      "      dtype='object')\n"
+     ]
+    }
+   ],
+   "source": [
+    "# calculate vif repeatedly removing features with a score above 10\n",
+    "df_x = calculate_vif_(df_x,df_y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# combine back into single dataframe and apply z-transform\n",
+    "df_z = pd.concat([df_x,df_y],axis = 1).apply(stats.zscore, axis = 0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                            OLS Regression Results                            \n",
+      "==============================================================================\n",
+      "Dep. Variable:                 Thick0   R-squared:                       0.411\n",
+      "Model:                            OLS   Adj. R-squared:                  0.411\n",
+      "Method:                 Least Squares   F-statistic:                 4.700e+04\n",
+      "Date:                Thu, 12 Dec 2019   Prob (F-statistic):               0.00\n",
+      "Time:                        17:23:52   Log-Likelihood:            -1.0049e+07\n",
+      "No. Observations:             8702966   AIC:                         2.010e+07\n",
+      "Df Residuals:                 8702836   BIC:                         2.010e+07\n",
+      "Df Model:                         129                                         \n",
+      "Covariance Type:            nonrobust                                         \n",
+      "===================================================================================================\n",
+      "                                      coef    std err          t      P>|t|      [0.025      0.975]\n",
+      "---------------------------------------------------------------------------------------------------\n",
+      "Intercept                          -0.3263      0.001   -612.789      0.000      -0.327      -0.325\n",
+      "Age                                -0.0375      0.000    -79.024      0.000      -0.038      -0.037\n",
+      "Sulc0                              -0.1811      0.001   -293.381      0.000      -0.182      -0.180\n",
+      "CurvK0                             -0.1594      0.001   -171.857      0.000      -0.161      -0.158\n",
+      "CurvH0                              0.1093      0.001    132.603      0.000       0.108       0.111\n",
+      "PialCurvK0                         -0.2774      0.001   -268.438      0.000      -0.279      -0.275\n",
+      "PialCurvH0                         -0.9025      0.001   -901.832      0.000      -0.904      -0.900\n",
+      "Area0                              -0.3285      0.001   -503.063      0.000      -0.330      -0.327\n",
+      "PialArea0                           0.2825      0.001    363.814      0.000       0.281       0.284\n",
+      "LGI0                                0.1225      0.000    257.511      0.000       0.122       0.123\n",
+      "Age:Sulc0                           0.0057      0.000     13.697      0.000       0.005       0.007\n",
+      "Age:CurvK0                          0.0065      0.001     10.001      0.000       0.005       0.008\n",
+      "Age:CurvH0                         -0.0094      0.001    -14.927      0.000      -0.011      -0.008\n",
+      "Age:PialCurvK0                      0.0049      0.001      5.913      0.000       0.003       0.006\n",
+      "Age:PialCurvH0                      0.0098      0.001     13.275      0.000       0.008       0.011\n",
+      "Age:Area0                          -0.0025      0.001     -4.705      0.000      -0.004      -0.001\n",
+      "Age:PialArea0                       0.0075      0.001     11.351      0.000       0.006       0.009\n",
+      "Age:LGI0                            0.0048      0.000     18.364      0.000       0.004       0.005\n",
+      "Sulc0:CurvK0                       -0.0456      0.001    -55.149      0.000      -0.047      -0.044\n",
+      "Sulc0:CurvH0                       -0.1505      0.001   -199.646      0.000      -0.152      -0.149\n",
+      "Sulc0:PialCurvK0                    0.0103      0.001     10.468      0.000       0.008       0.012\n",
+      "Sulc0:PialCurvH0                   -0.1050      0.001   -105.627      0.000      -0.107      -0.103\n",
+      "Sulc0:Area0                         0.1350      0.001    246.087      0.000       0.134       0.136\n",
+      "Sulc0:PialArea0                    -0.1482      0.001   -238.676      0.000      -0.149      -0.147\n",
+      "Sulc0:LGI0                          0.1658      0.000    406.482      0.000       0.165       0.167\n",
+      "CurvK0:CurvH0                      -0.0150      0.000    -44.477      0.000      -0.016      -0.014\n",
+      "CurvK0:PialCurvK0                   0.0035      0.001      4.064      0.000       0.002       0.005\n",
+      "CurvK0:PialCurvH0                  -0.1066      0.001    -91.949      0.000      -0.109      -0.104\n",
+      "CurvK0:Area0                       -0.0776      0.001    -86.855      0.000      -0.079      -0.076\n",
+      "CurvK0:PialArea0                   -0.0078      0.001     -7.268      0.000      -0.010      -0.006\n",
+      "CurvK0:LGI0                         0.0266      0.001     40.871      0.000       0.025       0.028\n",
+      "CurvH0:PialCurvK0                   0.0742      0.001     68.673      0.000       0.072       0.076\n",
+      "CurvH0:PialCurvH0                   0.1708      0.001    149.031      0.000       0.169       0.173\n",
+      "CurvH0:Area0                        0.2686      0.001    344.325      0.000       0.267       0.270\n",
+      "CurvH0:PialArea0                   -0.2880      0.001   -311.514      0.000      -0.290      -0.286\n",
+      "CurvH0:LGI0                        -0.0515      0.001    -81.852      0.000      -0.053      -0.050\n",
+      "PialCurvK0:PialCurvH0               0.0442      0.000    118.913      0.000       0.043       0.045\n",
+      "PialCurvK0:Area0                   -0.0769      0.001    -75.549      0.000      -0.079      -0.075\n",
+      "PialCurvK0:PialArea0               -0.0187      0.001    -16.214      0.000      -0.021      -0.016\n",
+      "PialCurvK0:LGI0                     0.0114      0.001     14.062      0.000       0.010       0.013\n",
+      "PialCurvH0:Area0                    0.0604      0.001     59.040      0.000       0.058       0.062\n",
+      "PialCurvH0:PialArea0               -0.3898      0.001   -323.395      0.000      -0.392      -0.387\n",
+      "PialCurvH0:LGI0                     0.0321      0.001     43.407      0.000       0.031       0.034\n",
+      "Area0:PialArea0                     0.0084      0.000     24.263      0.000       0.008       0.009\n",
+      "Area0:LGI0                         -0.0329      0.001    -62.457      0.000      -0.034      -0.032\n",
+      "PialArea0:LGI0                      0.0755      0.001    115.907      0.000       0.074       0.077\n",
+      "Age:Sulc0:CurvK0                   -0.0024      0.000     -5.566      0.000      -0.003      -0.002\n",
+      "Age:Sulc0:CurvH0                   -0.0016      0.000     -3.502      0.000      -0.003      -0.001\n",
+      "Age:Sulc0:PialCurvK0             8.591e-05      0.000      0.272      0.786      -0.001       0.001\n",
+      "Age:Sulc0:PialCurvH0                0.0007      0.000      1.866      0.062   -3.33e-05       0.001\n",
+      "Age:Sulc0:Area0                    -0.0005      0.000     -1.096      0.273      -0.001       0.000\n",
+      "Age:Sulc0:PialArea0                 0.0049      0.000     10.699      0.000       0.004       0.006\n",
+      "Age:Sulc0:LGI0                     -0.0002      0.000     -0.506      0.613      -0.001       0.001\n",
+      "Age:CurvK0:CurvH0                   0.0007      0.000      4.978      0.000       0.000       0.001\n",
+      "Age:CurvK0:PialCurvK0            5.093e-05      0.000      0.164      0.870      -0.001       0.001\n",
+      "Age:CurvK0:PialCurvH0               0.0013      0.000      2.578      0.010       0.000       0.002\n",
+      "Age:CurvK0:Area0                    0.0054      0.001      8.538      0.000       0.004       0.007\n",
+      "Age:CurvK0:PialArea0               -0.0033      0.001     -6.312      0.000      -0.004      -0.002\n",
+      "Age:CurvK0:LGI0                    -0.0012      0.000     -4.331      0.000      -0.002      -0.001\n",
+      "Age:CurvH0:PialCurvK0              -0.0031      0.000     -6.480      0.000      -0.004      -0.002\n",
+      "Age:CurvH0:PialCurvH0              -0.0053      0.001     -8.327      0.000      -0.007      -0.004\n",
+      "Age:CurvH0:Area0                   -0.0034      0.001     -5.349      0.000      -0.005      -0.002\n",
+      "Age:CurvH0:PialArea0               -0.0023      0.001     -4.130      0.000      -0.003      -0.001\n",
+      "Age:CurvH0:LGI0                    -0.0003      0.000     -1.043      0.297      -0.001       0.000\n",
+      "Age:PialCurvK0:PialCurvH0        6.689e-05   8.98e-05      0.745      0.456      -0.000       0.000\n",
+      "Age:PialCurvK0:Area0               -0.0003      0.000     -0.840      0.401      -0.001       0.000\n",
+      "Age:PialCurvK0:PialArea0            0.0029      0.001      3.363      0.001       0.001       0.005\n",
+      "Age:PialCurvK0:LGI0                -0.0012      0.000     -4.195      0.000      -0.002      -0.001\n",
+      "Age:PialCurvH0:Area0                0.0012      0.000      3.313      0.001       0.000       0.002\n",
+      "Age:PialCurvH0:PialArea0            0.0046      0.001      5.996      0.000       0.003       0.006\n",
+      "Age:PialCurvH0:LGI0                -0.0006      0.000     -1.950      0.051      -0.001    2.88e-06\n",
+      "Age:Area0:PialArea0                -0.0007      0.000     -2.251      0.024      -0.001   -8.42e-05\n",
+      "Age:Area0:LGI0                      0.0001      0.000      0.259      0.796      -0.001       0.001\n",
+      "Age:PialArea0:LGI0                 -0.0003      0.000     -0.601      0.548      -0.001       0.001\n",
+      "Sulc0:CurvK0:CurvH0                 0.0023      0.000     11.980      0.000       0.002       0.003\n",
+      "Sulc0:CurvK0:PialCurvK0            -0.0115      0.000    -30.406      0.000      -0.012      -0.011\n",
+      "Sulc0:CurvK0:PialCurvH0            -0.0057      0.001     -9.359      0.000      -0.007      -0.004\n",
+      "Sulc0:CurvK0:Area0                 -0.0373      0.001    -51.817      0.000      -0.039      -0.036\n",
+      "Sulc0:CurvK0:PialArea0              0.0070      0.001     10.570      0.000       0.006       0.008\n",
+      "Sulc0:CurvK0:LGI0                  -0.0022      0.000     -5.451      0.000      -0.003      -0.001\n",
+      "Sulc0:CurvH0:PialCurvK0             0.0024      0.001      4.439      0.000       0.001       0.003\n",
+      "Sulc0:CurvH0:PialCurvH0             0.0109      0.001     14.784      0.000       0.009       0.012\n",
+      "Sulc0:CurvH0:Area0                  0.0047      0.001      7.863      0.000       0.004       0.006\n",
+      "Sulc0:CurvH0:PialArea0             -0.0736      0.001   -135.096      0.000      -0.075      -0.072\n",
+      "Sulc0:CurvH0:LGI0                  -0.0025      0.000     -5.507      0.000      -0.003      -0.002\n",
+      "Sulc0:PialCurvK0:PialCurvH0        -0.0010      0.000     -8.762      0.000      -0.001      -0.001\n",
+      "Sulc0:PialCurvK0:Area0              0.0184      0.000     52.024      0.000       0.018       0.019\n",
+      "Sulc0:PialCurvK0:PialArea0         -0.0250      0.001    -28.204      0.000      -0.027      -0.023\n",
+      "Sulc0:PialCurvK0:LGI0              -0.0013      0.000     -4.001      0.000      -0.002      -0.001\n",
+      "Sulc0:PialCurvH0:Area0             -0.0330      0.000    -87.655      0.000      -0.034      -0.032\n",
+      "Sulc0:PialCurvH0:PialArea0         -0.0710      0.001    -90.453      0.000      -0.073      -0.069\n",
+      "Sulc0:PialCurvH0:LGI0              -0.0054      0.000    -14.313      0.000      -0.006      -0.005\n",
+      "Sulc0:Area0:PialArea0               0.0180      0.000     48.006      0.000       0.017       0.019\n",
+      "Sulc0:Area0:LGI0                   -0.0144      0.000    -33.923      0.000      -0.015      -0.014\n",
+      "Sulc0:PialArea0:LGI0                0.0466      0.000    102.277      0.000       0.046       0.048\n",
+      "CurvK0:CurvH0:PialCurvK0            0.0011      0.000      7.882      0.000       0.001       0.001\n",
+      "CurvK0:CurvH0:PialCurvH0           -0.0031      0.000    -13.965      0.000      -0.004      -0.003\n",
+      "CurvK0:CurvH0:Area0                -0.0019      0.000     -5.947      0.000      -0.003      -0.001\n",
+      "CurvK0:CurvH0:PialArea0            -0.0102      0.000    -38.435      0.000      -0.011      -0.010\n",
+      "CurvK0:CurvH0:LGI0                  0.0025      0.000     17.905      0.000       0.002       0.003\n",
+      "CurvK0:PialCurvK0:PialCurvH0        0.0031      0.000     26.367      0.000       0.003       0.003\n",
+      "CurvK0:PialCurvK0:Area0            -0.0140      0.001    -24.563      0.000      -0.015      -0.013\n",
+      "CurvK0:PialCurvK0:PialArea0         0.0190      0.001     23.023      0.000       0.017       0.021\n",
+      "CurvK0:PialCurvK0:LGI0              0.0002      0.000      0.613      0.540      -0.000       0.001\n",
+      "CurvK0:PialCurvH0:Area0            -0.0727      0.001    -90.603      0.000      -0.074      -0.071\n",
+      "CurvK0:PialCurvH0:PialArea0         0.0208      0.001     18.001      0.000       0.019       0.023\n",
+      "CurvK0:PialCurvH0:LGI0              0.0020      0.000      4.071      0.000       0.001       0.003\n",
+      "CurvK0:Area0:PialArea0              0.0009      0.000      2.971      0.003       0.000       0.002\n",
+      "CurvK0:Area0:LGI0                   0.0196      0.001     30.804      0.000       0.018       0.021\n",
+      "CurvK0:PialArea0:LGI0              -0.0139      0.001    -27.324      0.000      -0.015      -0.013\n",
+      "CurvH0:PialCurvK0:PialCurvH0        0.0022      0.000     12.089      0.000       0.002       0.003\n",
+      "CurvH0:PialCurvK0:Area0             0.0700      0.001     86.041      0.000       0.068       0.072\n",
+      "CurvH0:PialCurvK0:PialArea0        -0.0563      0.001    -50.653      0.000      -0.058      -0.054\n",
+      "CurvH0:PialCurvK0:LGI0             -0.0018      0.000     -3.730      0.000      -0.003      -0.001\n",
+      "CurvH0:PialCurvH0:Area0             0.1860      0.001    194.524      0.000       0.184       0.188\n",
+      "CurvH0:PialCurvH0:PialArea0        -0.1857      0.001   -153.565      0.000      -0.188      -0.183\n",
+      "CurvH0:PialCurvH0:LGI0             -0.0097      0.001    -15.230      0.000      -0.011      -0.008\n",
+      "CurvH0:Area0:PialArea0             -0.0288      0.000    -82.492      0.000      -0.030      -0.028\n",
+      "CurvH0:Area0:LGI0                  -0.0198      0.001    -31.639      0.000      -0.021      -0.019\n",
+      "CurvH0:PialArea0:LGI0               0.0086      0.001     15.554      0.000       0.008       0.010\n",
+      "PialCurvK0:PialCurvH0:Area0        -0.0006      0.000     -5.802      0.000      -0.001      -0.000\n",
+      "PialCurvK0:PialCurvH0:PialArea0     0.0383      0.000    108.995      0.000       0.038       0.039\n",
+      "PialCurvK0:PialCurvH0:LGI0      -6.878e-08   9.79e-05     -0.001      0.999      -0.000       0.000\n",
+      "PialCurvK0:Area0:PialArea0          0.0160      0.001     22.272      0.000       0.015       0.017\n",
+      "PialCurvK0:Area0:LGI0              -0.0048      0.000    -13.681      0.000      -0.005      -0.004\n",
+      "PialCurvK0:PialArea0:LGI0           0.0103      0.001     11.868      0.000       0.009       0.012\n",
+      "PialCurvH0:Area0:PialArea0          0.0863      0.001    140.913      0.000       0.085       0.087\n",
+      "PialCurvH0:Area0:LGI0               0.0190      0.000     52.148      0.000       0.018       0.020\n",
+      "PialCurvH0:PialArea0:LGI0           0.0059      0.001      7.812      0.000       0.004       0.007\n",
+      "Area0:PialArea0:LGI0               -0.0058      0.000    -20.366      0.000      -0.006      -0.005\n",
+      "==============================================================================\n",
+      "Omnibus:                   446101.536   Durbin-Watson:                   0.612\n",
+      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):           754981.664\n",
+      "Skew:                           0.422   Prob(JB):                         0.00\n",
+      "Kurtosis:                       4.171   Cond. No.                         86.8\n",
+      "==============================================================================\n",
+      "\n",
+      "Warnings:\n",
+      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# create the formula \n",
+    "features = '(' + '+'.join(df_z.drop(df_z.filter(like = 'Thick'),axis = 1).columns) + ') ** 3'\n",
+    "formula = 'Thick' + sm + ' ~ ' + features\n",
+    "\n",
+    "# create the model\n",
+    "mod = smf.ols(formula=formula, data=df_z)\n",
+    "\n",
+    "results = mod.fit()\n",
+    "\n",
+    "print(results.summary())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# grab the p-value and beta values for each of the terms\n",
+    "pvals = results.pvalues\n",
+    "pvals.name = 'p.value'\n",
+    "\n",
+    "coefs = results.params\n",
+    "coefs.name = 'norm.slope'\n",
+    "\n",
+    "# concatenate the coefficients and p-values, and write to a csv\n",
+    "df_out = pd.concat([coefs,pvals],axis = 1)\n",
+    "df_out.index.name = 'term'\n",
+    "df_out.reset_index(inplace=True)\n",
+    "\n",
+    "outdir = projdir + '/results/' + dataset + '/lm/models'\n",
+    "if not os.path.isdir(outdir):\n",
+    "    os.mkdir(outdir)\n",
+    "\n",
+    "filename = outdir + '/coefs/' + area + '-sm' + sm + '-lm-betas.csv'\n",
+    "df_out.to_csv(filename)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [conda env:ctmodel-ml]",
+   "language": "python",
+   "name": "conda-env-ctmodel-ml-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
-- 
GitLab