From fc971ad0d934a1d4503d5c093d687e1875d3fc4a Mon Sep 17 00:00:00 2001 From: Matthew K Defenderfer <mdefende@uab.edu> Date: Tue, 17 Dec 2019 16:31:27 -0600 Subject: [PATCH] Change the workflow structure so cv is performed at all smoothing levels and final models are built from that. Reduced the number of combinations for the hyperparameter grid search drastically based on previous experience --- misc-scripts/cv-compare.R | 54 ++-- model-py/batch/create_GBDT_cv_jobs.m | 29 +- model-py/batch/create_GBDT_cv_jobs.m~ | 30 ++ model-py/batch/create_lm_model_jobs.m | 4 +- model-py/batch/create_model_jobs.m | 27 +- model-py/batch/create_model_jobs.m~ | 24 ++ model-py/batch/write_cv_script.m | 10 +- model-py/gbdt-cv.py | 4 +- model-py/lm-betas.py | 4 +- model-py/notebooks/Untitled.ipynb | 391 ++++++++++++++++++++++++++ 10 files changed, 524 insertions(+), 53 deletions(-) create mode 100644 model-py/batch/create_GBDT_cv_jobs.m~ create mode 100644 model-py/batch/create_model_jobs.m~ create mode 100644 model-py/notebooks/Untitled.ipynb diff --git a/misc-scripts/cv-compare.R b/misc-scripts/cv-compare.R index 3a66a89..bda6013 100644 --- a/misc-scripts/cv-compare.R +++ b/misc-scripts/cv-compare.R @@ -3,44 +3,58 @@ # the GBDT error metrics, pick out the relevant columns from lm.df, and combine # them into a single cross validation matrix. From there, graphs comparing MAE # and RMSE across areas, smoothing kernels, and model types can be created +library(sciplot) +library(tidyverse) -lmpath <- "/data/user/mdefende/Projects/ctmodel-ml/model-py/cv-out/linear-models/" -lm.df <- list.files(path = lmpath, pattern = "*.csv", full.names = TRUE) %>% - sapply(read_csv, simplify=FALSE,n_max = 5) %>% + +lm.1200.path <- "/data/user/mdefende/Projects/ctmodel-ml/results/HCP1200/lm/cv/out" +lm.1200 <- list.files(path = lm.1200.path, pattern = "*.csv", full.names = TRUE) %>% + sapply(read_csv, simplify=FALSE,n_max = 10) %>% bind_rows() %>% - mutate(region = as.factor(region), - sm = factor(sm,levels = c(0,2,5,10), labels = c(0,2,5,10))) %>% - rename(area = 'region') %>% + mutate(area = as.factor(area), + sm = factor(sm,levels = c(0,2,5,10), labels = c(0,2,5,10)), + method = 'lm') %>% mutate_if(is.numeric, abs) %>% mutate_at(vars(contains('mse')), sqrt) %>% rename('test_rmse' = 'test_mse', 'train_rmse' = 'train_mse') %>% group_by(area,sm) %>% - summarize_if(is.numeric, funs(mean, se)) %>% + summarize_if(is.numeric, list(mean = mean, se = se)) %>% ungroup() %>% - mutate(method = 'lm') - + mutate(method = 'lm-1200') -GBDTpath <- "/data/user/mdefende/Projects/ctmodel-ml/model-py/cv-out/best-GBDT" -gbdt.df <- list.files(path = GBDTpath, pattern = "*.csv", full.names = TRUE) %>% +gbdt.1200.path <- "/data/user/mdefende/Projects/ctmodel-ml/results/HCP1200/gbdt/cv/out/best-gbdt" +gbdt.1200 <- list.files(path = gbdt.1200.path, pattern = "*.csv", full.names = TRUE) %>% sapply(read_csv, simplify=FALSE) %>% bind_rows() %>% mutate(area = as.factor(area), - sm = factor(sm,levels = c(0,2,5,10), labels = c(0,2,5,10)), - method = 'gbdt') + sm = factor(sm,levels = c(0,2,5,10), ordered = TRUE), + method = 'gbdt-1200') + +gbdt.786.path <- "/data/user/mdefende/Projects/ctmodel-ml/results/HCP786/gbdt/cv/out/best-gbdt" +gbdt.786 <- list.files(path = gbdt.786.path, pattern = "*.csv", full.names = TRUE) %>% + sapply(read_csv, simplify=FALSE) %>% + bind_rows() %>% + mutate(area = str_replace_all(area,'_','-'), + area = as.factor(area), + sm = factor(sm,levels = c(0,2,5,10), ordered = TRUE), + method = 'gbdt-786') -cv.df <- lm.df %>% - 
select(method,area,sm,test_mae_mean,test_rmse_mean) %>%
+cv.df <- lm.1200 %>%
+  dplyr::select(method,area,sm,test_mae_mean,test_rmse_mean) %>%
   rename(test_mae = test_mae_mean,
          test_rmse = test_rmse_mean) %>%
-  bind_rows(.,select(gbdt.df,-contains('train'))) %>%
-  mutate(method = factor(method))
+  bind_rows(.,dplyr::select(gbdt.1200,area,sm,contains('test'),method)) %>%
+  bind_rows(.,dplyr::select(gbdt.786,area,sm,contains('test'),method)) %>%
+  mutate(method = factor(method),
+         area = factor(area)) %>%
+  arrange(method,area,sm)
 
 # Variable to relabel facets
 method_labels <- c(BA45 = "Brodmann Area 45",
                    dACC = "dorsal Anterior Cingulate Cortex",
-                   V1_withRet = "V1 with Retinotopic Factors",
-                   V1_noRet = "V1 with no Retinotopic Factors")
+                   `V1-withRet` = "V1 with Retinotopic Factors",
+                   `V1-noRet` = "V1 with no Retinotopic Factors")
 
 
 cv.df %>%
@@ -48,7 +62,7 @@ cv.df %>%
   geom_point(size = 3) +
   geom_line(aes(group = method)) +
   facet_wrap(~area, labeller = labeller(area = method_labels)) +
-  scale_color_discrete(name = "Model Type", labels = c("GBDT", "Linear")) +
+  scale_color_discrete(name = "Model Type") +
   theme_bw() +
   ggtitle("Linear Model and GBDT Root Mean Square Error") +
   ylab("RMSE (mm)") +
diff --git a/model-py/batch/create_GBDT_cv_jobs.m b/model-py/batch/create_GBDT_cv_jobs.m
index 174086b..5a89247 100644
--- a/model-py/batch/create_GBDT_cv_jobs.m
+++ b/model-py/batch/create_GBDT_cv_jobs.m
@@ -1,12 +1,25 @@
-datafile = '/data/user/mdefende/Projects/ctmodel-ml/data/HCP1200-dACC.csv';
+% Add project to path
+addpath(genpath('/data/user/mdefende/Projects/ctmodel-ml'))
+
+% initial parameters
+projdir = '/data/user/mdefende/Projects/ctmodel-ml';
+dataset = 'HCP1200';
 area = 'dACC';
-eta = [0.01,0.05,0.1:0.1:0.5];
-max_depth = 3:10;
-nboost = 1000:1000:15000;
-sm = 5;
+do_cv = 0;
+make_model = 1;
+
+if make_model
+    jobdir = fullfile(projdir,'results',dataset,'gbdt','models','jobs',area);
+else
+    jobdir = fullfile(projdir,'results',dataset,'gbdt','cv','jobs',area);
+end
+
+eta = [0.01,0.05,0.1,0.2,0.3];
+max_depth = 6:10;
+nboost = 15000;
+sm = [0,2,10];
 
 A = combvec(eta,max_depth,nboost,sm)';
-jobdir = ['/data/user/mdefende/Projects/ctmodel-ml/Results/HCP1200/cv-jobs/' area];
 if ~exist(jobdir,'dir')
     mkdir(jobdir)
 end
@@ -14,9 +27,9 @@ end
 for ii = 1:size(A,1)
     jobname = [area '_eta' num2str(A(ii,1)) '_depth' num2str(A(ii,2)) '_nboost' num2str(A(ii,3)) '_sm' num2str(A(ii,4)) '.sh'];
 
-    write_cv_script(jobname,jobdir,area,A(ii,1),A(ii,2),A(ii,3),A(ii,4),0,datafile)
+    write_cv_script(jobname,jobdir,projdir,area,A(ii,1),A(ii,2),A(ii,3),A(ii,4),do_cv,make_model,dataset)
 
-    if ~exist(['/data/user/mdefende/Projects/ctmodel-ml/Results/HCP1200/cv-out/' area '/' jobname(1:end-3) '.csv'],'file')
+    if ~exist(fullfile(projdir,'results',dataset,'gbdt','cv','out',area,[jobname(1:end-3) '.csv']),'file')
         system(['sbatch ' fullfile(jobdir,jobname)]);
         %disp(jobname)
     end
diff --git a/model-py/batch/create_GBDT_cv_jobs.m~ b/model-py/batch/create_GBDT_cv_jobs.m~
new file mode 100644
index 0000000..bf445cf
--- /dev/null
+++ b/model-py/batch/create_GBDT_cv_jobs.m~
@@ -0,0 +1,30 @@
+% Add project to path
+addpath(genpath('/data/user/mdefende/Projects/ctmodel-ml'))
+
+% initial parameters
+projdir = '/data/user/mdefende/Projects/ctmodel-ml';
+dataset = 'HCP1200';
+area = 'V1-withRet';
+make_model = 1;
+
+eta = [0.01,0.05,0.1,0.2,0.3];
+max_depth = 6:10;
+nboost = 15000;
+sm = [0,2,10];
+
+A = combvec(eta,max_depth,nboost,sm)';
+jobdir = fullfile(projdir,'results',dataset,'cv','jobs');
+if ~exist(jobdir,'dir')
+    mkdir(jobdir)
+end
+
+for ii = 1:size(A,1)
+    jobname = [area '_eta' num2str(A(ii,1)) '_depth' num2str(A(ii,2)) '_nboost' num2str(A(ii,3)) '_sm' num2str(A(ii,4)) '.sh'];
+
+    write_cv_script(jobname,jobdir,projdir,area,A(ii,1),A(ii,2),A(ii,3),A(ii,4),0,datafile)
+
+    if ~exist(fullfile(projdir,'results',dataset,'gbdt','cv','out',area,[jobname(1:end-3) '.csv']),'file')
+        system(['sbatch ' fullfile(jobdir,jobname)]);
+        %disp(jobname)
+    end
+end
\ No newline at end of file
diff --git a/model-py/batch/create_lm_model_jobs.m b/model-py/batch/create_lm_model_jobs.m
index b954908..80c0d7b 100644
--- a/model-py/batch/create_lm_model_jobs.m
+++ b/model-py/batch/create_lm_model_jobs.m
@@ -15,8 +15,8 @@ for ii = 1:length(areas)
     fprintf(fid,'#!/bin/bash\n');
     fprintf(fid,'#SBATCH --partition=express\n');
     fprintf(fid,'#SBATCH --time=1:00:00\n');
-    fprintf(fid,'#SBATCH --ntasks=1\n');
-    fprintf(fid,'#SBATCH --mem-per-cpu=150G\n');
+    fprintf(fid,'#SBATCH --ntasks=1\n');
+    fprintf(fid,'#SBATCH --mem-per-cpu=200G\n');
     fprintf(fid,['#SBATCH --output=' fullfile(jobdir,jobname(1:end-3)) '.txt\n\n']);
 
     fprintf(fid,'module load Anaconda3/5.3.0\n');
diff --git a/model-py/batch/create_model_jobs.m b/model-py/batch/create_model_jobs.m
index d28eb37..4de631c 100644
--- a/model-py/batch/create_model_jobs.m
+++ b/model-py/batch/create_model_jobs.m
@@ -4,27 +4,22 @@ addpath(genpath('/data/user/mdefende/Projects/ctmodel-ml'))
 % initial parameters
 projdir = '/data/user/mdefende/Projects/ctmodel-ml';
 dataset = 'HCP1200';
-area = 'V1-noRet';
+area = 'all';
+do_cv = 0;
 make_model = 1;
 
 % Say where the cv outputs are
-cv_out = fullfile(projdir,'results',dataset,'cv','out');
+cv_out = fullfile(projdir,'results',dataset,'gbdt','cv','out','best-gbdt.csv');
+T = readtable(cv_out);
 
-% What cv parameter set are we going to keep
-eta = round([0.01,0.05,0.1:0.1:0.5],2); % There are problems with 0.3 when created by the : operator
-max_depth = 3:10;
-nboost = 1000:1000:15000;
-sm = 5;
-
-A = check_cv_new(fullfile(cv_out,area),eta,max_depth,nboost,sm);
-
-% Set the different smoothing kernels
-sm = [0,2,5,10];
+if ~strcmp(area,'all')
+    T(~strcmp(T.area,area),:) = [];
+end
 
-jobdir = fullfile(projdir,'results',dataset,'models','jobs');
+jobdir = fullfile(projdir,'results',dataset,'gbdt','models','jobs');
 
-for ii = 1:length(sm)
-    jobname = [area '_eta' num2str(A(2)) '_depth' num2str(A(3)) '_nboost' num2str(A(5) + 1) '_sm' num2str(sm(ii)) '.sh'];
-    write_cv_script(jobname,jobdir,projdir,area,A(2),A(3),A(5)+1,sm(ii),make_model,dataset)
+for ii = 1:height(T)
+    jobname = [T.area{ii} '_eta' num2str(T.eta(ii)) '_depth' num2str(T.depth(ii)) '_nboost' num2str(T.cv_index(ii) + 1) '_sm' num2str(T.sm(ii)) '.sh'];
+    write_cv_script(jobname,jobdir,projdir,T.area{ii},T.eta(ii),T.depth(ii),T.cv_index(ii)+1,T.sm(ii),do_cv,make_model,dataset)
     system(['sbatch ' fullfile(jobdir,jobname)]);
 end
diff --git a/model-py/batch/create_model_jobs.m~ b/model-py/batch/create_model_jobs.m~
new file mode 100644
index 0000000..4421c77
--- /dev/null
+++ b/model-py/batch/create_model_jobs.m~
@@ -0,0 +1,24 @@
+% Add project to path
+addpath(genpath('/data/user/mdefende/Projects/ctmodel-ml'))
+
+% initial parameters
+projdir = '/data/user/mdefende/Projects/ctmodel-ml';
+dataset = 'HCP1200';
+area = 'V1-noRet';
+do_cv = 0;
+make_model = 1;
+
+% Say where the cv outputs are
+cv_out = fullfile(projdir,'results',dataset,'gbdt','cv','out','best-gbdt.csv');
+T = readtable(cv_out);
+
+% Set the different smoothing kernels
+sm = [0,2,5,10];
+
+jobdir = 
fullfile(projdir,'results',dataset,'models','jobs'); + +for ii = 1:length(sm) + jobname = [area '_eta' num2str(A(2)) '_depth' num2str(A(3)) '_nboost' num2str(A(5) + 1) '_sm' num2str(sm(ii)) '.sh']; + write_cv_script(jobname,jobdir,projdir,area,A(2),A(3),A(5)+1,sm(ii),make_model,dataset) + system(['sbatch ' fullfile(jobdir,jobname)]); +end diff --git a/model-py/batch/write_cv_script.m b/model-py/batch/write_cv_script.m index 69cea3d..8cf0528 100644 --- a/model-py/batch/write_cv_script.m +++ b/model-py/batch/write_cv_script.m @@ -1,4 +1,4 @@ -function write_cv_script(jobname,jobdir,projdir,area,eta,maxdepth,nboost,sm,create_model,dataset) +function write_cv_script(jobname,jobdir,projdir,area,eta,maxdepth,nboost,sm,do_cv,create_model,dataset) fid = fopen(fullfile(jobdir,jobname),'w'); fprintf(fid,'#!/bin/bash\n'); fprintf(fid,'#SBATCH --partition=pascalnodes\n'); @@ -14,10 +14,12 @@ fprintf(fid,'source activate ctmodel-ml\n\n'); fprintf(fid,'cd /data/user/mdefende/Projects/ctmodel-ml/model-py\n\n'); -fprintf(fid,['python gbdt-cv.py ' num2str(eta) ' ' num2str(maxdepth) ' ' num2str(nboost) ' ' num2str(sm) ' ' area ' ' dataset ' ' projdir '\n']); - +if do_cv + fprintf(fid,['python3.6 gbdt-cv.py ' num2str(eta) ' ' num2str(maxdepth) ' ' num2str(nboost) ' ' num2str(sm) ' ' area ' ' dataset ' ' projdir '\n']); +end + if create_model - fprintf(fid,['python create-GBDT-model.py ' num2str(eta) ' ' num2str(maxdepth) ' ' num2str(nboost) ' ' num2str(sm) ' ' area ' ' dataset ' ' projdir '\n']); + fprintf(fid,['python3.6 create-GBDT-model.py ' num2str(eta) ' ' num2str(maxdepth) ' ' num2str(nboost) ' ' num2str(sm) ' ' area ' ' dataset ' ' projdir '\n']); end fclose(fid); diff --git a/model-py/gbdt-cv.py b/model-py/gbdt-cv.py index d7f6429..379a59d 100644 --- a/model-py/gbdt-cv.py +++ b/model-py/gbdt-cv.py @@ -71,7 +71,9 @@ cv_results = xgb.cv(params = param, dtrain = dfull, num_boost_round = nboost_val idx = cv_results['test-mae-mean'].idxmin() # Write out the cv results to a csv file -cv_dict = {'eta': eta_val, +cv_dict = {'area': area, + 'sm': sm, + 'eta': eta_val, 'depth': max_depth_val, 'nboost': nboost_val, 'cv_index': idx, diff --git a/model-py/lm-betas.py b/model-py/lm-betas.py index f4babe8..efe3831 100644 --- a/model-py/lm-betas.py +++ b/model-py/lm-betas.py @@ -86,7 +86,7 @@ del df_pred df_x = calculate_vif_(df_x,df_y) # combine back into single dataframe and apply z-transform -df_z = pd.concat([df_x,df_y]).apply(stats.zscore) +df_z = pd.concat([df_x,df_y],axis = 1).apply(stats.zscore, axis = 0) # create the formula features = '(' + '+'.join(df_z.drop(df_z.filter(like = 'Thick'),axis = 1).columns) + ') ** 3' @@ -100,7 +100,7 @@ results = mod.fit() print(results.summary()) # grab the p-value and beta values for each of the terms -pvals = reg.pvalues +pvals = results.pvalues pvals.name = 'p.value' coefs = results.params diff --git a/model-py/notebooks/Untitled.ipynb b/model-py/notebooks/Untitled.ipynb new file mode 100644 index 0000000..ae5f35c --- /dev/null +++ b/model-py/notebooks/Untitled.ipynb @@ -0,0 +1,391 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3.6.6 |Anaconda, Inc.| (default, Jun 28 2018, 17:14:51) \n", + "[GCC 7.2.0]\n" + ] + } + ], + "source": [ + "# basic libraries to load\n", + "import os\n", + "import sys\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "# import model generation and analysis\n", + "from patsy import dmatrices\n", + "from 
statsmodels.stats.outliers_influence import variance_inflation_factor\n", + "import statsmodels.api as sm\n", + "import statsmodels.formula.api as smf\n", + "from scipy import stats\n", + "\n", + "# print the current python version\n", + "print(sys.version)\n", + "\n", + "# Define a function to repeatedly calculate variance inflation factor and drop \n", + "def calculate_vif_(df_x, df_y, thresh=10.0):\n", + " # Set up the while loop. While we have something to drop, do ... \n", + " dropped = True\n", + " while dropped:\n", + " # get a list of the column names\n", + " variables = list(df_x.columns)\n", + "\n", + " # concatenate the column names into a string separated by '+'\n", + " features = '+'.join(variables)\n", + " \n", + " # re-concatenate the dataframes for creating the design matrices\n", + " df_tmp = pd.concat([df_x,df_y],axis = 1)\n", + " \n", + " y, X = dmatrices(df_y.columns[0] + ' ~ ' + features, df_tmp, return_type='dataframe')\n", + " vif = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]\n", + " vif = vif[1:len(vif)]\n", + " maxloc = vif.index(max(vif))\n", + "\n", + " thresh = 10 \n", + " if max(vif) > thresh:\n", + " print('dropping \\'' + variables[maxloc] +\n", + " '\\' at index: ' + str(maxloc) + ' with VIF:' + str(vif[maxloc]))\n", + " df_x = df_x.drop(variables[maxloc],axis=1)\n", + " else:\n", + " dropped = False\n", + "\n", + " print('Remaining variables:')\n", + " print(df_x.columns)\n", + " return df_x" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# import parameters from command line\n", + "projdir = '/data/user/mdefende/Projects/ctmodel-ml'\n", + "dataset = 'HCP1200'\n", + "area = 'BA45'\n", + "sm = '0'" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# First load data\n", + "if 'V1' in area:\n", + " data_file = projdir + '/data/' + dataset + '-V1.csv'\n", + "else:\n", + " data_file = projdir + '/data/' + dataset + '-' + area + '.csv'\n", + "\n", + "df = pd.read_csv(data_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Index(['Age', 'Sulc0', 'CurvK0', 'CurvH0', 'PialCurvK0', 'PialCurvH0', 'Area0',\n", + " 'MidArea0', 'PialArea0', 'LGI0'],\n", + " dtype='object'), Index(['Thick0'], dtype='object')]\n" + ] + } + ], + "source": [ + "# Do initial feature selection by the smoothing kernel\n", + "df_pred = df.filter(like=sm)\n", + "df_pred = df_pred[df_pred.columns.drop(list(df_pred.filter(like='Thick')))]\n", + "df_pred = df_pred[df_pred.columns.drop(list(df_pred.filter(like='Sulc')))]\n", + "\n", + "# If sm == 0, remove all columns with 10. Else remove column with name Sulc + sm\n", + "if sm == '0':\n", + " df_pred = df_pred[df_pred.columns.drop(list(df_pred.filter(regex='10')))]\n", + "\n", + "# Grab the rest of the features (Age, Ecc, Pol, and unsmoothed Sulc) and concatenate them with what remains from df_pred. Only grab Ecc and Pol if withRet is designated in the area. 
Then grab the correct thickness column\n", + "if 'withRet' in area:\n", + " df_x = pd.concat([df[['Age','Ecc','Pol','Sulc0']],df_pred], axis = 1)\n", + "else:\n", + " df_x = pd.concat([df[['Age','Sulc0']],df_pred], axis = 1)\n", + "\n", + "df_y = df[['Thick' + sm]]\n", + "\n", + "print([df_x.columns,df_y.columns])\n", + "\n", + "# remove df_pred from memory\n", + "del df_pred" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "dropping 'MidArea0' at index: 7 with VIF:272945431961848.25\n", + "Remaining variables:\n", + "Index(['Age', 'Sulc0', 'CurvK0', 'CurvH0', 'PialCurvK0', 'PialCurvH0', 'Area0',\n", + " 'PialArea0', 'LGI0'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "# calculate vif repeatedly removing features with a score above 10\n", + "df_x = calculate_vif_(df_x,df_y)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# combine back into single dataframe and apply z-transform\n", + "df_z = pd.concat([df_x,df_y],axis = 1).apply(stats.zscore, axis = 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " OLS Regression Results \n", + "==============================================================================\n", + "Dep. Variable: Thick0 R-squared: 0.411\n", + "Model: OLS Adj. R-squared: 0.411\n", + "Method: Least Squares F-statistic: 4.700e+04\n", + "Date: Thu, 12 Dec 2019 Prob (F-statistic): 0.00\n", + "Time: 17:23:52 Log-Likelihood: -1.0049e+07\n", + "No. Observations: 8702966 AIC: 2.010e+07\n", + "Df Residuals: 8702836 BIC: 2.010e+07\n", + "Df Model: 129 \n", + "Covariance Type: nonrobust \n", + "===================================================================================================\n", + " coef std err t P>|t| [0.025 0.975]\n", + "---------------------------------------------------------------------------------------------------\n", + "Intercept -0.3263 0.001 -612.789 0.000 -0.327 -0.325\n", + "Age -0.0375 0.000 -79.024 0.000 -0.038 -0.037\n", + "Sulc0 -0.1811 0.001 -293.381 0.000 -0.182 -0.180\n", + "CurvK0 -0.1594 0.001 -171.857 0.000 -0.161 -0.158\n", + "CurvH0 0.1093 0.001 132.603 0.000 0.108 0.111\n", + "PialCurvK0 -0.2774 0.001 -268.438 0.000 -0.279 -0.275\n", + "PialCurvH0 -0.9025 0.001 -901.832 0.000 -0.904 -0.900\n", + "Area0 -0.3285 0.001 -503.063 0.000 -0.330 -0.327\n", + "PialArea0 0.2825 0.001 363.814 0.000 0.281 0.284\n", + "LGI0 0.1225 0.000 257.511 0.000 0.122 0.123\n", + "Age:Sulc0 0.0057 0.000 13.697 0.000 0.005 0.007\n", + "Age:CurvK0 0.0065 0.001 10.001 0.000 0.005 0.008\n", + "Age:CurvH0 -0.0094 0.001 -14.927 0.000 -0.011 -0.008\n", + "Age:PialCurvK0 0.0049 0.001 5.913 0.000 0.003 0.006\n", + "Age:PialCurvH0 0.0098 0.001 13.275 0.000 0.008 0.011\n", + "Age:Area0 -0.0025 0.001 -4.705 0.000 -0.004 -0.001\n", + "Age:PialArea0 0.0075 0.001 11.351 0.000 0.006 0.009\n", + "Age:LGI0 0.0048 0.000 18.364 0.000 0.004 0.005\n", + "Sulc0:CurvK0 -0.0456 0.001 -55.149 0.000 -0.047 -0.044\n", + "Sulc0:CurvH0 -0.1505 0.001 -199.646 0.000 -0.152 -0.149\n", + "Sulc0:PialCurvK0 0.0103 0.001 10.468 0.000 0.008 0.012\n", + "Sulc0:PialCurvH0 -0.1050 0.001 -105.627 0.000 -0.107 -0.103\n", + "Sulc0:Area0 0.1350 0.001 246.087 0.000 0.134 0.136\n", + "Sulc0:PialArea0 -0.1482 0.001 -238.676 0.000 -0.149 -0.147\n", + "Sulc0:LGI0 0.1658 0.000 406.482 0.000 0.165 0.167\n", + 
"CurvK0:CurvH0 -0.0150 0.000 -44.477 0.000 -0.016 -0.014\n", + "CurvK0:PialCurvK0 0.0035 0.001 4.064 0.000 0.002 0.005\n", + "CurvK0:PialCurvH0 -0.1066 0.001 -91.949 0.000 -0.109 -0.104\n", + "CurvK0:Area0 -0.0776 0.001 -86.855 0.000 -0.079 -0.076\n", + "CurvK0:PialArea0 -0.0078 0.001 -7.268 0.000 -0.010 -0.006\n", + "CurvK0:LGI0 0.0266 0.001 40.871 0.000 0.025 0.028\n", + "CurvH0:PialCurvK0 0.0742 0.001 68.673 0.000 0.072 0.076\n", + "CurvH0:PialCurvH0 0.1708 0.001 149.031 0.000 0.169 0.173\n", + "CurvH0:Area0 0.2686 0.001 344.325 0.000 0.267 0.270\n", + "CurvH0:PialArea0 -0.2880 0.001 -311.514 0.000 -0.290 -0.286\n", + "CurvH0:LGI0 -0.0515 0.001 -81.852 0.000 -0.053 -0.050\n", + "PialCurvK0:PialCurvH0 0.0442 0.000 118.913 0.000 0.043 0.045\n", + "PialCurvK0:Area0 -0.0769 0.001 -75.549 0.000 -0.079 -0.075\n", + "PialCurvK0:PialArea0 -0.0187 0.001 -16.214 0.000 -0.021 -0.016\n", + "PialCurvK0:LGI0 0.0114 0.001 14.062 0.000 0.010 0.013\n", + "PialCurvH0:Area0 0.0604 0.001 59.040 0.000 0.058 0.062\n", + "PialCurvH0:PialArea0 -0.3898 0.001 -323.395 0.000 -0.392 -0.387\n", + "PialCurvH0:LGI0 0.0321 0.001 43.407 0.000 0.031 0.034\n", + "Area0:PialArea0 0.0084 0.000 24.263 0.000 0.008 0.009\n", + "Area0:LGI0 -0.0329 0.001 -62.457 0.000 -0.034 -0.032\n", + "PialArea0:LGI0 0.0755 0.001 115.907 0.000 0.074 0.077\n", + "Age:Sulc0:CurvK0 -0.0024 0.000 -5.566 0.000 -0.003 -0.002\n", + "Age:Sulc0:CurvH0 -0.0016 0.000 -3.502 0.000 -0.003 -0.001\n", + "Age:Sulc0:PialCurvK0 8.591e-05 0.000 0.272 0.786 -0.001 0.001\n", + "Age:Sulc0:PialCurvH0 0.0007 0.000 1.866 0.062 -3.33e-05 0.001\n", + "Age:Sulc0:Area0 -0.0005 0.000 -1.096 0.273 -0.001 0.000\n", + "Age:Sulc0:PialArea0 0.0049 0.000 10.699 0.000 0.004 0.006\n", + "Age:Sulc0:LGI0 -0.0002 0.000 -0.506 0.613 -0.001 0.001\n", + "Age:CurvK0:CurvH0 0.0007 0.000 4.978 0.000 0.000 0.001\n", + "Age:CurvK0:PialCurvK0 5.093e-05 0.000 0.164 0.870 -0.001 0.001\n", + "Age:CurvK0:PialCurvH0 0.0013 0.000 2.578 0.010 0.000 0.002\n", + "Age:CurvK0:Area0 0.0054 0.001 8.538 0.000 0.004 0.007\n", + "Age:CurvK0:PialArea0 -0.0033 0.001 -6.312 0.000 -0.004 -0.002\n", + "Age:CurvK0:LGI0 -0.0012 0.000 -4.331 0.000 -0.002 -0.001\n", + "Age:CurvH0:PialCurvK0 -0.0031 0.000 -6.480 0.000 -0.004 -0.002\n", + "Age:CurvH0:PialCurvH0 -0.0053 0.001 -8.327 0.000 -0.007 -0.004\n", + "Age:CurvH0:Area0 -0.0034 0.001 -5.349 0.000 -0.005 -0.002\n", + "Age:CurvH0:PialArea0 -0.0023 0.001 -4.130 0.000 -0.003 -0.001\n", + "Age:CurvH0:LGI0 -0.0003 0.000 -1.043 0.297 -0.001 0.000\n", + "Age:PialCurvK0:PialCurvH0 6.689e-05 8.98e-05 0.745 0.456 -0.000 0.000\n", + "Age:PialCurvK0:Area0 -0.0003 0.000 -0.840 0.401 -0.001 0.000\n", + "Age:PialCurvK0:PialArea0 0.0029 0.001 3.363 0.001 0.001 0.005\n", + "Age:PialCurvK0:LGI0 -0.0012 0.000 -4.195 0.000 -0.002 -0.001\n", + "Age:PialCurvH0:Area0 0.0012 0.000 3.313 0.001 0.000 0.002\n", + "Age:PialCurvH0:PialArea0 0.0046 0.001 5.996 0.000 0.003 0.006\n", + "Age:PialCurvH0:LGI0 -0.0006 0.000 -1.950 0.051 -0.001 2.88e-06\n", + "Age:Area0:PialArea0 -0.0007 0.000 -2.251 0.024 -0.001 -8.42e-05\n", + "Age:Area0:LGI0 0.0001 0.000 0.259 0.796 -0.001 0.001\n", + "Age:PialArea0:LGI0 -0.0003 0.000 -0.601 0.548 -0.001 0.001\n", + "Sulc0:CurvK0:CurvH0 0.0023 0.000 11.980 0.000 0.002 0.003\n", + "Sulc0:CurvK0:PialCurvK0 -0.0115 0.000 -30.406 0.000 -0.012 -0.011\n", + "Sulc0:CurvK0:PialCurvH0 -0.0057 0.001 -9.359 0.000 -0.007 -0.004\n", + "Sulc0:CurvK0:Area0 -0.0373 0.001 -51.817 0.000 -0.039 -0.036\n", + "Sulc0:CurvK0:PialArea0 0.0070 0.001 10.570 0.000 0.006 0.008\n", + 
"Sulc0:CurvK0:LGI0 -0.0022 0.000 -5.451 0.000 -0.003 -0.001\n", + "Sulc0:CurvH0:PialCurvK0 0.0024 0.001 4.439 0.000 0.001 0.003\n", + "Sulc0:CurvH0:PialCurvH0 0.0109 0.001 14.784 0.000 0.009 0.012\n", + "Sulc0:CurvH0:Area0 0.0047 0.001 7.863 0.000 0.004 0.006\n", + "Sulc0:CurvH0:PialArea0 -0.0736 0.001 -135.096 0.000 -0.075 -0.072\n", + "Sulc0:CurvH0:LGI0 -0.0025 0.000 -5.507 0.000 -0.003 -0.002\n", + "Sulc0:PialCurvK0:PialCurvH0 -0.0010 0.000 -8.762 0.000 -0.001 -0.001\n", + "Sulc0:PialCurvK0:Area0 0.0184 0.000 52.024 0.000 0.018 0.019\n", + "Sulc0:PialCurvK0:PialArea0 -0.0250 0.001 -28.204 0.000 -0.027 -0.023\n", + "Sulc0:PialCurvK0:LGI0 -0.0013 0.000 -4.001 0.000 -0.002 -0.001\n", + "Sulc0:PialCurvH0:Area0 -0.0330 0.000 -87.655 0.000 -0.034 -0.032\n", + "Sulc0:PialCurvH0:PialArea0 -0.0710 0.001 -90.453 0.000 -0.073 -0.069\n", + "Sulc0:PialCurvH0:LGI0 -0.0054 0.000 -14.313 0.000 -0.006 -0.005\n", + "Sulc0:Area0:PialArea0 0.0180 0.000 48.006 0.000 0.017 0.019\n", + "Sulc0:Area0:LGI0 -0.0144 0.000 -33.923 0.000 -0.015 -0.014\n", + "Sulc0:PialArea0:LGI0 0.0466 0.000 102.277 0.000 0.046 0.048\n", + "CurvK0:CurvH0:PialCurvK0 0.0011 0.000 7.882 0.000 0.001 0.001\n", + "CurvK0:CurvH0:PialCurvH0 -0.0031 0.000 -13.965 0.000 -0.004 -0.003\n", + "CurvK0:CurvH0:Area0 -0.0019 0.000 -5.947 0.000 -0.003 -0.001\n", + "CurvK0:CurvH0:PialArea0 -0.0102 0.000 -38.435 0.000 -0.011 -0.010\n", + "CurvK0:CurvH0:LGI0 0.0025 0.000 17.905 0.000 0.002 0.003\n", + "CurvK0:PialCurvK0:PialCurvH0 0.0031 0.000 26.367 0.000 0.003 0.003\n", + "CurvK0:PialCurvK0:Area0 -0.0140 0.001 -24.563 0.000 -0.015 -0.013\n", + "CurvK0:PialCurvK0:PialArea0 0.0190 0.001 23.023 0.000 0.017 0.021\n", + "CurvK0:PialCurvK0:LGI0 0.0002 0.000 0.613 0.540 -0.000 0.001\n", + "CurvK0:PialCurvH0:Area0 -0.0727 0.001 -90.603 0.000 -0.074 -0.071\n", + "CurvK0:PialCurvH0:PialArea0 0.0208 0.001 18.001 0.000 0.019 0.023\n", + "CurvK0:PialCurvH0:LGI0 0.0020 0.000 4.071 0.000 0.001 0.003\n", + "CurvK0:Area0:PialArea0 0.0009 0.000 2.971 0.003 0.000 0.002\n", + "CurvK0:Area0:LGI0 0.0196 0.001 30.804 0.000 0.018 0.021\n", + "CurvK0:PialArea0:LGI0 -0.0139 0.001 -27.324 0.000 -0.015 -0.013\n", + "CurvH0:PialCurvK0:PialCurvH0 0.0022 0.000 12.089 0.000 0.002 0.003\n", + "CurvH0:PialCurvK0:Area0 0.0700 0.001 86.041 0.000 0.068 0.072\n", + "CurvH0:PialCurvK0:PialArea0 -0.0563 0.001 -50.653 0.000 -0.058 -0.054\n", + "CurvH0:PialCurvK0:LGI0 -0.0018 0.000 -3.730 0.000 -0.003 -0.001\n", + "CurvH0:PialCurvH0:Area0 0.1860 0.001 194.524 0.000 0.184 0.188\n", + "CurvH0:PialCurvH0:PialArea0 -0.1857 0.001 -153.565 0.000 -0.188 -0.183\n", + "CurvH0:PialCurvH0:LGI0 -0.0097 0.001 -15.230 0.000 -0.011 -0.008\n", + "CurvH0:Area0:PialArea0 -0.0288 0.000 -82.492 0.000 -0.030 -0.028\n", + "CurvH0:Area0:LGI0 -0.0198 0.001 -31.639 0.000 -0.021 -0.019\n", + "CurvH0:PialArea0:LGI0 0.0086 0.001 15.554 0.000 0.008 0.010\n", + "PialCurvK0:PialCurvH0:Area0 -0.0006 0.000 -5.802 0.000 -0.001 -0.000\n", + "PialCurvK0:PialCurvH0:PialArea0 0.0383 0.000 108.995 0.000 0.038 0.039\n", + "PialCurvK0:PialCurvH0:LGI0 -6.878e-08 9.79e-05 -0.001 0.999 -0.000 0.000\n", + "PialCurvK0:Area0:PialArea0 0.0160 0.001 22.272 0.000 0.015 0.017\n", + "PialCurvK0:Area0:LGI0 -0.0048 0.000 -13.681 0.000 -0.005 -0.004\n", + "PialCurvK0:PialArea0:LGI0 0.0103 0.001 11.868 0.000 0.009 0.012\n", + "PialCurvH0:Area0:PialArea0 0.0863 0.001 140.913 0.000 0.085 0.087\n", + "PialCurvH0:Area0:LGI0 0.0190 0.000 52.148 0.000 0.018 0.020\n", + "PialCurvH0:PialArea0:LGI0 0.0059 0.001 7.812 0.000 0.004 0.007\n", + 
"Area0:PialArea0:LGI0 -0.0058 0.000 -20.366 0.000 -0.006 -0.005\n", + "==============================================================================\n", + "Omnibus: 446101.536 Durbin-Watson: 0.612\n", + "Prob(Omnibus): 0.000 Jarque-Bera (JB): 754981.664\n", + "Skew: 0.422 Prob(JB): 0.00\n", + "Kurtosis: 4.171 Cond. No. 86.8\n", + "==============================================================================\n", + "\n", + "Warnings:\n", + "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n" + ] + } + ], + "source": [ + "# create the formula \n", + "features = '(' + '+'.join(df_z.drop(df_z.filter(like = 'Thick'),axis = 1).columns) + ') ** 3'\n", + "formula = 'Thick' + sm + ' ~ ' + features\n", + "\n", + "# create the model\n", + "mod = smf.ols(formula=formula, data=df_z)\n", + "\n", + "results = mod.fit()\n", + "\n", + "print(results.summary())" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "# grab the p-value and beta values for each of the terms\n", + "pvals = results.pvalues\n", + "pvals.name = 'p.value'\n", + "\n", + "coefs = results.params\n", + "coefs.name = 'norm.slope'\n", + "\n", + "# concatenate the coefficients and p-values, and write to a csv\n", + "df_out = pd.concat([coefs,pvals],axis = 1)\n", + "df_out.index.name = 'term'\n", + "df_out.reset_index(inplace=True)\n", + "\n", + "outdir = projdir + '/results/' + dataset + '/lm/models'\n", + "if not os.path.isdir(outdir):\n", + " os.mkdir(outdir)\n", + "\n", + "filename = outdir + '/coefs/' + area + '-sm' + sm + '-lm-betas.csv'\n", + "df_out.to_csv(filename)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:ctmodel-ml]", + "language": "python", + "name": "conda-env-ctmodel-ml-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} -- GitLab