diff --git a/misc-scripts/cv-compare.R b/misc-scripts/cv-compare.R index 3a66a89aee943a0d87fa5160cb1fb7a6d5ac5900..bda60135d102c2a800ea84ad8f5cb260fc4981f2 100644 --- a/misc-scripts/cv-compare.R +++ b/misc-scripts/cv-compare.R @@ -3,44 +3,58 @@ # the GBDT error metrics, pick out the relevant columns from lm.df, and combine # them into a single cross validation matrix. From there, graphs comparing MAE # and RMSE across areas, smoothing kernels, and model types can be created +library(sciplot) +library(tidyverse) -lmpath <- "/data/user/mdefende/Projects/ctmodel-ml/model-py/cv-out/linear-models/" -lm.df <- list.files(path = lmpath, pattern = "*.csv", full.names = TRUE) %>% - sapply(read_csv, simplify=FALSE,n_max = 5) %>% + +lm.1200.path <- "/data/user/mdefende/Projects/ctmodel-ml/results/HCP1200/lm/cv/out" +lm.1200 <- list.files(path = lm.1200.path, pattern = "*.csv", full.names = TRUE) %>% + sapply(read_csv, simplify=FALSE,n_max = 10) %>% bind_rows() %>% - mutate(region = as.factor(region), - sm = factor(sm,levels = c(0,2,5,10), labels = c(0,2,5,10))) %>% - rename(area = 'region') %>% + mutate(area = as.factor(area), + sm = factor(sm,levels = c(0,2,5,10), labels = c(0,2,5,10)), + method = 'lm') %>% mutate_if(is.numeric, abs) %>% mutate_at(vars(contains('mse')), sqrt) %>% rename('test_rmse' = 'test_mse', 'train_rmse' = 'train_mse') %>% group_by(area,sm) %>% - summarize_if(is.numeric, funs(mean, se)) %>% + summarize_if(is.numeric, list(mean = mean, se = se)) %>% ungroup() %>% - mutate(method = 'lm') - + mutate(method = 'lm-1200') -GBDTpath <- "/data/user/mdefende/Projects/ctmodel-ml/model-py/cv-out/best-GBDT" -gbdt.df <- list.files(path = GBDTpath, pattern = "*.csv", full.names = TRUE) %>% +gbdt.1200.path <- "/data/user/mdefende/Projects/ctmodel-ml/results/HCP1200/gbdt/cv/out/best-gbdt" +gbdt.1200 <- list.files(path = gbdt.1200.path, pattern = "*.csv", full.names = TRUE) %>% sapply(read_csv, simplify=FALSE) %>% bind_rows() %>% mutate(area = as.factor(area), - sm = factor(sm,levels = c(0,2,5,10), labels = c(0,2,5,10)), - method = 'gbdt') + sm = factor(sm,levels = c(0,2,5,10), ordered = TRUE), + method = 'gbdt-1200') + +gbdt.786.path <- "/data/user/mdefende/Projects/ctmodel-ml/results/HCP786/gbdt/cv/out/best-gbdt" +gbdt.786 <- list.files(path = gbdt.786.path, pattern = "*.csv", full.names = TRUE) %>% + sapply(read_csv, simplify=FALSE) %>% + bind_rows() %>% + mutate(area = str_replace_all(area,'_','-'), + area = as.factor(area), + sm = factor(sm,levels = c(0,2,5,10), ordered = TRUE), + method = 'gbdt-786') -cv.df <- lm.df %>% - select(method,area,sm,test_mae_mean,test_rmse_mean) %>% +cv.df <- lm.1200 %>% + dplyr::select(method,area,sm,test_mae_mean,test_rmse_mean) %>% rename(test_mae = test_mae_mean, test_rmse = test_rmse_mean) %>% - bind_rows(.,select(gbdt.df,-contains('train'))) %>% - mutate(method = factor(method)) + bind_rows(.,dplyr::select(gbdt.1200,area,sm,contains('test'),method)) %>% + bind_rows(.,dplyr::select(gbdt.786,area,sm,contains('test'),method)) %>% + mutate(method = factor(method), + area = factor(area)) %>% + arrange(method,area,sm) # Variable to relabel facets method_labels <- c(BA45 = "Brodmann Area 45", dACC = "dorsal Anterior Cingulate Cortex", - V1_withRet = "V1 with Retinotopic Factors", - V1_noRet = "V1 with no Retinotopic Factors") + V1-withRet = "V1 with Retinotopic Factors", + V1-noRet = "V1 with no Retinotopic Factors") cv.df %>% @@ -48,7 +62,7 @@ cv.df %>% geom_point(size = 3) + geom_line(aes(group = method)) + facet_wrap(~area, labeller = labeller(area = method_labels)) + - scale_color_discrete(name = "Model Type", labels = c("GBDT", "Linear")) + + scale_color_discrete(name = "Model Type") + theme_bw() + ggtitle("Linear Model and GBDT Root Mean Square Error") + ylab("RMSE (mm)") + diff --git a/model-py/batch/create_GBDT_cv_jobs.m b/model-py/batch/create_GBDT_cv_jobs.m index 174086b0962a52f1509389f8f891c19fbdbfdf98..5a89247e39ddeffa0a3efa14f85713ba6e776b63 100644 --- a/model-py/batch/create_GBDT_cv_jobs.m +++ b/model-py/batch/create_GBDT_cv_jobs.m @@ -1,12 +1,25 @@ -datafile = '/data/user/mdefende/Projects/ctmodel-ml/data/HCP1200-dACC.csv'; +% Add project to path +addpath(genpath('/data/user/mdefende/Projects/ctmodel-ml')) + +% initial parameters +projdir = '/data/user/mdefende/Projects/ctmodel-ml'; +dataset = 'HCP1200'; area = 'dACC'; -eta = [0.01,0.05,0.1:0.1:0.5]; -max_depth = 3:10; -nboost = 1000:1000:15000; -sm = 5; +do_cv = 0; +make_model = 1; + +if make_model + jobdir = fullfile(projdir,'results',dataset,'gbdt','models','jobs',area); +else + jobdir = fullfile(projdir,'results',dataset,'gbdt','cv','jobs',area); +end + +eta = [0.01,0.05,0.1,0.2,0.3]; +max_depth = 6:10; +nboost = 15000; +sm = [0,2,10]; A = combvec(eta,max_depth,nboost,sm)'; -jobdir = ['/data/user/mdefende/Projects/ctmodel-ml/Results/HCP1200/cv-jobs/' area]; if ~exist(jobdir,'dir') mkdir(jobdir) end @@ -14,9 +27,9 @@ end for ii = 1:size(A,1) jobname = [area '_eta' num2str(A(ii,1)) '_depth' num2str(A(ii,2)) '_nboost' num2str(A(ii,3)) '_sm' num2str(A(ii,4)) '.sh']; - write_cv_script(jobname,jobdir,area,A(ii,1),A(ii,2),A(ii,3),A(ii,4),0,datafile) + write_cv_script(jobname,jobdir,projdir,area,A(ii,1),A(ii,2),A(ii,3),A(ii,4),do_cv,make_model,dataset) - if ~exist(['/data/user/mdefende/Projects/ctmodel-ml/Results/HCP1200/cv-out/' area '/' jobname(1:end-3) '.csv'],'file') + if ~exist(fullfile(projdir,'results',dataset,'gbdt','cv','out',area,[jobname(1:end-3) '.csv']),'file') system(['sbatch ' fullfile(jobdir,jobname)]); %disp(jobname) end diff --git a/model-py/batch/create_GBDT_cv_jobs.m~ b/model-py/batch/create_GBDT_cv_jobs.m~ new file mode 100644 index 0000000000000000000000000000000000000000..bf445cf86e14260b7e8e2b2b778d7758fd9096ab --- /dev/null +++ b/model-py/batch/create_GBDT_cv_jobs.m~ @@ -0,0 +1,30 @@ +% Add project to path +addpath(genpath('/data/user/mdefende/Projects/ctmodel-ml')) + +% initial parameters +projdir = '/data/user/mdefende/Projects/ctmodel-ml'; +dataset = 'HCP1200'; +area = 'V1-withRet'; +make_model = 1; + +eta = [0.01,0.05,0.1,0.2,0.3]; +max_depth = 6:10; +nboost = 15000; +sm = [0,2,10]; + +A = combvec(eta,max_depth,nboost,sm)'; +jobdir = fullfile(projdir,'results',dataset,'cv','jobs'); +if ~exist(jobdir,'dir') + mkdir(jobdir) +end + +for ii = 1:size(A,1) + jobname = [area '_eta' num2str(A(ii,1)) '_depth' num2str(A(ii,2)) '_nboost' num2str(A(ii,3)) '_sm' num2str(A(ii,4)) '.sh']; + + write_cv_script(jobname,jobdir,projdir,area,A(ii,1),A(ii,2),A(ii,3),A(ii,4),0,datafile) + + if ~exist(fullfile(projdir,'results',dataset,'gbdt','cv','out',area,[jobname(1:end-3) '.csv']),'file') + system(['sbatch ' fullfile(jobdir,jobname)]); + %disp(jobname) + end +end \ No newline at end of file diff --git a/model-py/batch/create_lm_model_jobs.m b/model-py/batch/create_lm_model_jobs.m index b954908857f05a142c13d696567d06ceee3f3c8e..80c0d7bc9f45142fc6e98f3a90353fa403cef8e9 100644 --- a/model-py/batch/create_lm_model_jobs.m +++ b/model-py/batch/create_lm_model_jobs.m @@ -15,8 +15,8 @@ for ii = 1:length(areas) fprintf(fid,'#!/bin/bash\n'); fprintf(fid,'#SBATCH --partition=express\n'); fprintf(fid,'#SBATCH --time=1:00:00\n'); - fprintf(fid,'#SBATCH --ntasks=1\n'); - fprintf(fid,'#SBATCH --mem-per-cpu=150G\n'); + fprintf(fid,'#SBATCH -ks=1\n'); + fprintf(fid,'#SBATCH --mem-per-cpu=200G\n'); fprintf(fid,['#SBATCH --output=' fullfile(jobdir,jobname(1:end-3)) '.txt\n\n']); fprintf(fid,'module load Anaconda3/5.3.0\n'); diff --git a/model-py/batch/create_model_jobs.m b/model-py/batch/create_model_jobs.m index d28eb3703a892272e9395096b8f85b305860bc10..4de631cfc045d552b2c93c49264a22ea13e46edc 100644 --- a/model-py/batch/create_model_jobs.m +++ b/model-py/batch/create_model_jobs.m @@ -4,27 +4,22 @@ addpath(genpath('/data/user/mdefende/Projects/ctmodel-ml')) % initial parameters projdir = '/data/user/mdefende/Projects/ctmodel-ml'; dataset = 'HCP1200'; -area = 'V1-noRet'; +area = 'all'; +do_cv = 0; make_model = 1; % Say where the cv outputs are -cv_out = fullfile(projdir,'results',dataset,'cv','out'); +cv_out = fullfile(projdir,'results',dataset,'gbdt','cv','out','best-gbdt.csv'); +T = readtable(cv_out); -% What cv parameter set are we going to keep -eta = round([0.01,0.05,0.1:0.1:0.5],2); % There are problems with 0.3 when created by the : operator -max_depth = 3:10; -nboost = 1000:1000:15000; -sm = 5; - -A = check_cv_new(fullfile(cv_out,area),eta,max_depth,nboost,sm); - -% Set the different smoothing kernels -sm = [0,2,5,10]; +if ~strcmp(area,'all') + T(~strcmp(T.area,area),:) = []; +end -jobdir = fullfile(projdir,'results',dataset,'models','jobs'); +jobdir = fullfile(projdir,'results',dataset,'gbdt','models','jobs'); -for ii = 1:length(sm) - jobname = [area '_eta' num2str(A(2)) '_depth' num2str(A(3)) '_nboost' num2str(A(5) + 1) '_sm' num2str(sm(ii)) '.sh']; - write_cv_script(jobname,jobdir,projdir,area,A(2),A(3),A(5)+1,sm(ii),make_model,dataset) +for ii = 1:height(T) + jobname = [T.area{ii} '_eta' num2str(T.eta(ii)) '_depth' num2str(T.depth(ii)) '_nboost' num2str(T.cv_index(ii) + 1) '_sm' num2str(T.sm(ii)) '.sh']; + write_cv_script(jobname,jobdir,projdir,T.area{ii},T.eta(ii),T.depth(ii),T.cv_index(ii)+1,T.sm(ii),do_cv,make_model,dataset) system(['sbatch ' fullfile(jobdir,jobname)]); end diff --git a/model-py/batch/create_model_jobs.m~ b/model-py/batch/create_model_jobs.m~ new file mode 100644 index 0000000000000000000000000000000000000000..4421c77c6d3c89d142d8c4f65eb06b169acef289 --- /dev/null +++ b/model-py/batch/create_model_jobs.m~ @@ -0,0 +1,24 @@ +% Add project to path +addpath(genpath('/data/user/mdefende/Projects/ctmodel-ml')) + +% initial parameters +projdir = '/data/user/mdefende/Projects/ctmodel-ml'; +dataset = 'HCP1200'; +area = 'V1-noRet'; +do_cv = 0; +make_model = 1; + +% Say where the cv outputs are +cv_out = fullfile(projdir,'results',dataset,'gbdt','cv','out','best-gbdt.csv'); +T = readtable(cv_out); + +% Set the different smoothing kernels +sm = [0,2,5,10]; + +jobdir = fullfile(projdir,'results',dataset,'models','jobs'); + +for ii = 1:length(sm) + jobname = [area '_eta' num2str(A(2)) '_depth' num2str(A(3)) '_nboost' num2str(A(5) + 1) '_sm' num2str(sm(ii)) '.sh']; + write_cv_script(jobname,jobdir,projdir,area,A(2),A(3),A(5)+1,sm(ii),make_model,dataset) + system(['sbatch ' fullfile(jobdir,jobname)]); +end diff --git a/model-py/batch/write_cv_script.m b/model-py/batch/write_cv_script.m index 69cea3da8fa5ecf36455fe177c73d7b86bbc3739..8cf0528c022a03dfd491eb1bdc3f09f2623678de 100644 --- a/model-py/batch/write_cv_script.m +++ b/model-py/batch/write_cv_script.m @@ -1,4 +1,4 @@ -function write_cv_script(jobname,jobdir,projdir,area,eta,maxdepth,nboost,sm,create_model,dataset) +function write_cv_script(jobname,jobdir,projdir,area,eta,maxdepth,nboost,sm,do_cv,create_model,dataset) fid = fopen(fullfile(jobdir,jobname),'w'); fprintf(fid,'#!/bin/bash\n'); fprintf(fid,'#SBATCH --partition=pascalnodes\n'); @@ -14,10 +14,12 @@ fprintf(fid,'source activate ctmodel-ml\n\n'); fprintf(fid,'cd /data/user/mdefende/Projects/ctmodel-ml/model-py\n\n'); -fprintf(fid,['python gbdt-cv.py ' num2str(eta) ' ' num2str(maxdepth) ' ' num2str(nboost) ' ' num2str(sm) ' ' area ' ' dataset ' ' projdir '\n']); - +if do_cv + fprintf(fid,['python3.6 gbdt-cv.py ' num2str(eta) ' ' num2str(maxdepth) ' ' num2str(nboost) ' ' num2str(sm) ' ' area ' ' dataset ' ' projdir '\n']); +end + if create_model - fprintf(fid,['python create-GBDT-model.py ' num2str(eta) ' ' num2str(maxdepth) ' ' num2str(nboost) ' ' num2str(sm) ' ' area ' ' dataset ' ' projdir '\n']); + fprintf(fid,['python3.6 create-GBDT-model.py ' num2str(eta) ' ' num2str(maxdepth) ' ' num2str(nboost) ' ' num2str(sm) ' ' area ' ' dataset ' ' projdir '\n']); end fclose(fid); diff --git a/model-py/gbdt-cv.py b/model-py/gbdt-cv.py index d7f6429abdfa8af96ef9a08c7ab197b581a7a118..379a59de33e83db44fc1412bc963db80cb755f1a 100644 --- a/model-py/gbdt-cv.py +++ b/model-py/gbdt-cv.py @@ -71,7 +71,9 @@ cv_results = xgb.cv(params = param, dtrain = dfull, num_boost_round = nboost_val idx = cv_results['test-mae-mean'].idxmin() # Write out the cv results to a csv file -cv_dict = {'eta': eta_val, +cv_dict = {'area': area, + 'sm': sm, + 'eta': eta_val, 'depth': max_depth_val, 'nboost': nboost_val, 'cv_index': idx, diff --git a/model-py/lm-betas.py b/model-py/lm-betas.py index f4babe867c05908d6679dfa3e73b2f552e95d574..efe3831db3a9e7719f5dc7fd6141cd8323f713e5 100644 --- a/model-py/lm-betas.py +++ b/model-py/lm-betas.py @@ -86,7 +86,7 @@ del df_pred df_x = calculate_vif_(df_x,df_y) # combine back into single dataframe and apply z-transform -df_z = pd.concat([df_x,df_y]).apply(stats.zscore) +df_z = pd.concat([df_x,df_y],axis = 1).apply(stats.zscore, axis = 0) # create the formula features = '(' + '+'.join(df_z.drop(df_z.filter(like = 'Thick'),axis = 1).columns) + ') ** 3' @@ -100,7 +100,7 @@ results = mod.fit() print(results.summary()) # grab the p-value and beta values for each of the terms -pvals = reg.pvalues +pvals = results.pvalues pvals.name = 'p.value' coefs = results.params diff --git a/model-py/notebooks/Untitled.ipynb b/model-py/notebooks/Untitled.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..ae5f35c2676fad98f626b2c05f81f3af65a82d22 --- /dev/null +++ b/model-py/notebooks/Untitled.ipynb @@ -0,0 +1,391 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3.6.6 |Anaconda, Inc.| (default, Jun 28 2018, 17:14:51) \n", + "[GCC 7.2.0]\n" + ] + } + ], + "source": [ + "# basic libraries to load\n", + "import os\n", + "import sys\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "# import model generation and analysis\n", + "from patsy import dmatrices\n", + "from statsmodels.stats.outliers_influence import variance_inflation_factor\n", + "import statsmodels.api as sm\n", + "import statsmodels.formula.api as smf\n", + "from scipy import stats\n", + "\n", + "# print the current python version\n", + "print(sys.version)\n", + "\n", + "# Define a function to repeatedly calculate variance inflation factor and drop \n", + "def calculate_vif_(df_x, df_y, thresh=10.0):\n", + " # Set up the while loop. While we have something to drop, do ... \n", + " dropped = True\n", + " while dropped:\n", + " # get a list of the column names\n", + " variables = list(df_x.columns)\n", + "\n", + " # concatenate the column names into a string separated by '+'\n", + " features = '+'.join(variables)\n", + " \n", + " # re-concatenate the dataframes for creating the design matrices\n", + " df_tmp = pd.concat([df_x,df_y],axis = 1)\n", + " \n", + " y, X = dmatrices(df_y.columns[0] + ' ~ ' + features, df_tmp, return_type='dataframe')\n", + " vif = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]\n", + " vif = vif[1:len(vif)]\n", + " maxloc = vif.index(max(vif))\n", + "\n", + " thresh = 10 \n", + " if max(vif) > thresh:\n", + " print('dropping \\'' + variables[maxloc] +\n", + " '\\' at index: ' + str(maxloc) + ' with VIF:' + str(vif[maxloc]))\n", + " df_x = df_x.drop(variables[maxloc],axis=1)\n", + " else:\n", + " dropped = False\n", + "\n", + " print('Remaining variables:')\n", + " print(df_x.columns)\n", + " return df_x" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# import parameters from command line\n", + "projdir = '/data/user/mdefende/Projects/ctmodel-ml'\n", + "dataset = 'HCP1200'\n", + "area = 'BA45'\n", + "sm = '0'" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# First load data\n", + "if 'V1' in area:\n", + " data_file = projdir + '/data/' + dataset + '-V1.csv'\n", + "else:\n", + " data_file = projdir + '/data/' + dataset + '-' + area + '.csv'\n", + "\n", + "df = pd.read_csv(data_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Index(['Age', 'Sulc0', 'CurvK0', 'CurvH0', 'PialCurvK0', 'PialCurvH0', 'Area0',\n", + " 'MidArea0', 'PialArea0', 'LGI0'],\n", + " dtype='object'), Index(['Thick0'], dtype='object')]\n" + ] + } + ], + "source": [ + "# Do initial feature selection by the smoothing kernel\n", + "df_pred = df.filter(like=sm)\n", + "df_pred = df_pred[df_pred.columns.drop(list(df_pred.filter(like='Thick')))]\n", + "df_pred = df_pred[df_pred.columns.drop(list(df_pred.filter(like='Sulc')))]\n", + "\n", + "# If sm == 0, remove all columns with 10. Else remove column with name Sulc + sm\n", + "if sm == '0':\n", + " df_pred = df_pred[df_pred.columns.drop(list(df_pred.filter(regex='10')))]\n", + "\n", + "# Grab the rest of the features (Age, Ecc, Pol, and unsmoothed Sulc) and concatenate them with what remains from df_pred. Only grab Ecc and Pol if withRet is designated in the area. Then grab the correct thickness column\n", + "if 'withRet' in area:\n", + " df_x = pd.concat([df[['Age','Ecc','Pol','Sulc0']],df_pred], axis = 1)\n", + "else:\n", + " df_x = pd.concat([df[['Age','Sulc0']],df_pred], axis = 1)\n", + "\n", + "df_y = df[['Thick' + sm]]\n", + "\n", + "print([df_x.columns,df_y.columns])\n", + "\n", + "# remove df_pred from memory\n", + "del df_pred" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "dropping 'MidArea0' at index: 7 with VIF:272945431961848.25\n", + "Remaining variables:\n", + "Index(['Age', 'Sulc0', 'CurvK0', 'CurvH0', 'PialCurvK0', 'PialCurvH0', 'Area0',\n", + " 'PialArea0', 'LGI0'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "# calculate vif repeatedly removing features with a score above 10\n", + "df_x = calculate_vif_(df_x,df_y)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# combine back into single dataframe and apply z-transform\n", + "df_z = pd.concat([df_x,df_y],axis = 1).apply(stats.zscore, axis = 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " OLS Regression Results \n", + "==============================================================================\n", + "Dep. Variable: Thick0 R-squared: 0.411\n", + "Model: OLS Adj. R-squared: 0.411\n", + "Method: Least Squares F-statistic: 4.700e+04\n", + "Date: Thu, 12 Dec 2019 Prob (F-statistic): 0.00\n", + "Time: 17:23:52 Log-Likelihood: -1.0049e+07\n", + "No. Observations: 8702966 AIC: 2.010e+07\n", + "Df Residuals: 8702836 BIC: 2.010e+07\n", + "Df Model: 129 \n", + "Covariance Type: nonrobust \n", + "===================================================================================================\n", + " coef std err t P>|t| [0.025 0.975]\n", + "---------------------------------------------------------------------------------------------------\n", + "Intercept -0.3263 0.001 -612.789 0.000 -0.327 -0.325\n", + "Age -0.0375 0.000 -79.024 0.000 -0.038 -0.037\n", + "Sulc0 -0.1811 0.001 -293.381 0.000 -0.182 -0.180\n", + "CurvK0 -0.1594 0.001 -171.857 0.000 -0.161 -0.158\n", + "CurvH0 0.1093 0.001 132.603 0.000 0.108 0.111\n", + "PialCurvK0 -0.2774 0.001 -268.438 0.000 -0.279 -0.275\n", + "PialCurvH0 -0.9025 0.001 -901.832 0.000 -0.904 -0.900\n", + "Area0 -0.3285 0.001 -503.063 0.000 -0.330 -0.327\n", + "PialArea0 0.2825 0.001 363.814 0.000 0.281 0.284\n", + "LGI0 0.1225 0.000 257.511 0.000 0.122 0.123\n", + "Age:Sulc0 0.0057 0.000 13.697 0.000 0.005 0.007\n", + "Age:CurvK0 0.0065 0.001 10.001 0.000 0.005 0.008\n", + "Age:CurvH0 -0.0094 0.001 -14.927 0.000 -0.011 -0.008\n", + "Age:PialCurvK0 0.0049 0.001 5.913 0.000 0.003 0.006\n", + "Age:PialCurvH0 0.0098 0.001 13.275 0.000 0.008 0.011\n", + "Age:Area0 -0.0025 0.001 -4.705 0.000 -0.004 -0.001\n", + "Age:PialArea0 0.0075 0.001 11.351 0.000 0.006 0.009\n", + "Age:LGI0 0.0048 0.000 18.364 0.000 0.004 0.005\n", + "Sulc0:CurvK0 -0.0456 0.001 -55.149 0.000 -0.047 -0.044\n", + "Sulc0:CurvH0 -0.1505 0.001 -199.646 0.000 -0.152 -0.149\n", + "Sulc0:PialCurvK0 0.0103 0.001 10.468 0.000 0.008 0.012\n", + "Sulc0:PialCurvH0 -0.1050 0.001 -105.627 0.000 -0.107 -0.103\n", + "Sulc0:Area0 0.1350 0.001 246.087 0.000 0.134 0.136\n", + "Sulc0:PialArea0 -0.1482 0.001 -238.676 0.000 -0.149 -0.147\n", + "Sulc0:LGI0 0.1658 0.000 406.482 0.000 0.165 0.167\n", + "CurvK0:CurvH0 -0.0150 0.000 -44.477 0.000 -0.016 -0.014\n", + "CurvK0:PialCurvK0 0.0035 0.001 4.064 0.000 0.002 0.005\n", + "CurvK0:PialCurvH0 -0.1066 0.001 -91.949 0.000 -0.109 -0.104\n", + "CurvK0:Area0 -0.0776 0.001 -86.855 0.000 -0.079 -0.076\n", + "CurvK0:PialArea0 -0.0078 0.001 -7.268 0.000 -0.010 -0.006\n", + "CurvK0:LGI0 0.0266 0.001 40.871 0.000 0.025 0.028\n", + "CurvH0:PialCurvK0 0.0742 0.001 68.673 0.000 0.072 0.076\n", + "CurvH0:PialCurvH0 0.1708 0.001 149.031 0.000 0.169 0.173\n", + "CurvH0:Area0 0.2686 0.001 344.325 0.000 0.267 0.270\n", + "CurvH0:PialArea0 -0.2880 0.001 -311.514 0.000 -0.290 -0.286\n", + "CurvH0:LGI0 -0.0515 0.001 -81.852 0.000 -0.053 -0.050\n", + "PialCurvK0:PialCurvH0 0.0442 0.000 118.913 0.000 0.043 0.045\n", + "PialCurvK0:Area0 -0.0769 0.001 -75.549 0.000 -0.079 -0.075\n", + "PialCurvK0:PialArea0 -0.0187 0.001 -16.214 0.000 -0.021 -0.016\n", + "PialCurvK0:LGI0 0.0114 0.001 14.062 0.000 0.010 0.013\n", + "PialCurvH0:Area0 0.0604 0.001 59.040 0.000 0.058 0.062\n", + "PialCurvH0:PialArea0 -0.3898 0.001 -323.395 0.000 -0.392 -0.387\n", + "PialCurvH0:LGI0 0.0321 0.001 43.407 0.000 0.031 0.034\n", + "Area0:PialArea0 0.0084 0.000 24.263 0.000 0.008 0.009\n", + "Area0:LGI0 -0.0329 0.001 -62.457 0.000 -0.034 -0.032\n", + "PialArea0:LGI0 0.0755 0.001 115.907 0.000 0.074 0.077\n", + "Age:Sulc0:CurvK0 -0.0024 0.000 -5.566 0.000 -0.003 -0.002\n", + "Age:Sulc0:CurvH0 -0.0016 0.000 -3.502 0.000 -0.003 -0.001\n", + "Age:Sulc0:PialCurvK0 8.591e-05 0.000 0.272 0.786 -0.001 0.001\n", + "Age:Sulc0:PialCurvH0 0.0007 0.000 1.866 0.062 -3.33e-05 0.001\n", + "Age:Sulc0:Area0 -0.0005 0.000 -1.096 0.273 -0.001 0.000\n", + "Age:Sulc0:PialArea0 0.0049 0.000 10.699 0.000 0.004 0.006\n", + "Age:Sulc0:LGI0 -0.0002 0.000 -0.506 0.613 -0.001 0.001\n", + "Age:CurvK0:CurvH0 0.0007 0.000 4.978 0.000 0.000 0.001\n", + "Age:CurvK0:PialCurvK0 5.093e-05 0.000 0.164 0.870 -0.001 0.001\n", + "Age:CurvK0:PialCurvH0 0.0013 0.000 2.578 0.010 0.000 0.002\n", + "Age:CurvK0:Area0 0.0054 0.001 8.538 0.000 0.004 0.007\n", + "Age:CurvK0:PialArea0 -0.0033 0.001 -6.312 0.000 -0.004 -0.002\n", + "Age:CurvK0:LGI0 -0.0012 0.000 -4.331 0.000 -0.002 -0.001\n", + "Age:CurvH0:PialCurvK0 -0.0031 0.000 -6.480 0.000 -0.004 -0.002\n", + "Age:CurvH0:PialCurvH0 -0.0053 0.001 -8.327 0.000 -0.007 -0.004\n", + "Age:CurvH0:Area0 -0.0034 0.001 -5.349 0.000 -0.005 -0.002\n", + "Age:CurvH0:PialArea0 -0.0023 0.001 -4.130 0.000 -0.003 -0.001\n", + "Age:CurvH0:LGI0 -0.0003 0.000 -1.043 0.297 -0.001 0.000\n", + "Age:PialCurvK0:PialCurvH0 6.689e-05 8.98e-05 0.745 0.456 -0.000 0.000\n", + "Age:PialCurvK0:Area0 -0.0003 0.000 -0.840 0.401 -0.001 0.000\n", + "Age:PialCurvK0:PialArea0 0.0029 0.001 3.363 0.001 0.001 0.005\n", + "Age:PialCurvK0:LGI0 -0.0012 0.000 -4.195 0.000 -0.002 -0.001\n", + "Age:PialCurvH0:Area0 0.0012 0.000 3.313 0.001 0.000 0.002\n", + "Age:PialCurvH0:PialArea0 0.0046 0.001 5.996 0.000 0.003 0.006\n", + "Age:PialCurvH0:LGI0 -0.0006 0.000 -1.950 0.051 -0.001 2.88e-06\n", + "Age:Area0:PialArea0 -0.0007 0.000 -2.251 0.024 -0.001 -8.42e-05\n", + "Age:Area0:LGI0 0.0001 0.000 0.259 0.796 -0.001 0.001\n", + "Age:PialArea0:LGI0 -0.0003 0.000 -0.601 0.548 -0.001 0.001\n", + "Sulc0:CurvK0:CurvH0 0.0023 0.000 11.980 0.000 0.002 0.003\n", + "Sulc0:CurvK0:PialCurvK0 -0.0115 0.000 -30.406 0.000 -0.012 -0.011\n", + "Sulc0:CurvK0:PialCurvH0 -0.0057 0.001 -9.359 0.000 -0.007 -0.004\n", + "Sulc0:CurvK0:Area0 -0.0373 0.001 -51.817 0.000 -0.039 -0.036\n", + "Sulc0:CurvK0:PialArea0 0.0070 0.001 10.570 0.000 0.006 0.008\n", + "Sulc0:CurvK0:LGI0 -0.0022 0.000 -5.451 0.000 -0.003 -0.001\n", + "Sulc0:CurvH0:PialCurvK0 0.0024 0.001 4.439 0.000 0.001 0.003\n", + "Sulc0:CurvH0:PialCurvH0 0.0109 0.001 14.784 0.000 0.009 0.012\n", + "Sulc0:CurvH0:Area0 0.0047 0.001 7.863 0.000 0.004 0.006\n", + "Sulc0:CurvH0:PialArea0 -0.0736 0.001 -135.096 0.000 -0.075 -0.072\n", + "Sulc0:CurvH0:LGI0 -0.0025 0.000 -5.507 0.000 -0.003 -0.002\n", + "Sulc0:PialCurvK0:PialCurvH0 -0.0010 0.000 -8.762 0.000 -0.001 -0.001\n", + "Sulc0:PialCurvK0:Area0 0.0184 0.000 52.024 0.000 0.018 0.019\n", + "Sulc0:PialCurvK0:PialArea0 -0.0250 0.001 -28.204 0.000 -0.027 -0.023\n", + "Sulc0:PialCurvK0:LGI0 -0.0013 0.000 -4.001 0.000 -0.002 -0.001\n", + "Sulc0:PialCurvH0:Area0 -0.0330 0.000 -87.655 0.000 -0.034 -0.032\n", + "Sulc0:PialCurvH0:PialArea0 -0.0710 0.001 -90.453 0.000 -0.073 -0.069\n", + "Sulc0:PialCurvH0:LGI0 -0.0054 0.000 -14.313 0.000 -0.006 -0.005\n", + "Sulc0:Area0:PialArea0 0.0180 0.000 48.006 0.000 0.017 0.019\n", + "Sulc0:Area0:LGI0 -0.0144 0.000 -33.923 0.000 -0.015 -0.014\n", + "Sulc0:PialArea0:LGI0 0.0466 0.000 102.277 0.000 0.046 0.048\n", + "CurvK0:CurvH0:PialCurvK0 0.0011 0.000 7.882 0.000 0.001 0.001\n", + "CurvK0:CurvH0:PialCurvH0 -0.0031 0.000 -13.965 0.000 -0.004 -0.003\n", + "CurvK0:CurvH0:Area0 -0.0019 0.000 -5.947 0.000 -0.003 -0.001\n", + "CurvK0:CurvH0:PialArea0 -0.0102 0.000 -38.435 0.000 -0.011 -0.010\n", + "CurvK0:CurvH0:LGI0 0.0025 0.000 17.905 0.000 0.002 0.003\n", + "CurvK0:PialCurvK0:PialCurvH0 0.0031 0.000 26.367 0.000 0.003 0.003\n", + "CurvK0:PialCurvK0:Area0 -0.0140 0.001 -24.563 0.000 -0.015 -0.013\n", + "CurvK0:PialCurvK0:PialArea0 0.0190 0.001 23.023 0.000 0.017 0.021\n", + "CurvK0:PialCurvK0:LGI0 0.0002 0.000 0.613 0.540 -0.000 0.001\n", + "CurvK0:PialCurvH0:Area0 -0.0727 0.001 -90.603 0.000 -0.074 -0.071\n", + "CurvK0:PialCurvH0:PialArea0 0.0208 0.001 18.001 0.000 0.019 0.023\n", + "CurvK0:PialCurvH0:LGI0 0.0020 0.000 4.071 0.000 0.001 0.003\n", + "CurvK0:Area0:PialArea0 0.0009 0.000 2.971 0.003 0.000 0.002\n", + "CurvK0:Area0:LGI0 0.0196 0.001 30.804 0.000 0.018 0.021\n", + "CurvK0:PialArea0:LGI0 -0.0139 0.001 -27.324 0.000 -0.015 -0.013\n", + "CurvH0:PialCurvK0:PialCurvH0 0.0022 0.000 12.089 0.000 0.002 0.003\n", + "CurvH0:PialCurvK0:Area0 0.0700 0.001 86.041 0.000 0.068 0.072\n", + "CurvH0:PialCurvK0:PialArea0 -0.0563 0.001 -50.653 0.000 -0.058 -0.054\n", + "CurvH0:PialCurvK0:LGI0 -0.0018 0.000 -3.730 0.000 -0.003 -0.001\n", + "CurvH0:PialCurvH0:Area0 0.1860 0.001 194.524 0.000 0.184 0.188\n", + "CurvH0:PialCurvH0:PialArea0 -0.1857 0.001 -153.565 0.000 -0.188 -0.183\n", + "CurvH0:PialCurvH0:LGI0 -0.0097 0.001 -15.230 0.000 -0.011 -0.008\n", + "CurvH0:Area0:PialArea0 -0.0288 0.000 -82.492 0.000 -0.030 -0.028\n", + "CurvH0:Area0:LGI0 -0.0198 0.001 -31.639 0.000 -0.021 -0.019\n", + "CurvH0:PialArea0:LGI0 0.0086 0.001 15.554 0.000 0.008 0.010\n", + "PialCurvK0:PialCurvH0:Area0 -0.0006 0.000 -5.802 0.000 -0.001 -0.000\n", + "PialCurvK0:PialCurvH0:PialArea0 0.0383 0.000 108.995 0.000 0.038 0.039\n", + "PialCurvK0:PialCurvH0:LGI0 -6.878e-08 9.79e-05 -0.001 0.999 -0.000 0.000\n", + "PialCurvK0:Area0:PialArea0 0.0160 0.001 22.272 0.000 0.015 0.017\n", + "PialCurvK0:Area0:LGI0 -0.0048 0.000 -13.681 0.000 -0.005 -0.004\n", + "PialCurvK0:PialArea0:LGI0 0.0103 0.001 11.868 0.000 0.009 0.012\n", + "PialCurvH0:Area0:PialArea0 0.0863 0.001 140.913 0.000 0.085 0.087\n", + "PialCurvH0:Area0:LGI0 0.0190 0.000 52.148 0.000 0.018 0.020\n", + "PialCurvH0:PialArea0:LGI0 0.0059 0.001 7.812 0.000 0.004 0.007\n", + "Area0:PialArea0:LGI0 -0.0058 0.000 -20.366 0.000 -0.006 -0.005\n", + "==============================================================================\n", + "Omnibus: 446101.536 Durbin-Watson: 0.612\n", + "Prob(Omnibus): 0.000 Jarque-Bera (JB): 754981.664\n", + "Skew: 0.422 Prob(JB): 0.00\n", + "Kurtosis: 4.171 Cond. No. 86.8\n", + "==============================================================================\n", + "\n", + "Warnings:\n", + "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n" + ] + } + ], + "source": [ + "# create the formula \n", + "features = '(' + '+'.join(df_z.drop(df_z.filter(like = 'Thick'),axis = 1).columns) + ') ** 3'\n", + "formula = 'Thick' + sm + ' ~ ' + features\n", + "\n", + "# create the model\n", + "mod = smf.ols(formula=formula, data=df_z)\n", + "\n", + "results = mod.fit()\n", + "\n", + "print(results.summary())" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "# grab the p-value and beta values for each of the terms\n", + "pvals = results.pvalues\n", + "pvals.name = 'p.value'\n", + "\n", + "coefs = results.params\n", + "coefs.name = 'norm.slope'\n", + "\n", + "# concatenate the coefficients and p-values, and write to a csv\n", + "df_out = pd.concat([coefs,pvals],axis = 1)\n", + "df_out.index.name = 'term'\n", + "df_out.reset_index(inplace=True)\n", + "\n", + "outdir = projdir + '/results/' + dataset + '/lm/models'\n", + "if not os.path.isdir(outdir):\n", + " os.mkdir(outdir)\n", + "\n", + "filename = outdir + '/coefs/' + area + '-sm' + sm + '-lm-betas.csv'\n", + "df_out.to_csv(filename)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:ctmodel-ml]", + "language": "python", + "name": "conda-env-ctmodel-ml-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}