#!/bin/bash
#
# Split a GPFS policy log into fixed-size parts and gzip each part, by
# submitting a single Slurm batch job.
#
# The parts are written to "<basename of log without .gz>.d/list-NNN" in the
# current working directory. Slurm stdout/stderr go to ./out and ./err, which
# are created here if they do not exist. Gzip-compressed logs are detected by
# MIME type and read with zcat instead of cat.
#
# Usage: split-info-file.sh [options] log      (see --help)

set -euo pipefail

############################################################
# Default Values                                           #
############################################################

ntasks=4
mem="16G"
time="12:00:00"
partition="amd-hdr100"
lines=5000000

############################################################
# Help                                                     #
############################################################

# Print the short usage synopsis to stderr and exit non-zero.
usage() {
>&2 cat << EOF
Usage: $0 [ -h ] [ -n | --ntasks ] [ -p | --partition ] [ -t | --time ] [ -m | --mem ]
          [ -l | --lines ] log
EOF
exit 1
}

# Print the full help text to stderr and exit successfully.
help() {
>&2 cat << EOF
Splits a GPFS policy log into multiple parts for batch array processing
Usage: $0 [ -h ] [ -n | --ntasks ] [ -p | --partition ] [ -t | --time ] [ -m | --mem ]
          [ -l | --lines ] log

General:
    -h|--help           Print this Help.

Required:
    log                 Path to the log file to split

File Partitioning:
    -l|--lines          Max number of records to save in each split (default: 5000000)

Job Parameters:
    -n|--ntasks         Number of job tasks (default: 4)
    -p|--partition      Partition to submit tasks to (default: amd-hdr100)
    -t|--time           Max walltime (default: 12:00:00)
    -m|--mem            Memory (default: 16G)
EOF
exit 0
}

# Print an error message to stderr and exit non-zero.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

############################################################
# Argument Parsing                                         #
############################################################

# Test getopt inside the condition: under `set -e` a bare assignment would
# abort the script before usage could be shown.
if ! args=$(getopt -a -o hn:p:t:m:l: \
    --long help,ntasks:,partition:,time:,mem:,lines: -- "$@"); then
  usage
fi

# getopt emits shell-quoted words; the expansion must be quoted so that eval
# sees that quoting intact.
eval set -- "${args}"

while :; do
  case "$1" in
    -h | --help)      help ;;
    -n | --ntasks)    ntasks=$2 ; shift 2 ;;
    -p | --partition) partition=$2 ; shift 2 ;;
    -t | --time)      time=$2 ; shift 2 ;;
    -m | --mem)       mem=$2 ; shift 2 ;;
    -l | --lines)     lines=$2 ; shift 2 ;;
    --)               shift; break ;;
    *)                printf 'Unsupported option: %s\n' "$1" >&2
                      usage ;;
  esac
done

if [[ $# -eq 0 ]]; then
  usage
fi

log=$1

# Fail before submitting rather than inside the queued job.
[[ -f "${log}" && -r "${log}" ]] || die "Cannot read log file: ${log}"
[[ "${lines}" =~ ^[1-9][0-9]*$ ]] || die "--lines must be a positive integer: ${lines}"

dirname="$(basename -- "${log}" .gz).d"
prefix="${dirname}/list-"

############################################################
# Build Job Commands                                       #
############################################################

# The commands are pasted as text into the generated job script, so paths are
# shell-quoted here to survive whitespace and metacharacters.
log_q=$(printf '%q' "${log}")
dirname_q=$(printf '%q' "${dirname}")
prefix_q=$(printf '%q' "${prefix}")

# -L dereferences symlinks; without it a symlinked gzip log is reported as
# inode/symlink and would be read as plain text.
if [[ $(file -L -b --mime-type -- "${log}") == *'gzip'* ]]; then
  reader="zcat"
else
  reader="cat"
fi

split_cmd="${reader} ${log_q} | split -a 3 -d -l ${lines} - ${prefix_q}"

# NUL-delimited find | xargs instead of parsing ls; parts that are already
# gzipped (left over from an earlier run) are skipped.
zip_cmd="find ${dirname_q} -type f -name 'list-*' ! -name '*.gz' -print0"
zip_cmd+=" | xargs -0 -P 0 -I{} bash -c 'gzip \"\$1\" && echo \"\$1 done\"' _ {}"

>&2 cat << EOF
--------------------------------------------------------------------------------
GPFS log:        ${log}
Lines per File:  ${lines}

ntasks:          ${ntasks}
partition:       ${partition}
time:            ${time}
mem:             ${mem}

split cmd:       ${split_cmd}
zip cmd:         ${zip_cmd}
--------------------------------------------------------------------------------
EOF

mkdir -p -- "${dirname}" out err

############################################################
# Submit Batch Job                                         #
############################################################

# The heredoc is expanded now, at submit time; the job script receives the
# literal command strings built above.
sbatch << EOF
#!/bin/bash
#
#SBATCH --job-name=split-gpfs-log
#SBATCH --ntasks=${ntasks}
#SBATCH --partition=${partition}
#SBATCH --time=${time}
#SBATCH --mem=${mem}
#SBATCH --output=out/%A.out
#SBATCH --error=err/%A.err

set -euo pipefail

${split_cmd}
${zip_cmd}
EOF