Something went wrong on our end
-
Matthew K Defenderfer authored ddc0f3d6
split-info-file.sh 3.50 KiB
#!/bin/bash
# Strict mode with tracing: abort on errors, on unset variables and on a
# failure anywhere in a pipeline; echo each command as it runs.
set -o errexit -o nounset -o xtrace -o pipefail

############################################################
# Default Values                                           #
############################################################
# Slurm job resources; each can be overridden on the command line.
ntasks='4'
mem='16G'
time='12:00:00'
partition='amd-hdr100'

# Maximum number of records written to each split file.
lines='5000000'

# Left empty here; a default is derived from the log path later on.
outdir=''
############################################################
# Help #
############################################################
# Print the short usage synopsis on stderr and abort the script.
# This is the error path (bad option, missing log operand), so it exits
# non-zero to let callers detect the failed invocation.
usage() {
  cat >&2 << EOF
Usage: $0 [ -h ] [ -l | --lines ] [ -o | --outdir ]
[ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ]
log
EOF
  exit 1
}
# Print the full help text (description, synopsis and option reference)
# on stderr, then end the script with a success status.
help() {
  cat 1>&2 << EOF
Splits a GPFS policy log into multiple parts for batch array processing
Usage: $0 [ -h ] [ -l | --lines ] [ -o | --outdir ]
[ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ]
log
General:
-h|--help Print this help.
Required:
log Path to the log file to split
Split Parameters:
-l|--lines Max number of records to save in each split (default: 5000000)
File Parameters:
-o|--outdir Directory path to store split files in. Defaults to log.d in log's parent directory.
Job Parameters:
-n|--ntasks Number of job tasks (default: 4)
-p|--partition Partition to submit tasks to (default: amd-hdr100)
-t|--time Max walltime (default: 12:00:00)
-m|--mem Memory (default: 16G)
EOF
  exit 0
}
# parse_args ARG...
#   Normalises ARG... with getopt and stores each recognised option value in
#   the matching global (lines, outdir, ntasks, partition, time, mem).
#   The operands that remain after the options are left in the global array
#   `operands`. -h/--help calls help(); an option getopt rejects calls usage().
parse_args() {
  local args

  # Run getopt inside the `if` condition: as a bare assignment its failure
  # would trip `set -e` and end the script before usage is ever shown.
  if ! args=$(getopt -a -o hl:o:n:p:t:m: \
    --long help,lines:,outdir:,ntasks:,partition:,time:,mem: -- "$@"); then
    usage
  fi

  # Quoted so the shell-escaped getopt output reaches eval intact; unquoted it
  # is word-split first and runs of whitespace inside arguments collapse.
  eval set -- "${args}"

  while :; do
    case $1 in
      -h | --help) help ;;
      -l | --lines)
        lines=$2
        shift 2
        ;;
      -o | --outdir)
        outdir=$2
        shift 2
        ;;
      -n | --ntasks)
        ntasks=$2
        shift 2
        ;;
      -p | --partition)
        partition=$2
        shift 2
        ;;
      -t | --time)
        time=$2
        shift 2
        ;;
      -m | --mem)
        mem=$2
        shift 2
        ;;
      --)
        shift
        break
        ;;
      *)
        >&2 echo "Unsupported option: $1"
        usage
        ;;
    esac
  done

  operands=("$@")
}

parse_args "$@"
# Hand the operands back to the script's own positional parameters. The
# ${var+...} form keeps an empty array safe under `set -u` on older bash.
set -- ${operands[@]+"${operands[@]}"}
# At least one operand must remain after option parsing; the first one is
# taken as the log path.
if [[ $# -eq 0 ]]; then
  usage
fi

log=$1
if [[ -z "${log}" ]]; then
  # Diagnostics belong on stderr, alongside the usage text that follows.
  >&2 echo "Log path is required"
  usage
fi

# No --outdir given: default to "<canonical log path>.d".
if [[ -z "${outdir}" ]]; then
  # Quoted so a log path containing whitespace or glob characters reaches
  # readlink as a single argument.
  outdir="$(readlink -f -- "${log}").d"
fi
# build_cmds
#   Derives, from the globals log, outdir and lines, the values that are later
#   embedded verbatim in the batch script:
#     prefix    - path prefix of every split file (<outdir>/list-)
#     split_cmd - streams the log (through zcat when it is gzip data) into
#                 split, producing files of at most ${lines} lines named
#                 <prefix>NNN
#     zip_cmd   - gzips every file matching <prefix>* with unlimited xargs
#                 parallelism and prints "<file> done" after each one
build_cmds() {
  local reader=cat q_log q_prefix

  prefix=${outdir}/list-

  # The command strings are re-parsed by the shell that runs the job, so the
  # paths are embedded shell-escaped; raw, a path containing whitespace or
  # metacharacters would be split or interpreted.
  printf -v q_log '%q' "${log}"
  printf -v q_prefix '%q' "${prefix}"

  # -L: classify the file a symlinked log points at, not the link itself.
  if [[ $(file -bL --mime-type -- "${log}") == *'gzip'* ]]; then
    reader=zcat
  fi

  split_cmd="${reader} ${q_log} | split -a 3 -d -l ${lines} - ${q_prefix}"

  # File names travel NUL-delimited and reach the worker as "$1" instead of
  # being parsed from ls output and spliced into the bash -c string.
  zip_cmd="printf '%s\\0' ${q_prefix}* | xargs -0 -n 1 -P 0 bash -c 'gzip -- \"\$1\" && echo \"\$1 done\"' _"
}

build_cmds
# Echo the resolved configuration on stderr before anything is created or
# submitted.
>&2 cat << EOF
--------------------------------------------------------------------------------
GPFS log: ${log}
Output Directory ${outdir}
Lines per File: ${lines}
ntasks: ${ntasks}
partition: ${partition}
time: ${time}
mem: ${mem}
split cmd: ${split_cmd}
zip cmd: ${zip_cmd}
--------------------------------------------------------------------------------
EOF

# Create the split destination. Quoted so an output path containing
# whitespace yields one directory instead of several.
mkdir -p -- "${outdir}"
# out/ and err/ (relative to the current directory) receive the job's
# --output and --error files.
mkdir -p out
mkdir -p err
############################################################
# Submit Split Job                                         #
############################################################
# The batch script is generated inline: the unquoted heredoc expands the job
# parameters and the split/zip command lines now, at submission time, and
# sbatch reads the finished script from stdin. No --array directive is
# emitted, so this is a single job.
sbatch << EOF
#!/bin/bash
#
#SBATCH --job-name=split-gpfs-log
#SBATCH --ntasks=${ntasks}
#SBATCH --partition=${partition}
#SBATCH --time=${time}
#SBATCH --mem=${mem}
#SBATCH --output=out/%A.out
#SBATCH --error=err/%A.err

# Stop at the first failing step so the output of a failed split is not
# compressed and the job is not reported as successful.
set -euo pipefail

${split_cmd}
${zip_cmd}
EOF