Skip to content
Snippets Groups Projects
split-info-file.sh 3.50 KiB
#!/bin/bash

# Abort on command errors, unset variables and failed pipeline stages, and
# trace every command as it runs.
set -o errexit -o nounset -o xtrace -o pipefail

############################################################
# Default Values                                           #
############################################################

# Resource requests written into the #SBATCH header of the submitted job.
ntasks=4
partition=amd-hdr100
time=12:00:00
mem=16G

# Maximum number of records stored in each split file.
lines=5000000

# Output directory; left empty so it can be derived from the log path later.
outdir=

############################################################
# Help                                                     #
############################################################
# Print the short synopsis to stderr and terminate the script.
#
# This is invoked whenever the command line is rejected (getopt failure,
# unsupported option, missing or empty log path), so it exits with a non-zero
# status to let callers and schedulers detect the failure.
usage()
{
>&2 cat << EOF
Usage: $0 [ -h ] [ -l | --lines ] [ -o | --outdir ]
          [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ] 
          log
EOF
exit 1
}

# Write the full help text (description, synopsis and per-option reference)
# to stderr, then end the script with status 0.
help() {
  cat >&2 <<EOF
Splits a GPFS policy log into multiple parts for batch array processing
Usage: $0 [ -h ] [ -l | --lines ] [ -o | --outdir ]
          [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ] 
          log

General:
    -h|--help           Print this help.

Required:
    log                 Path to the log file to split

Split Parameters:
    -l|--lines          Max number of records to save in each split (default: 5000000)

File Parameters:
    -o|--outdir         Directory path to store split files in. Defaults to log.d in log's parent directory.

Job Parameters:
    -n|--ntasks         Number of job tasks (default: 4)
    -p|--partition      Partition to submit tasks to (default: amd-hdr100)
    -t|--time           Max walltime (default: 12:00:00)
    -m|--mem            Memory (default: 16G)
EOF
  exit 0
}

# Let getopt normalise the command line (-a also accepts long options written
# with a single dash). The assignment is tested directly inside `if`: with
# `set -e` active, a bare failing assignment would abort the script before
# any follow-up check of $? could run.
if ! args=$(getopt -a -o hl:o:n:p:t:m: --long help,lines:,outdir:,ntasks:,partition:,time:,mem: -- "$@"); then
  # Run usage in a subshell so the failure status is decided here, whatever
  # status usage itself exits with.
  (usage) || true
  exit 1
fi

# getopt emits a shell-quoted string; keep it quoted so eval receives it
# unchanged (no word splitting or globbing beforehand).
eval set -- "${args}"

# Consume the normalised options; getopt always terminates them with "--".
while :
do
  case "$1" in
    -h | --help)            help            ;;
    -l | --lines)           lines=$2        ; shift 2 ;;
    -o | --outdir)          outdir=$2       ; shift 2 ;;
    -n | --ntasks)          ntasks=$2       ; shift 2 ;;
    -p | --partition)       partition=$2    ; shift 2 ;;
    -t | --time)            time=$2         ; shift 2 ;;
    -m | --mem)             mem=$2          ; shift 2 ;;
    --) shift; break ;;
    *) >&2 echo "Unsupported option: $1"
       usage ;;
  esac
done

# The log path is the single required positional argument.
if [[ $# -eq 0 ]]; then
  usage
fi

log=$1

# Reject an explicitly empty argument ("").
if [[ -z "${log}" ]]; then
  >&2 echo "Log path is required"
  usage
fi

# Without -o/--outdir, write next to the log: "<absolute log path>.d".
if [[ -z "${outdir}" ]]; then
  outdir="$(readlink -f -- "${log}").d"
fi

prefix=${outdir}/list-

# The commands below are pasted as text into the generated job script, so
# every user-supplied value is shell-escaped first. Plain values are left
# unchanged by %q; values with spaces or metacharacters become safe.
printf -v log_q '%q' "${log}"
printf -v prefix_q '%q' "${prefix}"
printf -v lines_q '%q' "${lines}"

# Read gzip-compressed logs with zcat. -L makes file(1) inspect the target of
# a symlinked log instead of reporting the link itself.
reader="cat"
if [[ $(file -b -L --mime-type -- "${log}") == *'gzip'* ]]; then
  reader="zcat"
fi

# Split into numbered parts: <prefix>000, <prefix>001, ...
split_cmd="${reader} ${log_q} | split -a 3 -d -l ${lines_q} - ${prefix_q}"

# Compress the parts in parallel. Names travel NUL-delimited and reach the
# inner shell as \$1, so they are never re-parsed as shell code.
zip_cmd="printf '%s\\0' ${prefix_q}* | xargs -0 -P 0 -I{} bash -c 'gzip -- \"\$1\" && echo \"\$1 done\"' _ {}"

# Report the effective settings and the generated commands on stderr.
>&2 cat << EOF
--------------------------------------------------------------------------------
GPFS log:           ${log}
Output Directory    ${outdir}
Lines per File:     ${lines}

ntasks:             ${ntasks}
partition:          ${partition}
time:               ${time}
mem:                ${mem}

split cmd:          ${split_cmd}
zip cmd:            ${zip_cmd}
--------------------------------------------------------------------------------
EOF

# Create the split destination. The path is quoted so that a directory name
# containing spaces or glob characters is created as a single directory.
mkdir -p -- "${outdir}"

# out/ and err/ (relative to the current directory) are the targets of the
# job's --output and --error settings.
mkdir -p out
mkdir -p err

############################################################
# Submit Split Job                                         #
############################################################

# The here-document below is the batch script. It is fed straight to sbatch
# on stdin; the shell expands the \${...} values before sbatch sees them.
sbatch << EOF
#!/bin/bash
#
#SBATCH --job-name=split-gpfs-log
#SBATCH --ntasks=${ntasks}
#SBATCH --partition=${partition}
#SBATCH --time=${time}
#SBATCH --mem=${mem}
#SBATCH --output=out/%A.out
#SBATCH --error=err/%A.err

# Stop at the first failing step so a broken split is never compressed and
# the job ends with a failure status.
set -euo pipefail

${split_cmd}
${zip_cmd}
EOF