Skip to content
Snippets Groups Projects
split-info-file.sh 3.11 KiB
Newer Older
#!/bin/bash

set -euo pipefail

############################################################
# Default Values                                           #
############################################################

# Slurm job resources (overridable with -n, -p, -t and -m).
partition=amd-hdr100
ntasks=4
mem=16G
time=12:00:00

# Maximum number of records per split file (overridable with -l).
lines=5000000

############################################################
# Help                                                     #
############################################################
# Print the short usage synopsis to stderr and abort the script.
#
# Outputs: synopsis on stderr, nothing on stdout.
# Exits:   1 -- usage is only ever shown in response to a command-line error,
#          so the script must stop here instead of carrying on with missing or
#          invalid arguments.
usage()
{
>&2 cat << EOF
Usage: $0 [ -h ] [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ] 
          [ -l | --lines ] log
EOF
exit 1
}

# Write the full help text (description, synopsis and per-option details) to
# stderr, one line per printf argument, then end the script with status 0.
help() {
  printf '%s\n' \
    'Splits a GPFS policy log into multiple parts for batch array processing' \
    "Usage: $0 [ -h ] [ -n | --ntasks ] [ -p | --partition ] [ -t | --time ] [ -m | --mem ] " \
    '          [ -l | --lines ] log' \
    '' \
    'General:' \
    '    -h|--help           Print this Help.' \
    '' \
    'Required:' \
    '    log                 Path to the log file to split' \
    '' \
    'File Partitioning:' \
    '    -l|--lines          Max number of records to save in each split (default: 5000000)' \
    '' \
    'Job Parameters:' \
    '    -n|--ntasks         Number of job tasks (default: 4)' \
    '    -p|--partition      Partition to submit tasks to (default: amd-hdr100)' \
    '    -t|--time           Max walltime (default: 12:00:00)' \
    '    -m|--mem            Memory (default: 16G)' \
    >&2
  exit 0
}

# Normalise the command line with getopt (-a also accepts long options written
# with a single dash). The call is made inside the `if` condition on purpose:
# as a bare assignment a getopt failure would trip `set -e` and terminate the
# script before the usage text could be shown.
if ! args=$(getopt -a -o hn:p:t:m:l: --long help,ntasks:,partition:,time:,mem:,lines: -- "$@"); then
  usage
  exit 1
fi

# getopt emits a shell-quoted string. It has to reach eval as one quoted word;
# left unquoted, the shell would word-split and glob it first, corrupting
# option values that contain whitespace or wildcard characters.
eval set -- "${args}"

# Consume the normalised options; getopt guarantees a terminating `--`, after
# which only positional arguments remain.
while :; do
  case "$1" in
    -h | --help)       help ;;
    -n | --ntasks)     ntasks=$2;    shift 2 ;;
    -p | --partition)  partition=$2; shift 2 ;;
    -t | --time)       time=$2;      shift 2 ;;
    -m | --mem)        mem=$2;       shift 2 ;;
    -l | --lines)      lines=$2;     shift 2 ;;
    --)                shift; break ;;
    *)
      # Nothing is shifted on this path, so the loop must be left explicitly
      # or it would spin forever on the same argument.
      >&2 echo "Unsupported option: $1"
      usage
      exit 1
      ;;
  esac
done

# The log path is the one required positional argument.
if [[ $# -eq 0 ]]; then
  usage
  exit 1
fi

log=$1

# Fail before anything is queued: an unreadable log would otherwise only
# surface as a failed batch job.
if [[ ! -r "${log}" ]]; then
  >&2 echo "Cannot read log file: ${log}"
  exit 1
fi

# split -l needs a positive integer; reject anything else up front so it is
# never pasted into the generated job script.
if [[ ! "${lines}" =~ ^[0-9]+$ ]] || (( 10#${lines} == 0 )); then
  >&2 echo "Invalid value for --lines: ${lines}"
  exit 1
fi

# Output directory is named after the log (minus any .gz suffix) and is
# created relative to the current directory; parts are <dirname>/list-NNN.
dirname="$(basename -- "${log}" .gz).d"
prefix="${dirname}/list-"

# The commands below are pasted as text into the job script, so every path is
# shell-quoted with %q to survive spaces and metacharacters.
printf -v log_q '%q' "${log}"
printf -v prefix_q '%q' "${prefix}"

# Pick the reader from the file's detected MIME type, not from its extension.
reader="cat"
if [[ $(file -b --mime-type -- "${log}") == *'gzip'* ]]; then
  reader="zcat"
fi

split_cmd="${reader} -- ${log_q} | split -a 3 -d -l ${lines} - ${prefix_q}"

# Compress every part in parallel. File names travel NUL-delimited and are
# handed to the inner shell as \$1 instead of being spliced into its code.
# The escapes keep \$1 and \\0 literal until the job script itself runs.
zip_cmd="printf '%s\\0' ${prefix_q}* | xargs -0 -P 0 -I {} bash -c 'gzip -- \"\$1\" && echo \"\$1 done\"' _ {}"

>&2 cat << EOF
--------------------------------------------------------------------------------
GPFS log:           ${log}
Lines per File:     ${lines}

ntasks:             ${ntasks}
partition:          ${partition}
time:               ${time}
mem:                ${mem}

split cmd:          ${split_cmd}
zip cmd:            ${zip_cmd}
--------------------------------------------------------------------------------
EOF

# Split output directory plus the directories named in the #SBATCH
# --output/--error directives below.
mkdir -p -- "${dirname}" out err

############################################################
# Create Array Job Script                                  #
############################################################

# sbatch reads the job script from stdin. The here-document is expanded now,
# so the job contains the literal commands shown in the summary above. Strict
# mode inside the job stops the gzip step from running after a failed split.
sbatch << EOF
#!/bin/bash
#
#SBATCH --job-name=split-gpfs-log
#SBATCH --ntasks=${ntasks}
#SBATCH --partition=${partition}
#SBATCH --time=${time}
#SBATCH --mem=${mem}
#SBATCH --output=out/%A.out
#SBATCH --error=err/%A.err

set -euo pipefail

${split_cmd}
${zip_cmd}
EOF