diff --git a/src/rc_gpfs/cli/convert_to_parquet.py b/src/rc_gpfs/cli/convert_to_parquet.py index 9abec147d7f72b7e291956166f384279b9868389..ac84448023782b1752affaaa69aac0dfdafd423c 100644 --- a/src/rc_gpfs/cli/convert_to_parquet.py +++ b/src/rc_gpfs/cli/convert_to_parquet.py @@ -23,7 +23,7 @@ Local Parallel Processing: If processing is done via a local parallel pool, the requested cores need to be accessible by the invoking Python process. When run in a Slurm job context where the number of cores were only specified with the --ntasks property, only 1 core will be available to the Python process regardless of the number of cores requested by the job. Instead, use the --cpus-per-task property to set the number of cores paired with --ntasks=1. This will correctly allow the parallel pool to utilize all cores assigned to the job. """ -def parse_args(): +def parse_args(arg_str=None): parser = argparse.ArgumentParser( description=DESCRIPTION, parents=[batch_parser(partition='amd-hdr100,express',time='02:00:00',mem='16G',cpus_per_task=1)], @@ -41,7 +41,7 @@ def parse_args(): help="Number of cores to include in the pool for local parallel processing. If None, will default to all cores available to the invoking Python process") parser.add_argument('--no-clobber', action='store_true',default=False, help='When set, skips any log chunks that already have corresponding parquet files. Chunks without a parquet file are processed as normal.') - args = parser.parse_args() + args = parser.parse_args(arg_str) return vars(args) BATCH_SCRIPT = """\ @@ -58,7 +58,9 @@ BATCH_SCRIPT = """\ {env_cmd} -log=$(ls {input}/*.gz | sort | awk "NR==${{SLURM_ARRAY_TASK_ID+1}} {{ print $1 }}") +idx=$((${{SLURM_ARRAY_TASK_ID}}+1)) + +log=$(ls {input}/*.gz | sort | awk "NR==${{idx}} {{ print $1 }}") convert-to-parquet {no_clobber_opt} -o {output_dir} ${{log}} """