From 6f9ee8afd71416bedc9528b5d5297ba0db8ada97 Mon Sep 17 00:00:00 2001
From: Matthew K Defenderfer <mdefende@uab.edu>
Date: Thu, 12 Dec 2024 15:02:34 -0600
Subject: [PATCH] Update README for new CLI

---
 README.md                             | 43 ++++++++++++++++++---------
 src/rc_gpfs/cli/convert_to_parquet.py |  2 +-
 src/rc_gpfs/cli/split_log.py          |  2 +-
 3 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 1fb6044..69fb386 100644
--- a/README.md
+++ b/README.md
@@ -69,7 +69,19 @@ The slurm job output file will be local to the directory from which this command
 
 #### List Policy Specific Outputs
 
-The raw output file for list policies in `outdir` will be named `list-<jobid>.list.gather-info`.
+The standard organization scheme for list policy outputs can be seen below.
+
+```text
+.
+└── list-policy_<device>_<policy>_slurm-<jobid>_<rundatetime>/
+    ├── raw/
+    │   └── list-policy_<device>_<policy>_slurm-<jobid>_<rundatetime>.gz
+    ├── chunks/
+    ├── parquet/
+    └── reports/
+```
+
+The `chunks`, `parquet`, and `reports` directories are not generated automatically; they are the default directory names for outputs from downstream preprocessing and processing functions.
 
-The output file contains one line per file object stored under the `device`.  No directories or non-file objects are included in this listing unless the `list-path-dirplus` policy is used.  Each entry is a space-seperated set of file attributes selected by the SHOW command in the LIST rule.  Entries are encoded according to RFC3986 URI percent encoding.  This means all spaces and special characters will be encoded, making it easy to split lines into fields using the space separator.
+The output file contains one line per file object stored under the `device`.  No directories or non-file objects are included in this listing unless the `list-path-dirplus` policy is used.  Each entry is a space-separated set of file attributes selected by the SHOW command in the LIST rule.  Entries are encoded according to RFC3986 URI percent encoding.  This means all spaces and special characters will be encoded, making it easy to split lines into fields using the space separator.
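+
+Because every field is percent-encoded, an entry can be recovered by splitting the line on spaces and decoding each field. A minimal sketch in bash (the path below is hypothetical, not taken from a real log):
+
+```bash
+# hypothetical percent-encoded path field; real entries contain whatever attributes the SHOW clause selects
+field='%2Fdata%2Fproject%2Ffile%20name.txt'
+# rewrite %XX escapes as \xXX and let printf %b expand them
+printf '%b\n' "${field//%/\\x}"
+# -> /data/project/file name.txt
+```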
 
@@ -77,42 +89,45 @@ The output file contains one line per file object stored under the `device`.  No
 
 ### Split and compress
 
-Policy outputs generated using `list-path-external` or `list-path-dirplus` can be split into multiple smaller log files to facilitate out-of-memory computation for very large filesets using tools such as dask. The policy output can be split and compressed using the `src/split-info-file.sh` script. See the following for usage:
+Policy outputs generated using `list-path-external` or `list-path-dirplus` can be split into smaller log files to facilitate out-of-memory computation for very large filesets using tools such as dask. Policy outputs can be split and recompressed from the command line using the `split-log` command. This processing can be automatically submitted as a separate batch job using the `--batch` flag.
 
 ```bash
-./split-info-file.sh [ -h ] [ -l | --lines ] [ -o | --outdir ] 
-                     [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ]  
-                     log
+split-log [ -h ] [ -l | --lines ] [ -o | --output-dir ] [ --batch ]
+          [ -n | --cpus-per-task ] [ -p | --partition ] [ -t | --time ] [ -m | --mem ]
+          log
 ```
 
 - `lines`: the max number of lines to include in each split file. Defaults to 5000000
-- `outdir`: directory to store the split files in. Defaults to ${log}.d in log's parent directory.
+- `output-dir`: directory to store the split files in. Defaults to `${log}.d` in the log's parent directory.
 - `log`: path to the GPFS policy log. Can be either uncompressed or `gzip` compressed
+- `batch`: If specified, a separate batch job will be submitted that splits and compresses the log. Otherwise, both operations will be performed using the local compute resources.
 
 All other options specify job resource parameters. Defaults are as follows:
 
-- `ntasks`: 4
+- `cpus-per-task`: `24`
 - `partition`: `amd-hdr100`
-- `time`: `12:00:00`
-- `mem`: `16G`
+- `time`: `02:00:00`
+- `mem`: `8G`
 
-Split files will have the form `${outdir}/list-XXX.gz` where XXX is an incrementing index. Files are automatically compressed.
+Split files will have the form `${output-dir}/list-XXX.gz` where `XXX` is an incrementing index. Files are automatically compressed.
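+
+For example, the raw log from the organization scheme shown earlier could be split and the work submitted as a separate batch job (the `chunks` output directory is an illustrative choice matching that layout):
+
+```bash
+split-log --batch -o ./chunks ./raw/list-policy_<device>_<policy>_slurm-<jobid>_<rundatetime>.gz
+```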
 
 ### Pre-parse output for Python
 
-Processing GPFS log outputs is controlled by the `run-convert-to-parquet.sh` script and assumes the GPFS log has been split into a number of files of the form `list-XXX.gz` where `XXX` is an incrementing numeric index. This creates an array job where each task in the array reads the quoted text in one file, parses it into a dataframe, and exports it as a parquet file with the name `list-XXX.parquet`.
+GPFS log outputs are processed by the `convert-to-parquet` command, which assumes the log has already been split into a number of files of the form `list-XXX.gz` (for example, by `split-log`). `convert-to-parquet` parses each split log, adds a column specifying the top-level directory ([tld](#tld)) for each file, and saves the data with the appropriate types in parquet format. This processing can be automatically submitted as a separate batch array job using the `--batch` flag.
 
-This script is written to parse the `list-path-external` policy format with quoted special characters.
+This command is written to parse the `list-path-external` policy format with quoted special characters.
 
 ```bash
-./run-convert-to-parquet.sh [ -h ] [ -o | --outdir ] 
-                            [ -n | --ntasks ] [ -p | --partition] 
-                            [ -t | --time ] [ -m | --mem ] 
-                            gpfs_logdir
+convert-to-parquet [ -h ] [ -o | --output-dir ] [ --pool-size ] [ --batch ]
+                   [ -n | --ntasks ] [ -p | --partition ] [ -t | --time ] [ -m | --mem ]
+                   [ --slurm-log-dir ]
+                   gpfs_logdir
 ```
 
-- `outdir`: Path to save parquet outputs. Defaults to `${gpfs_logdir}/parquet`
+- `output-dir`: Path to save parquet outputs. Defaults to `${gpfs_logdir}/../parquet`
 - `gpfs_logdir`: Directory path containing the split log files as `*.gz`
+- `pool-size`: When converting locally with multiple cores, controls how many cores within the job are used in the parallel pool. If not specified, all cores assigned to the job are used in the pool.
+- `batch`: If set, processing will be performed in an array job where each array task converts a single log file.
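+
+For example, the split files produced above could be converted in a batch array job with something like the following (the `chunks` directory is the illustrative location used earlier; by default the parquet files are written to its sibling `parquet` directory):
+
+```bash
+convert-to-parquet --batch ./chunks
+```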
 
 All other options control the array job resources. Default values are as follows:
 
diff --git a/src/rc_gpfs/cli/convert_to_parquet.py b/src/rc_gpfs/cli/convert_to_parquet.py
index 791330a..28b5895 100644
--- a/src/rc_gpfs/cli/convert_to_parquet.py
+++ b/src/rc_gpfs/cli/convert_to_parquet.py
@@ -95,7 +95,7 @@ def submit_batch(**kwargs):
 def convert_to_parquet() -> None:
     args = parse_args()
     if args['output_dir'] is None:
-        args['output_dir'] = args['input'].parent.parent.joinpath('parquet')
+        args['output_dir'] = args['input'].parent.joinpath('parquet')
     
     args['output_dir'].mkdir(exist_ok = True, mode = 0o2770)
     
diff --git a/src/rc_gpfs/cli/split_log.py b/src/rc_gpfs/cli/split_log.py
index 0e9730c..6c37bde 100644
--- a/src/rc_gpfs/cli/split_log.py
+++ b/src/rc_gpfs/cli/split_log.py
@@ -44,7 +44,7 @@ def parse_args():
                        help="Number of cores assigned to the job. Ntasks is always set to 1")
     slurm.add_argument('-p','--partition', type=str, default='amd-hdr100')
     slurm.add_argument('-t','--time',type=str,default='02:00:00')
-    slurm.add_argument('-m','--mem',type=str,default='48G')
+    slurm.add_argument('-m','--mem',type=str,default='8G')
     slurm.add_argument('--slurm-log-dir',type=Path,default='./out',
                        help='Output log directory. If the directory does not exist, it will be created automatically')
     args = parser.parse_args()
-- 
GitLab