import argparse
from pathlib import Path
import pandas as pd

from ..process import fparq

# Long-form help text shown by the CLI; rendered verbatim via RawTextHelpFormatter.
DESCRIPTION = """
gpfspart is a custom implementation of the fpart algorithm specifically designed to work with processed GPFS policy outputs. fpart crawls a directory tree partitioning the files by size up to a specified max and stores them in a number of lists. These lists are useful for passing to rsync and its derivative implementations to load balance and parallelize large data transfers. However, fpart has to crawl a file tree to create these partitions which can impact performance on very large, network file systems such as GPFS.

Instead, gpfspart reads a GPFS policy log output that has been parsed and converted to a parquet dataset and creates file partitions without needing to crawl the filesystem. It simulates fpart's --live mode, a naive partitioning scheme where the file listing is only run through once and partitions are created as the running total of bytes reaches the specified limit. This creates unoptimized file partitions as far as trying to create as few partitions as possible, but it does preserve the order of the files from the original log if that sorting is important.

Tiny and Large Files
--------------------

Options are included to specify minimum and maximum size cutoffs to define which files are included in the default grouping. These excluded files can then either be partitioned separately or not at all with another option. This was written to optimize functions such as parsyncfp2 which have built-in options to process tiny or large files differently. For example, parsyncfp2 will tar tiny files together and transfer the tarball as well as chunk large files and transfer chunks concurrently.
"""

def parse_args() -> dict:
    """Parse command-line arguments for the gpfspart CLI.

    Returns:
        dict: All parsed options keyed by destination name (output of
        ``vars()`` on the argparse namespace), suitable for passing to
        ``fparq`` as keyword arguments.
    """
    parser = argparse.ArgumentParser(
        description=DESCRIPTION,
        # RawTextHelpFormatter preserves the manual line breaks in DESCRIPTION
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument('parquet_path',
                        type=Path,
                        help="Input path for the parquet GPFS dataset to chunk")
    # NOTE: was "${{parquet_path}}", which printed literal doubled braces
    # (this is not an f-string, so {{ }} is never collapsed).
    parser.add_argument('-p','--partition-path',
                        type=Path,
                        default=None,
                        help="Path to write partition files. Defaults to ${parquet_path}/_partitions")
    parser.add_argument('-m','--max-part-size',
                        type=str,
                        default='50GiB',
                        help="Max combined size of all files in a partition. This can be specified either as a human-readable byte string (e.g. 10M[[i]B], 100G[[i]B]) or as a raw integer. Byte strings will be interpreted as base 2 (e.g 1kB is always 1024 bytes)")
    # Help text previously referenced a nonexistent "--max-size" flag.
    parser.add_argument('-f','--max-part-files',
                        type=int,
                        default=None,
                        help="Maximum number of files to include in any partition. Works with --max-part-size where all partitions meet both criteria")
    parser.add_argument('-t','--tiny-size',
                        type=str,
                        default=None,
                        help="Max size of file to be specified as 'tiny'. Tiny files are partitioned separately from other files by default. They can be excluded entirely using the --exclude-nonstandard flag")
    parser.add_argument('--max-tiny-part-size',
                        type=str,
                        default='1GiB',
                        help="Max partition size for tiny files")
    parser.add_argument('--max-tiny-part-files',
                        type=int,
                        default=250000,
                        help="Max number of files in a partition of tiny files")
    parser.add_argument('-b','--big-size',
                        type=str,
                        default=None,
                        help="Minimum file size to be specified as 'big'. Files above this limit will be assigned to their own unique partition. This value is implicitly set to the max partition size. Setting this value above the max partition size would have no effect. These files can be excluded entirely using the --exclude-nonstandard flag")
    parser.add_argument('--exclude-nonstandard',
                        default=False,
                        action="store_true",
                        help="Exclude all tiny and big files from partitioning. Partitions will only include files between tiny-size and big-size.")

    args = parser.parse_args()
    return vars(args)

def fparq_cli():
    """CLI entry point: load the parquet dataset and partition it with fparq.

    Reads options from the command line, fills in the default partition
    output directory when none was given, then delegates to ``fparq``.
    """
    opts = parse_args()

    # Default the output location to a "_partitions" directory inside the
    # input dataset when the user did not supply one explicitly.
    if opts['partition_path'] is None:
        opts['partition_path'] = opts['parquet_path'] / '_partitions'

    dataset = pd.read_parquet(opts['parquet_path'])
    fparq(dataset, **opts)