diff --git a/poetry.lock b/poetry.lock index 02961ca52705cfe03743868fd762d95a17cc1c9c..24f58d2d829bfc926703c90db341920b82602bab 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,18 @@ # This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["main"] +markers = "sys_platform == \"win32\"" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + [[package]] name = "colormaps" version = "0.4.2" @@ -82,6 +95,18 @@ files = [ docs = ["Sphinx", "furo"] test = ["objgraph", "psutil"] +[[package]] +name = "iniconfig" +version = "2.1.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760"}, + {file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"}, +] + [[package]] name = "numpy" version = "2.2.5" @@ -175,6 +200,22 @@ files = [ packaging = "*" tenacity = ">=6.2.0" +[[package]] +name = "pluggy" +version = "1.5.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, + {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", 
"pytest-benchmark"] + [[package]] name = "polars" version = "1.27.1" @@ -274,6 +315,42 @@ files = [ [package.extras] test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] +[[package]] +name = "pytest" +version = "8.3.5" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820"}, + {file = "pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=1.5,<2" + +[package.extras] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + +[[package]] +name = "pytest-datafiles" +version = "3.0.0" +description = "py.test plugin to create a 'tmp_path' containing predefined files/directories." 
+optional = false +python-versions = "*" +groups = ["main"] +files = [ + {file = "pytest-datafiles-3.0.0.tar.gz", hash = "sha256:a70c4c66a36d1cdcfc095607f04eee66eaef3fa64cbb62d60c47ce169901d1d4"}, + {file = "pytest_datafiles-3.0.0-py2.py3-none-any.whl", hash = "sha256:2176e10d3f6e76f358925a897e21e2bcc5a0170b92fac4e66ed055eaa2ca6a22"}, +] + +[package.dependencies] +pytest = ">=3.6" + [[package]] name = "sqlalchemy" version = "2.0.40" @@ -419,5 +496,5 @@ files = [ [metadata] lock-version = "2.1" -python-versions = ">=3.12" -content-hash = "b94c26465ac498c7656fafdf498f424379a1e882e5da6ca3a9dcac48b076b770" +python-versions = ">=3.12,<4.0" +content-hash = "b774e71210a804105de6c03976133f42953ebfc5be59ef0d2c6cc62c842a6372" diff --git a/pyproject.toml b/pyproject.toml index 62cb958abe3568216da745ee959c520a021029ba..0120b6f4a8fd720fa86ccb718ef2b30839b78eed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ maintainers = [ ] license = "AFL" readme = "README.md" -requires-python = ">=3.12" +requires-python = ">=3.12,<4.0" keywords = ["GPFS", "policy", "aggregation", "reporting"] dynamic = ["version","dependencies","classifiers"] @@ -33,12 +33,14 @@ classifiers = [ version = "0.0.0" [tool.poetry.dependencies] -python = ">=3.12" +python = ">=3.12,<4.0" colormaps = "*" numpy = "*" plotly = "^5.24.1" polars = ">=1.27.0" pyarrow = "^19.0.1" +pytest = "^8.3.5" +pytest-datafiles = "^3.0.0" sqlalchemy = "*" typeguard = "*" @@ -66,3 +68,10 @@ folders = [ requires = ["poetry-core>=2.0.0,<3.0.0","poetry-dynamic-versioning>=1.0.0,<2.0.0"] build-backend = "poetry_dynamic_versioning.backend" + +[tool.pytest.ini_options] +addopts = [ + "--import-mode=importlib", +] +required_plugins = ["pytest-datafiles>=3.0.0"] +testpaths = ["tests"] \ No newline at end of file diff --git a/ruff.toml b/ruff.toml new file mode 100644 index 0000000000000000000000000000000000000000..10bc61b33a98623344329b56b0d07a195f11e0d9 --- /dev/null +++ b/ruff.toml @@ -0,0 +1,85 @@ +# Exclude a variety of 
commonly ignored directories. +exclude = [ + ".bzr", + ".direnv", + ".eggs", + ".git", + ".gitlab-ci.yml", + ".git-rewrite", + ".hg", + ".ipynb_checkpoints", + ".mypy_cache", + ".nox", + ".pants.d", + ".poetry", + ".pyenv", + ".pytest_cache", + ".pytype", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + ".vscode", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "site-packages", + "venv", + "poetry.toml", + "poetry.lock", + "test-data", + "legacy-scripts", + "extra", + "data" +] + +# Same as Black. +line-length = 88 +indent-width = 4 + +# Assume Python 3.13 +target-version = "py313" + +[lint] +# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. +# Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or +# McCabe complexity (`C901`) by default. +select = ["E4", "E7", "E9", "F"] +ignore = [] + +# Allow fix for all enabled rules (when `--fix` is provided). +fixable = ["ALL"] +unfixable = [] + +# Allow unused variables when underscore-prefixed. +dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" + +[format] +# Like Black, use double quotes for strings. +quote-style = "double" + +# Like Black, indent with spaces, rather than tabs. +indent-style = "space" + +# Like Black, respect magic trailing commas. +skip-magic-trailing-comma = false + +# Like Black, automatically detect the appropriate line ending. +line-ending = "auto" + +# Enable auto-formatting of code examples in docstrings. Markdown, +# reStructuredText code/literal blocks and doctests are all supported. +# +# This is currently disabled by default, but it is planned for this +# to be opt-out in the future. +docstring-code-format = false + +# Set the line length limit used when formatting code snippets in +# docstrings. +# +# This only has an effect when the `docstring-code-format` setting is +# enabled. 
+docstring-code-line-length = "dynamic" \ No newline at end of file diff --git a/src/rc_gpfs/utils/core.py b/src/rc_gpfs/utils/core.py index 2596c4cc0a472977dac337842ce5ee1371859b44..d62115ababdfc742f6da29b8552becbaae90f107 100644 --- a/src/rc_gpfs/utils/core.py +++ b/src/rc_gpfs/utils/core.py @@ -8,7 +8,7 @@ import pyarrow.parquet as pq import numpy as np from .units import as_bytes, convert_si, create_size_bin_labels -from .datetime import * +from .datetime import as_datetime,create_timedelta_breakpoints,create_timedelta_labels def parse_scontrol(): job_id = os.getenv('SLURM_JOB_ID') diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..dc1802e761b9533c7bcfabf958da49a28d8b1569 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,346 @@ +import pytest +from typing import Literal +from pathlib import Path +from rc_gpfs import utils + +from polars.testing import assert_series_equal +import polars as pl +import numpy as np + + +### General Purpose Utils +@pytest.mark.parametrize("path", ["/data/rc/gpfs-policy", Path("/data/rc/gpfs-policy")]) +def test_as_path_valid(path: Path | Literal["/data/rc/gpfs-policy"]): + p_path = utils.as_path(path) + assert isinstance(p_path, Path) + + +@pytest.mark.parametrize( + "value,unit,to_unit,use_binary,expected", + [ + (1, "G", "K", False, 1000000), + (4, "K", "base", True, 4096), + ("100", "base", "T", False, 1e-10), + ], +) +def test_convert_si(value, unit, to_unit, use_binary, expected): + assert utils.convert_si(value, unit, to_unit, use_binary) == expected + + +### Memory and File Size Utils +@pytest.mark.parametrize( + "val,default,expected", + [ + ("1 kiB", None, 1024), + ("1 kiB", (1024**4), 1024), + ("10 TiB", None, 10 * (1024**4)), + (None, 1024, 1024), + (None, None, None), + ("1.5 MiB", None, 1572864), + ], +) +def test_as_bytes(val, default, expected): + assert utils.as_bytes(val, default) == expected + + +class TestSizeGrouping: + input_sizes = 
pl.Series( + name="size", + values=[ + 0, + 2048, + 4096, + 1024**2, # 1 MiB + 1024**3, # 1 GiB + 20 * 1024**3, # 20 GiB + 1024**5, # 1 PiB + ], + dtype=pl.Int128(), + ) + + expected_size_groups = [ + pl.Series( + name="size", + values=[ + "0 B-4 KiB", + "0 B-4 KiB", + "0 B-4 KiB", + "4 KiB-4 MiB", + "4 MiB-1 GiB", + "10 GiB-100 GiB", + ">1 TiB", + ], + dtype=pl.Enum( + [ + "0 B-4 KiB", + "4 KiB-4 MiB", + "4 MiB-1 GiB", + "1 GiB-10 GiB", + "10 GiB-100 GiB", + "100 GiB-1 TiB", + ">1 TiB", + ] + ), + ), + pl.Series( + name="size", + values=[ + "0 B-4 KiB", + "0 B-4 KiB", + "0 B-4 KiB", + "4 KiB-4 MiB", + "4 MiB-1 GiB", + "10 GiB-100 GiB", + ">1 TiB", + ], + dtype=pl.Enum( + [ + "0 B-4 KiB", + "4 KiB-4 MiB", + "4 MiB-1 GiB", + "1 GiB-10 GiB", + "10 GiB-100 GiB", + "100 GiB-1 TiB", + ">1 TiB", + ] + ), + ), + pl.Series( + name="size", + values=[ + "0 B-1 KiB", + "1 KiB-4 KiB", + "1 KiB-4 KiB", + "4 KiB-10 GiB", + "4 KiB-10 GiB", + ">10 GiB", + ">10 GiB", + ], + dtype=pl.Enum(["0 B-1 KiB", "1 KiB-4 KiB", "4 KiB-10 GiB", ">10 GiB"]), + ), + pl.Series( + name="size", + values=[ + "0 B-1 MiB", + "0 B-1 MiB", + "0 B-1 MiB", + "0 B-1 MiB", + ">1 MiB", + ">1 MiB", + ">1 MiB", + ], + dtype=pl.Enum( + [ + "0 B-1 MiB", + ">1 MiB", + ] + ), + ), + ] + + input_bins = [ + ["4 KiB", "4 MiB", "1 GiB", "10 GiB", "100 GiB", "1 TiB"], + [4096, 4194304, 1073741824, 10737418240, 107374182400, 1099511627776], + ["10 GiB", 1024, 4096, "1 KiB", 0], + "1 MiB", + ] + + expected_bins = [ + [4096, 4194304, 1073741824, 10737418240, 107374182400, 1099511627776], + [4096, 4194304, 1073741824, 10737418240, 107374182400, 1099511627776], + [1024, 4096, 10737418240], + [1048576], + ] + + expected_labels = [ + [ + "0 B-4 KiB", + "4 KiB-4 MiB", + "4 MiB-1 GiB", + "1 GiB-10 GiB", + "10 GiB-100 GiB", + "100 GiB-1 TiB", + ">1 TiB", + ], + [ + "0 B-4 KiB", + "4 KiB-4 MiB", + "4 MiB-1 GiB", + "1 GiB-10 GiB", + "10 GiB-100 GiB", + "100 GiB-1 TiB", + ">1 TiB", + ], + ["0 B-1 KiB", "1 KiB-4 KiB", "4 KiB-10 GiB", 
">10 GiB"], + ["0 B-1 MiB", ">1 MiB"], + ] + + @pytest.mark.parametrize("bins,expected", list(zip(expected_bins, expected_labels))) + def test_create_size_bin_labels(self, bins, expected): + assert utils.create_size_bin_labels(bins) == expected + + @pytest.mark.parametrize( + "bins,expected", + list(zip(input_bins, list(zip(expected_bins, expected_labels)))), + ) + def test_prep_size_distribution(self, bins, expected): + assert utils.prep_size_distribution(size_bins=bins) == expected + + @pytest.mark.parametrize( + "bins,expected", list(zip(input_bins, expected_size_groups)), ids=[1, 2, 3, 4] + ) + def test_calculate_size_distribution(self, bins, expected): + assert_series_equal( + utils.calculate_size_distribution(self.input_sizes, size_bins=bins), + expected, + ) + + +### DateTime and File Age Utils +@pytest.mark.parametrize( + "date,expected", + [ + ("2025-01-01", np.datetime64("2025-01-01T00:00:00.000000000")), + ( + np.datetime64("2025-01-01T00:00:00.000000000"), + np.datetime64("2025-01-01T00:00:00.000000000"), + ), + (1735689600000000000, 1735689600000000000), + (None, None), + ], +) +def test_as_datetime(date, expected): + assert utils.as_datetime(date) == expected + + +def test_as_datetime_fails(): + with pytest.raises(ValueError): + utils.as_datetime("not a date") + + +@pytest.mark.parametrize("val", [1, 3, 5]) +@pytest.mark.parametrize("unit", ["D", "W"]) +def test_as_timedelta(val, unit): + assert utils.as_timedelta(val, unit) == np.timedelta64(val, unit) + + +class TestAgeGrouping: + timestamps = pl.Series( + name="access", + values=[ + "2024-05-12", + "2025-02-21", + "2025-04-02", + "2025-04-17", + "2025-05-17", + ], + ).str.to_datetime(time_unit="ns") + + acq_date = "2025-06-01" + + delta_vals = [[-2, 30, 60, 90, 180], 365, [0, 4, 8, 12, 16], 52] + + delta_unit = ["D", "D", "W", "W"] + + expected_breakpoints = [ + [ + np.datetime64("2025-05-02T00:00:00.000000000"), + np.datetime64("2025-04-02T00:00:00.000000000"), + 
np.datetime64("2025-03-03T00:00:00.000000000"), + np.datetime64("2024-12-03T00:00:00.000000000"), + ], + np.datetime64("2024-06-01T00:00:00.000000000"), + [ + np.datetime64("2025-05-04T00:00:00.000000000"), + np.datetime64("2025-04-06T00:00:00.000000000"), + np.datetime64("2025-03-09T00:00:00.000000000"), + np.datetime64("2025-02-09T00:00:00.000000000"), + ], + np.datetime64("2024-06-02T00:00:00.000000000"), + ] + + expected_labels = [ + [">180D", "90D-180D", "60D-90D", "30D-60D", "<30D"], + [ + ">365D", + "<365D", + ], + [">16W", "12W-16W", "8W-12W", "4W-8W", "<4W"], + [">52W", "<52W"], + ] + + expected_age_groups_days = [ + pl.Series( + name="access", + values=[ + ">180D", + "90D-180D", + "60D-90D", + "30D-60D", + "<30D", + ], + dtype=pl.Enum([">180D", "90D-180D", "60D-90D", "30D-60D", "<30D"]), + ), + pl.Series( + name="access", + values=[">365D", "<365D", "<365D", "<365D", "<365D"], + dtype=pl.Enum([">365D", "<365D"]), + ), + pl.Series( + name="access", + values=[">16W", "12W-16W", "8W-12W", "4W-8W", "<4W"], + dtype=pl.Enum( + [">16W", "12W-16W", "8W-12W", "4W-8W", "<4W"], + ), + ), + pl.Series( + name="access", + values=[">52W", "<52W", "<52W", "<52W", "<52W"], + dtype=pl.Enum([">52W", "<52W"]), + ), + ] + + @pytest.mark.parametrize( + "delta_vals,delta_unit,expected", + list(zip(delta_vals, delta_unit, expected_breakpoints)), + ) + def test_create_timedelta_breakpoints_days(self, delta_vals, delta_unit, expected): + assert ( + utils.create_timedelta_breakpoints(self.acq_date, delta_vals, delta_unit) + == expected + ) + + @pytest.mark.parametrize( + "delta_vals,delta_unit,expected", + list(zip(delta_vals, delta_unit, expected_labels)), + ) + def test_create_timedelta_labels(self, delta_vals, delta_unit, expected): + assert utils.create_timedelta_labels(delta_vals, delta_unit) == expected + + @pytest.mark.parametrize( + "age_breakpoints,delta_unit,expected", + list( + zip( + delta_vals, delta_unit, list(zip(expected_breakpoints, expected_labels)) + ) + ), + ) + 
def test_prep_age_distribution_days(self, age_breakpoints, delta_unit, expected): + assert ( + utils.prep_age_distribution(self.acq_date, age_breakpoints, delta_unit) + == expected + ) + + @pytest.mark.parametrize( + "age_breakpoints,delta_unit,expected", + list(zip(delta_vals, delta_unit, expected_age_groups_days)), + ) + def test_calculate_age_distribution_days( + self, age_breakpoints, delta_unit, expected + ): + assert_series_equal( + utils.calculate_age_distribution( + self.timestamps, self.acq_date, age_breakpoints, delta_unit + ), + expected, + )