Add initial test framework for utils functions

733c118c · Matthew K Defenderfer · df8d1a68 · 733c118c · 733c118c · 733c118c
Commit 733c118c authored 6 months ago by Matthew K Defenderfer
--- a/poetry.lock
+++ b/poetry.lock
 # This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand.

+[[package]]
+name = "colorama"
+version = "0.4.6"
+description = "Cross-platform colored terminal text."
+optional = false
+python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
+groups = ["main"]
+markers = "sys_platform == \"win32\""
+files = [
+    {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
+    {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
+]
+
 [[package]]
 name = "colormaps"
 version = "0.4.2"
@@ -82,6 +95,18 @@ files = [
 docs = ["Sphinx", "furo"]
 test = ["objgraph", "psutil"]

+[[package]]
+name = "iniconfig"
+version = "2.1.0"
+description = "brain-dead simple config-ini parsing"
+optional = false
+python-versions = ">=3.8"
+groups = ["main"]
+files = [
+    {file = "iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760"},
+    {file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"},
+]
+
 [[package]]
 name = "numpy"
 version = "2.2.5"
@@ -175,6 +200,22 @@ files = [
 packaging = "*"
 tenacity = ">=6.2.0"

+[[package]]
+name = "pluggy"
+version = "1.5.0"
+description = "plugin and hook calling mechanisms for python"
+optional = false
+python-versions = ">=3.8"
+groups = ["main"]
+files = [
+    {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"},
+    {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"},
+]
+
+[package.extras]
+dev = ["pre-commit", "tox"]
+testing = ["pytest", "pytest-benchmark"]
+
 [[package]]
 name = "polars"
 version = "1.27.1"
@@ -274,6 +315,42 @@ files = [
 [package.extras]
 test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"]

+[[package]]
+name = "pytest"
+version = "8.3.5"
+description = "pytest: simple powerful testing with Python"
+optional = false
+python-versions = ">=3.8"
+groups = ["main"]
+files = [
+    {file = "pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820"},
+    {file = "pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845"},
+]
+
+[package.dependencies]
+colorama = {version = "*", markers = "sys_platform == \"win32\""}
+iniconfig = "*"
+packaging = "*"
+pluggy = ">=1.5,<2"
+
+[package.extras]
+dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
+
+[[package]]
+name = "pytest-datafiles"
+version = "3.0.0"
+description = "py.test plugin to create a 'tmp_path' containing predefined files/directories."
+optional = false
+python-versions = "*"
+groups = ["main"]
+files = [
+    {file = "pytest-datafiles-3.0.0.tar.gz", hash = "sha256:a70c4c66a36d1cdcfc095607f04eee66eaef3fa64cbb62d60c47ce169901d1d4"},
+    {file = "pytest_datafiles-3.0.0-py2.py3-none-any.whl", hash = "sha256:2176e10d3f6e76f358925a897e21e2bcc5a0170b92fac4e66ed055eaa2ca6a22"},
+]
+
+[package.dependencies]
+pytest = ">=3.6"
+
 [[package]]
 name = "sqlalchemy"
 version = "2.0.40"
@@ -419,5 +496,5 @@ files = [

 [metadata]
 lock-version = "2.1"
-python-versions = ">=3.12"
-content-hash = "b94c26465ac498c7656fafdf498f424379a1e882e5da6ca3a9dcac48b076b770"
+python-versions = ">=3.12,<4.0"
+content-hash = "b774e71210a804105de6c03976133f42953ebfc5be59ef0d2c6cc62c842a6372"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ maintainers = [
 ]
 license = "AFL"
 readme = "README.md"
-requires-python = ">=3.12"
+requires-python = ">=3.12,<4.0"
 keywords = ["GPFS", "policy", "aggregation", "reporting"]
 dynamic = ["version","dependencies","classifiers"]

@@ -33,12 +33,14 @@ classifiers = [
 version = "0.0.0"

 [tool.poetry.dependencies]
-python = ">=3.12"
+python = ">=3.12,<4.0"
 colormaps = "*"
 numpy = "*"
 plotly = "^5.24.1"
 polars = ">=1.27.0"
 pyarrow = "^19.0.1"
+pytest = "^8.3.5"
+pytest-datafiles = "^3.0.0"
 sqlalchemy = "*"
 typeguard = "*"

@@ -66,3 +68,10 @@ folders = [
 requires = ["poetry-core>=2.0.0,<3.0.0","poetry-dynamic-versioning>=1.0.0,<2.0.0"]
 build-backend = "poetry_dynamic_versioning.backend"

+
+[tool.pytest.ini_options]
+addopts = [
+    "--import-mode=importlib",
+]
+required_plugins = ["pytest-datafiles>=3.0.0"]
+testpaths = ["tests"]
\ No newline at end of file
--- a/ruff.toml
+++ b/ruff.toml
+# Exclude a variety of commonly ignored directories.
+exclude = [
+    ".bzr",
+    ".direnv",
+    ".eggs",
+    ".git",
+    ".gitlab-ci.yml",
+    ".git-rewrite",
+    ".hg",
+    ".ipynb_checkpoints",
+    ".mypy_cache",
+    ".nox",
+    ".pants.d",
+    ".poetry",
+    ".pyenv",
+    ".pytest_cache",
+    ".pytype",
+    ".ruff_cache",
+    ".svn",
+    ".tox",
+    ".venv",
+    ".vscode",
+    "__pypackages__",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "node_modules",
+    "site-packages",
+    "venv",
+    "poetry.toml",
+    "poetry.lock",
+    "test-data",
+    "legacy-scripts",
+    "extra",
+    "data"
+]
+
+# Same as Black.
+line-length = 88
+indent-width = 4
+
+# Target the oldest supported interpreter: pyproject.toml declares
+# requires-python = ">=3.12,<4.0", and Ruff treats this value as the minimum
+# Python version the code must run on.
+target-version = "py312"
+
+[lint]
+# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default.
+# Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or
+# McCabe complexity (`C901`) by default.
+select = ["E4", "E7", "E9", "F"]
+ignore = []
+
+# Allow fix for all enabled rules (when `--fix` is provided).
+fixable = ["ALL"]
+unfixable = []
+
+# Allow unused variables when underscore-prefixed.
+dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
+
+[format]
+# Like Black, use double quotes for strings.
+quote-style = "double"
+
+# Like Black, indent with spaces, rather than tabs.
+indent-style = "space"
+
+# Like Black, respect magic trailing commas.
+skip-magic-trailing-comma = false
+
+# Like Black, automatically detect the appropriate line ending.
+line-ending = "auto"
+
+# Enable auto-formatting of code examples in docstrings. Markdown,
+# reStructuredText code/literal blocks and doctests are all supported.
+#
+# This is currently disabled by default, but it is planned for this
+# to be opt-out in the future.
+docstring-code-format = false
+
+# Set the line length limit used when formatting code snippets in
+# docstrings.
+#
+# This only has an effect when the `docstring-code-format` setting is
+# enabled.
+docstring-code-line-length = "dynamic"
\ No newline at end of file
--- a/src/rc_gpfs/utils/core.py
+++ b/src/rc_gpfs/utils/core.py
@@ -8,7 +8,7 @@ import pyarrow.parquet as pq
 import numpy as np

 from .units import as_bytes, convert_si, create_size_bin_labels
-from .datetime import *
+from .datetime import as_datetime,create_timedelta_breakpoints,create_timedelta_labels

 def parse_scontrol():
    job_id = os.getenv('SLURM_JOB_ID')

--- a/tests/test_utils.py
+++ b/tests/test_utils.py
+import pytest
+from typing import Literal
+from pathlib import Path
+from rc_gpfs import utils
+
+from polars.testing import assert_series_equal
+import polars as pl
+import numpy as np
+
+
+### General Purpose Utils
+@pytest.mark.parametrize("path", ["/data/rc/gpfs-policy", Path("/data/rc/gpfs-policy")])
+def test_as_path_valid(path: Path | Literal["/data/rc/gpfs-policy"]):
+    """``utils.as_path`` must return a ``Path`` for both str and ``Path`` input."""
+    p_path = utils.as_path(path)
+    assert isinstance(p_path, Path)
+
+
+@pytest.mark.parametrize(
+    "value,unit,to_unit,use_binary,expected",
+    [
+        (1, "G", "K", False, 1000000),
+        (4, "K", "base", True, 4096),
+        ("100", "base", "T", False, 1e-10),
+    ],
+)
+def test_convert_si(value, unit, to_unit, use_binary, expected):
+    """Check ``utils.convert_si`` against known prefix conversions.
+
+    The cases cover a decimal conversion (1 G -> 1000000 K), a conversion with
+    ``use_binary=True`` (4 K -> 4096 base units) and a numeric string input
+    whose result is fractional.
+    """
+    result = utils.convert_si(value, unit, to_unit, use_binary)
+    # Compare with a tight relative tolerance instead of exact equality so the
+    # fractional case (1e-10) does not depend on which floating-point
+    # operations convert_si uses internally. abs=0 disables pytest's default
+    # absolute tolerance (1e-12), which would be far too loose at this scale.
+    assert result == pytest.approx(expected, rel=1e-9, abs=0)
+
+
+### Memory and File Size Utils
+@pytest.mark.parametrize(
+    "val,default,expected",
+    [
+        ("1 kiB", None, 1024),
+        ("1 kiB", (1024**4), 1024),
+        ("10 TiB", None, 10 * (1024**4)),
+        (None, 1024, 1024),
+        (None, None, None),
+        ("1.5 MiB", None, 1572864),
+    ],
+)
+def test_as_bytes(val, default, expected):
+    """Check ``utils.as_bytes`` on size strings and on its ``default`` fallback.
+
+    The cases expect a size string to win over ``default``, ``default`` to be
+    returned when ``val`` is ``None``, and ``None`` when both are ``None``.
+    """
+    assert utils.as_bytes(val, default) == expected
+
+
+class TestSizeGrouping:
+    """Tests for the size-binning helpers in ``utils``.
+
+    Covers ``create_size_bin_labels``, ``prep_size_distribution`` and
+    ``calculate_size_distribution``. The class-level lists below are
+    index-aligned: entry ``i`` of each list describes the same scenario, and
+    the parametrize decorators zip them together.
+    """
+
+    # Sizes to be grouped (the inline comments give the binary-unit equivalents).
+    input_sizes = pl.Series(
+        name="size",
+        values=[
+            0,
+            2048,
+            4096,
+            1024**2,  # 1 MiB
+            1024**3,  # 1 GiB
+            20 * 1024**3,  # 20 GiB
+            1024**5,  # 1 PiB
+        ],
+        dtype=pl.Int128(),
+    )
+
+    # Expected group label for every value of input_sizes, one Enum series per
+    # entry of input_bins. A size equal to a bin edge is expected in the lower
+    # group (e.g. 4096 -> "0 B-4 KiB").
+    expected_size_groups = [
+        pl.Series(
+            name="size",
+            values=[
+                "0 B-4 KiB",
+                "0 B-4 KiB",
+                "0 B-4 KiB",
+                "4 KiB-4 MiB",
+                "4 MiB-1 GiB",
+                "10 GiB-100 GiB",
+                ">1 TiB",
+            ],
+            dtype=pl.Enum(
+                [
+                    "0 B-4 KiB",
+                    "4 KiB-4 MiB",
+                    "4 MiB-1 GiB",
+                    "1 GiB-10 GiB",
+                    "10 GiB-100 GiB",
+                    "100 GiB-1 TiB",
+                    ">1 TiB",
+                ]
+            ),
+        ),
+        pl.Series(
+            name="size",
+            values=[
+                "0 B-4 KiB",
+                "0 B-4 KiB",
+                "0 B-4 KiB",
+                "4 KiB-4 MiB",
+                "4 MiB-1 GiB",
+                "10 GiB-100 GiB",
+                ">1 TiB",
+            ],
+            dtype=pl.Enum(
+                [
+                    "0 B-4 KiB",
+                    "4 KiB-4 MiB",
+                    "4 MiB-1 GiB",
+                    "1 GiB-10 GiB",
+                    "10 GiB-100 GiB",
+                    "100 GiB-1 TiB",
+                    ">1 TiB",
+                ]
+            ),
+        ),
+        pl.Series(
+            name="size",
+            values=[
+                "0 B-1 KiB",
+                "1 KiB-4 KiB",
+                "1 KiB-4 KiB",
+                "4 KiB-10 GiB",
+                "4 KiB-10 GiB",
+                ">10 GiB",
+                ">10 GiB",
+            ],
+            dtype=pl.Enum(["0 B-1 KiB", "1 KiB-4 KiB", "4 KiB-10 GiB", ">10 GiB"]),
+        ),
+        pl.Series(
+            name="size",
+            values=[
+                "0 B-1 MiB",
+                "0 B-1 MiB",
+                "0 B-1 MiB",
+                "0 B-1 MiB",
+                ">1 MiB",
+                ">1 MiB",
+                ">1 MiB",
+            ],
+            dtype=pl.Enum(
+                [
+                    "0 B-1 MiB",
+                    ">1 MiB",
+                ]
+            ),
+        ),
+    ]
+
+    # Bin edges as a caller would pass them: size strings, plain ints, an
+    # unsorted str/int mix containing a 0 and a duplicate (1024 and "1 KiB"),
+    # and a single bare string.
+    input_bins = [
+        ["4 KiB", "4 MiB", "1 GiB", "10 GiB", "100 GiB", "1 TiB"],
+        [4096, 4194304, 1073741824, 10737418240, 107374182400, 1099511627776],
+        ["10 GiB", 1024, 4096, "1 KiB", 0],
+        "1 MiB",
+    ]
+
+    # Bin edges expected back for each input_bins entry: integers in ascending
+    # order, with the duplicate and the 0 removed and the bare string wrapped
+    # in a list.
+    expected_bins = [
+        [4096, 4194304, 1073741824, 10737418240, 107374182400, 1099511627776],
+        [4096, 4194304, 1073741824, 10737418240, 107374182400, 1099511627776],
+        [1024, 4096, 10737418240],
+        [1048576],
+    ]
+
+    # Group labels expected for each expected_bins entry (one more label than
+    # there are edges).
+    expected_labels = [
+        [
+            "0 B-4 KiB",
+            "4 KiB-4 MiB",
+            "4 MiB-1 GiB",
+            "1 GiB-10 GiB",
+            "10 GiB-100 GiB",
+            "100 GiB-1 TiB",
+            ">1 TiB",
+        ],
+        [
+            "0 B-4 KiB",
+            "4 KiB-4 MiB",
+            "4 MiB-1 GiB",
+            "1 GiB-10 GiB",
+            "10 GiB-100 GiB",
+            "100 GiB-1 TiB",
+            ">1 TiB",
+        ],
+        ["0 B-1 KiB", "1 KiB-4 KiB", "4 KiB-10 GiB", ">10 GiB"],
+        ["0 B-1 MiB", ">1 MiB"],
+    ]
+
+    @pytest.mark.parametrize("bins,expected", list(zip(expected_bins, expected_labels)))
+    def test_create_size_bin_labels(self, bins, expected):
+        """Cleaned integer bin edges must map to the expected label list."""
+        assert utils.create_size_bin_labels(bins) == expected
+
+    @pytest.mark.parametrize(
+        "bins,expected",
+        list(zip(input_bins, list(zip(expected_bins, expected_labels)))),
+    )
+    def test_prep_size_distribution(self, bins, expected):
+        """Raw bin input must yield the expected ``(bins, labels)`` tuple."""
+        assert utils.prep_size_distribution(size_bins=bins) == expected
+
+    @pytest.mark.parametrize(
+        "bins,expected", list(zip(input_bins, expected_size_groups)), ids=[1, 2, 3, 4]
+    )
+    def test_calculate_size_distribution(self, bins, expected):
+        """Grouping ``input_sizes`` by ``bins`` must give the expected Enum series."""
+        assert_series_equal(
+            utils.calculate_size_distribution(self.input_sizes, size_bins=bins),
+            expected,
+        )
+
+
+### DateTime and File Age Utils
+@pytest.mark.parametrize(
+    "date,expected",
+    [
+        ("2025-01-01", np.datetime64("2025-01-01T00:00:00.000000000")),
+        (
+            np.datetime64("2025-01-01T00:00:00.000000000"),
+            np.datetime64("2025-01-01T00:00:00.000000000"),
+        ),
+        (1735689600000000000, 1735689600000000000),
+        (None, None),
+    ],
+)
+def test_as_datetime(date, expected):
+    """Check ``utils.as_datetime`` on a date string, a ``datetime64``, an int and None.
+
+    The string case expects a ``np.datetime64`` at midnight; the other inputs
+    are expected to compare equal to themselves.
+    """
+    assert utils.as_datetime(date) == expected
+
+
+def test_as_datetime_fails():
+    """``utils.as_datetime`` must raise ``ValueError`` for an unparseable string."""
+    with pytest.raises(ValueError):
+        utils.as_datetime("not a date")
+
+
+@pytest.mark.parametrize("val", [1, 3, 5])
+@pytest.mark.parametrize("unit", ["D", "W"])
+def test_as_timedelta(val, unit):
+    """``utils.as_timedelta`` must match ``np.timedelta64`` for the same arguments.
+
+    The stacked decorators run every ``val``/``unit`` combination.
+    """
+    assert utils.as_timedelta(val, unit) == np.timedelta64(val, unit)
+
+
+class TestAgeGrouping:
+    """Tests for the age-binning helpers in ``utils``.
+
+    Covers ``create_timedelta_breakpoints``, ``create_timedelta_labels``,
+    ``prep_age_distribution`` and ``calculate_age_distribution``. The
+    class-level lists below are index-aligned: entry ``i`` of each list
+    describes the same scenario, and the parametrize decorators zip them
+    together.
+    """
+
+    # Timestamps to classify, parsed into nanosecond-precision datetimes.
+    timestamps = pl.Series(
+        name="access",
+        values=[
+            "2024-05-12",
+            "2025-02-21",
+            "2025-04-02",
+            "2025-04-17",
+            "2025-05-17",
+        ],
+    ).str.to_datetime(time_unit="ns")
+
+    # Reference date passed to the helpers; every expected breakpoint below is
+    # this date minus the corresponding delta.
+    acq_date = "2025-06-01"
+
+    # Age cutoffs per scenario, given either as a list or as a single number.
+    # The -2 and 0 entries have no counterpart in the expected values below.
+    delta_vals = [[-2, 30, 60, 90, 180], 365, [0, 4, 8, 12, 16], 52]
+
+    # Unit code paired with each delta_vals entry ("D" or "W").
+    delta_unit = ["D", "D", "W", "W"]
+
+    # Expected cutoff datetimes, most recent first; a single-number delta is
+    # expected to give a single datetime rather than a list.
+    expected_breakpoints = [
+        [
+            np.datetime64("2025-05-02T00:00:00.000000000"),
+            np.datetime64("2025-04-02T00:00:00.000000000"),
+            np.datetime64("2025-03-03T00:00:00.000000000"),
+            np.datetime64("2024-12-03T00:00:00.000000000"),
+        ],
+        np.datetime64("2024-06-01T00:00:00.000000000"),
+        [
+            np.datetime64("2025-05-04T00:00:00.000000000"),
+            np.datetime64("2025-04-06T00:00:00.000000000"),
+            np.datetime64("2025-03-09T00:00:00.000000000"),
+            np.datetime64("2025-02-09T00:00:00.000000000"),
+        ],
+        np.datetime64("2024-06-02T00:00:00.000000000"),
+    ]
+
+    # Expected group labels, oldest group first.
+    expected_labels = [
+        [">180D", "90D-180D", "60D-90D", "30D-60D", "<30D"],
+        [
+            ">365D",
+            "<365D",
+        ],
+        [">16W", "12W-16W", "8W-12W", "4W-8W", "<4W"],
+        [">52W", "<52W"],
+    ]
+
+    # Expected label for every value of timestamps, one Enum series per scenario.
+    # NOTE(review): despite the "_days" suffix, this also holds the week-based
+    # scenarios.
+    expected_age_groups_days = [
+        pl.Series(
+            name="access",
+            values=[
+                ">180D",
+                "90D-180D",
+                "60D-90D",
+                "30D-60D",
+                "<30D",
+            ],
+            dtype=pl.Enum([">180D", "90D-180D", "60D-90D", "30D-60D", "<30D"]),
+        ),
+        pl.Series(
+            name="access",
+            values=[">365D", "<365D", "<365D", "<365D", "<365D"],
+            dtype=pl.Enum([">365D", "<365D"]),
+        ),
+        pl.Series(
+            name="access",
+            values=[">16W", "12W-16W", "8W-12W", "4W-8W", "<4W"],
+            dtype=pl.Enum(
+                [">16W", "12W-16W", "8W-12W", "4W-8W", "<4W"],
+            ),
+        ),
+        pl.Series(
+            name="access",
+            values=[">52W", "<52W", "<52W", "<52W", "<52W"],
+            dtype=pl.Enum([">52W", "<52W"]),
+        ),
+    ]
+
+    # NOTE(review): the "_days" suffix on the test names below is historical;
+    # their parameters include the week-based scenarios as well.
+    @pytest.mark.parametrize(
+        "delta_vals,delta_unit,expected",
+        list(zip(delta_vals, delta_unit, expected_breakpoints)),
+    )
+    def test_create_timedelta_breakpoints_days(self, delta_vals, delta_unit, expected):
+        """Deltas applied to ``acq_date`` must give the expected cutoff datetimes."""
+        assert (
+            utils.create_timedelta_breakpoints(self.acq_date, delta_vals, delta_unit)
+            == expected
+        )
+
+    @pytest.mark.parametrize(
+        "delta_vals,delta_unit,expected",
+        list(zip(delta_vals, delta_unit, expected_labels)),
+    )
+    def test_create_timedelta_labels(self, delta_vals, delta_unit, expected):
+        """Deltas and their unit must map to the expected label list."""
+        assert utils.create_timedelta_labels(delta_vals, delta_unit) == expected
+
+    @pytest.mark.parametrize(
+        "age_breakpoints,delta_unit,expected",
+        list(
+            zip(
+                delta_vals, delta_unit, list(zip(expected_breakpoints, expected_labels))
+            )
+        ),
+    )
+    def test_prep_age_distribution_days(self, age_breakpoints, delta_unit, expected):
+        """Raw deltas must yield the expected ``(breakpoints, labels)`` tuple."""
+        assert (
+            utils.prep_age_distribution(self.acq_date, age_breakpoints, delta_unit)
+            == expected
+        )
+
+    @pytest.mark.parametrize(
+        "age_breakpoints,delta_unit,expected",
+        list(zip(delta_vals, delta_unit, expected_age_groups_days)),
+    )
+    def test_calculate_age_distribution_days(
+        self, age_breakpoints, delta_unit, expected
+    ):
+        """Grouping ``timestamps`` by age must give the expected Enum series."""
+        assert_series_equal(
+            utils.calculate_age_distribution(
+                self.timestamps, self.acq_date, age_breakpoints, delta_unit
+            ),
+            expected,
+        )