Source code for beorn.load_input_data.cosmo_sim_pkdgrav

"""PKDGrav3 N-body data loader for BEoRN.

Provides :class:`PKDGravLoader`, a concrete implementation of
:class:`~beorn.load_input_data.nbody_base.NBodyLoader` for data produced by
the `PKDGrav3 <https://github.com/pkdgrav3/pkdgrav3>`_ N-body code.

**Expected data layout** (all paths relative to ``cosmo_sim.file_root``):

- ``<log_filename>`` — PKDGrav3 log file; column 1 is the redshift at each
  step (row index = step number − 1).
- ``<density_subpath>/<prefix>.<step>.den.256.0`` — binary float32 density
  grids on a 256³ mesh (or another resolution; controlled by
  ``simulation.Ncell``).
- ``<halo_subpath>/<prefix>.<step>.<halo_suffix>`` — halo catalog files
  produced by PKDGrav3's built-in FOF finder or an external code such as
  AHF or Rockstar.  Only snapshots that have a catalog file can contribute
  halos during painting; snapshots without one are treated as having zero
  halos.

The step number embedded in each filename is used to look up the
corresponding redshift in the log, so **partial datasets** (not every step
has every file type) are handled correctly.

Example — default PKDGrav3 FOF output::

    loader = PKDGravLoader(parameters)

Example — AHF catalogs in a separate directory, with user-supplied catalog::

    loader = PKDGravLoader(
        parameters,
        halo_subpath="AHF_output",
        halo_glob="*.AHF_halos",
        catalog_yml="my_catalog.yml",
    )
"""
from __future__ import annotations

import re
from pathlib import Path
from typing import Optional

import numpy as np

from .nbody_base import NBodyLoader
from ..structs import Parameters


def _guess_n_particles(file_root: Path, log_filename: str = "") -> Optional[int]:
    """Try to infer the particle count from the directory name or log filename.

    PKDGrav3 runs are commonly named ``CDM_<L>Mpc_<N>`` where ``N`` is the
    number of particles per side.  For example ``CDM_200Mpc_2048`` →
    2048³ = 8,589,934,592 particles.  This function checks the directory
    name first, then falls back to the log filename stem.

    Returns ``None`` if the pattern is not recognised.
    """
    # pattern: anything ending in _<digits>Mpc_<digits> or _<digits>
    pattern = r'_(\d+)(?:Mpc)?_(\d+)'
    for name in (file_root.name, Path(log_filename).stem):
        match = re.search(pattern, name)
        if match:
            n_side = int(match.group(2))
            return n_side ** 3
    return None



[docs]
class PKDGravLoader(NBodyLoader):
    """Loader for PKDGrav3 N-body simulation data.

    Reads the PKDGrav3 log file to map step numbers to redshifts, then
    discovers density and halo files by globbing the configured directories.
    The halo-file glob pattern can be changed to support different halo
    finders (PKDGrav3 built-in FOF, AHF, Rockstar, …).

    All snapshot-catalog caching, partial-dataset handling, and informative
    logging is provided by the parent
    :class:`~beorn.load_input_data.nbody_base.NBodyLoader`.

    Args:
        parameters (Parameters): BEoRN parameter container.  Must have
            ``cosmo_sim.file_root`` pointing at the root of the N-body data.
        density_subpath (str): Subdirectory (relative to ``file_root``)
            containing the binary density-grid files.  Default: ``"nc256"``.
        halo_subpath (str): Subdirectory (relative to ``file_root``) containing
            halo catalog files.  Default: ``"haloes"``.
        halo_glob (str): Glob pattern used to discover halo catalog files
            inside ``halo_subpath``.  Change this to match the output format
            of AHF (``"*.AHF_halos"``), Rockstar (``"halos_*.ascii"``), etc.
            Default: ``"*.fof.txt"`` (PKDGrav3 built-in FOF finder).
        density_glob (str): Glob pattern for density files.
            Default: ``"*.den.256.0"``.
        log_filename (str): Name of the PKDGrav3 log file.
            Default: ``"CDM_200Mpc_2048.log"``.
        catalog_yml (Path | str | None): Path to an existing snapshot catalog
            YML file.  If supplied, BEoRN reads that file directly and skips
            the directory scan.  See :class:`~beorn.load_input_data.nbody_base.NBodyLoader`
            for details.  Default: ``None`` (auto-find or auto-create).
        halo_finder (str | None): Override the halo-finder name stored in the
            YML.  If ``None`` (default), the name is inferred from
            ``halo_glob``.
    """

    simulation_code = "PKDGrav3"

    #: Map halo-file glob patterns to human-readable finder names.
    _HALO_FINDER_NAMES: dict[str, str] = {
        "*.fof.txt":      "PKDGrav3 FOF (built-in)",
        "*.AHF_halos":    "AHF",
        "*.rockstar":     "Rockstar",
        "halos_*.ascii":  "Rockstar (ascii)",
    }


[docs]
    def __init__(
        self,
        parameters: Parameters,
        density_subpath: str = "nc256",
        halo_subpath: str = "haloes",
        halo_glob: str = "*.fof.txt",
        density_glob: str = "*.den.256.0",
        log_filename: str = "CDM_200Mpc_2048.log",
        catalog_yml: Optional[Path] = None,
        halo_finder: Optional[str] = None,
    ):
        self._halo_glob = halo_glob
        self._density_glob = density_glob
        self._log_filename = log_filename

        file_root = Path(parameters.cosmo_sim.file_root)

        # Infer halo finder name from the glob pattern if not supplied.
        resolved_halo_finder = (
            halo_finder
            if halo_finder is not None
            else self._HALO_FINDER_NAMES.get(halo_glob, f"unknown (glob: {halo_glob})")
        )

        # Try to infer particle count from the directory or log filename,
        # e.g. "CDM_200Mpc_2048.log" → 2048³ particles.
        n_particles = _guess_n_particles(file_root, log_filename)

        super().__init__(
            parameters,
            density_dir=file_root / density_subpath,
            halo_dir=file_root / halo_subpath,
            catalog_yml=catalog_yml,
            halo_finder=resolved_halo_finder,
            n_particles=n_particles,
        )


    # ── NBodyLoader abstract interface ────────────────────────────────────────


[docs]
    def discover_snapshots(self) -> list[dict]:
        """Scan the PKDGrav3 data directories and return the snapshot inventory.

        Reads the PKDGrav3 log file (column 1 = redshift at each step, with
        ``row_index = step_number − 1``) and matches density and halo files by
        their embedded step number.

        Returns:
            list[dict]: One entry per density file found, with keys
            ``"redshift"``, ``"density_file"``, and ``"halo_file"`` (the
            latter is ``None`` when no matching catalog exists for that step).
            File paths are relative to ``self.file_root``.
        """
        log_path = self.file_root / self._log_filename
        logs = np.loadtxt(log_path)
        log_z = logs[:, 1]  # redshift at step N is log_z[N - 1]

        def _step(path: Path) -> int:
            """Parse the zero-padded step number from a PKDGrav3 filename."""
            return int(path.name.split('.')[1])

        density_by_step = {
            _step(p): p for p in self.density_dir.glob(self._density_glob)
        }
        catalog_by_step = {
            _step(p): p for p in self.halo_dir.glob(self._halo_glob)
        }

        snapshots = []
        for step in sorted(density_by_step):
            row = step - 1
            if row < 0 or row >= len(log_z):
                self.logger.debug(
                    f"Step {step} has no corresponding row in {self._log_filename} — skipping."
                )
                continue

            density_rel = density_by_step[step].relative_to(self.file_root)
            halo_path = catalog_by_step.get(step)
            halo_rel = str(halo_path.relative_to(self.file_root)) if halo_path else None

            snapshots.append({
                "redshift": float(log_z[row]),
                "density_file": str(density_rel),
                "halo_file": halo_rel,
            })

        return snapshots


    def _read_halo_file(self, path: Path) -> tuple[np.ndarray, np.ndarray]:
        """Read a PKDGrav3 FOF catalog (or compatible text format).

        The expected file format has four whitespace-separated columns::

            mass  x  y  z

        where mass is in :math:`M_\\odot / h` and positions are in Mpc/h.

        Args:
            path (Path): Absolute path to the ``.fof.txt`` file.

        Returns:
            tuple[np.ndarray, np.ndarray]:
                - ``masses`` — halo masses in :math:`M_\\odot`.
                - ``positions`` — (N, 3) positions in comoving Mpc/h,
                  shifted so the box centre is at the origin.
        """
        catalog_array = np.loadtxt(path)
        if catalog_array.ndim == 1:
            if catalog_array.size == 0:
                return np.array([]), np.zeros((0, 3))
            catalog_array = catalog_array[np.newaxis, :]

        masses = catalog_array[:, 0] * self.parameters.cosmology.h0
        positions = catalog_array[:, 1:] + self.parameters.simulation.Lbox / 2
        return masses, positions

    def _read_density_file(self, path: Path) -> np.ndarray:
        """Read a PKDGrav3 binary density-grid file.

        The file contains a flat array of ``float32`` values representing the
        dark-matter particle count per cell on an
        ``Ncell × Ncell × Ncell`` grid stored in Fortran (column-major) order.

        Args:
            path (Path): Absolute path to the ``.den.256.0`` file.

        Returns:
            numpy.ndarray: 3D baryonic overdensity field
            :math:`\\delta_b = \\rho / \\langle\\rho\\rangle - 1`.
        """
        # Always reshape to the native file size — the base class applies
        # degrade_resolution block-averaging afterwards if requested.
        density_array = np.fromfile(path, dtype=np.float32)
        n = round(density_array.size ** (1 / 3))
        density = density_array.reshape(n, n, n).T
        return density / np.mean(density, dtype=np.float64) - 1


[docs]
    def load_rsd_fields(self, redshift_index: int):
        """RSD fields are not available for PKDGrav3 in this implementation.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "RSD fields are not implemented for PKDGravLoader. "
            "Override this method in a subclass if velocity data is available."
        )