"""Tests for hermes_cli.container_boot — the cont-init.d-time
reconciliation that recreates per-profile gateway s6 service slots
from the persistent profiles directory.

These tests run against a fake $HERMES_HOME under tmp_path; no real
s6 supervision tree is required. The in-container integration test
covering end-to-end "docker restart" survival lives in
tests/docker/test_container_restart.py.
"""
from __future__ import annotations

import json
from pathlib import Path

import pytest

from hermes_cli.container_boot import (
    ReconcileAction,
    reconcile_profile_gateways,
)


# ---------------------------------------------------------------------------
# Fixtures + helpers
# ---------------------------------------------------------------------------


def _make_profile(
    hermes_home: Path,
    name: str,
    *,
    state: str | None,
    with_pid: bool = False,
    config: bool = True,
) -> Path:
    """Create a fake profile directory under hermes_home/profiles/<name>/."""
    p = hermes_home / "profiles" / name
    p.mkdir(parents=True)
    if config:
        # SOUL.md is what the reconciler keys on — it's always seeded by
        # `hermes profile create`. See container_boot._render_run_script.
        (p / "SOUL.md").write_text("# fake profile\n")
    if state is not None:
        (p / "gateway_state.json").write_text(json.dumps({
            "gateway_state": state, "timestamp": 1234567890,
        }))
    if with_pid:
        (p / "gateway.pid").write_text(json.dumps(
            {"pid": 99999, "host": "old-container"},
        ))
        (p / "processes.json").write_text("[]")
    return p


def _seed_default_root(
    hermes_home: Path,
    *,
    state: str | None = None,
    with_pid: bool = False,
) -> None:
    """Populate gateway_state.json / stale runtime files at the
    HERMES_HOME root (the implicit default profile)."""
    if state is not None:
        (hermes_home / "gateway_state.json").write_text(json.dumps({
            "gateway_state": state, "timestamp": 1234567890,
        }))
    if with_pid:
        (hermes_home / "gateway.pid").write_text(json.dumps(
            {"pid": 99999, "host": "old-container"},
        ))
        (hermes_home / "processes.json").write_text("[]")


def _named_actions(actions: list[ReconcileAction]) -> list[ReconcileAction]:
    """Drop the always-present default-profile action so tests that
    only care about named profiles can assert against a clean list."""
    return [a for a in actions if a.profile != "default"]


# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------


def test_running_profile_is_registered_and_autostarted(tmp_path: Path) -> None:
    """A profile last seen ``running`` gets a populated service slot with
    no down-marker and is reported with action ``started``."""
    scan_root = tmp_path / "run-service"
    scan_root.mkdir()
    _make_profile(tmp_path, "coder", state="running")

    result = reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scan_root, dry_run=False,
    )

    expected = ReconcileAction(
        profile="coder", prior_state="running", action="started",
    )
    assert _named_actions(result) == [expected]

    slot = scan_root / "gateway-coder"
    run_script = slot / "run"
    assert run_script.exists()
    assert run_script.stat().st_mode & 0o111  # at least one execute bit set
    assert (slot / "type").read_text().strip() == "longrun"
    # No down-marker is how auto-start is expressed.
    assert not (slot / "down").exists()


def test_stopped_profile_is_registered_but_not_started(tmp_path: Path) -> None:
    """A profile last seen ``stopped`` gets a slot carrying a down-marker
    and is reported with action ``registered``."""
    scan_root = tmp_path / "run-service"
    scan_root.mkdir()
    _make_profile(tmp_path, "writer", state="stopped")

    result = reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scan_root, dry_run=False,
    )

    expected = ReconcileAction(
        profile="writer", prior_state="stopped", action="registered",
    )
    assert _named_actions(result) == [expected]
    # The down file is the signal that the service must not be started.
    down_marker = scan_root / "gateway-writer" / "down"
    assert down_marker.exists()


def test_startup_failed_does_not_autostart(tmp_path: Path) -> None:
    """A gateway that failed to boot last time is registered in the down
    state, so a container restart does not turn into a crash loop."""
    scan_root = tmp_path / "run-service"
    scan_root.mkdir()
    _make_profile(tmp_path, "broken", state="startup_failed")

    result = reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scan_root, dry_run=False,
    )

    first_named = _named_actions(result)[0]
    assert first_named.action == "registered"
    down_marker = scan_root / "gateway-broken" / "down"
    assert down_marker.exists()


def test_starting_state_does_not_autostart(tmp_path: Path) -> None:
    """`starting` means the gateway died mid-boot last time; treat as
    failed, not as a candidate for auto-restart.

    Checks both halves of "not auto-started": the reported action is
    ``registered`` and the slot carries a down-marker, matching the
    assertions made for the ``startup_failed`` state.
    """
    scandir = tmp_path / "run-service"
    scandir.mkdir()
    _make_profile(tmp_path, "unlucky", state="starting")

    actions = reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scandir, dry_run=False,
    )

    named = _named_actions(actions)
    assert named[0].action == "registered"
    # Without the down-marker the slot would still be started on boot,
    # regardless of what the action list reports.
    assert (scandir / "gateway-unlucky" / "down").exists()


def test_stale_runtime_files_are_removed(tmp_path: Path) -> None:
    """``gateway.pid`` and ``processes.json`` left inside a named profile
    are deleted by a (non-dry-run) reconciliation."""
    scan_root = tmp_path / "run-service"
    scan_root.mkdir()
    profile_dir = _make_profile(
        tmp_path, "coder", state="running", with_pid=True,
    )
    pid_file = profile_dir / "gateway.pid"
    processes_file = profile_dir / "processes.json"
    # Sanity check: the fixture really did create the stale files.
    assert pid_file.exists()
    assert processes_file.exists()

    reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scan_root, dry_run=False,
    )

    assert not pid_file.exists()
    assert not processes_file.exists()


def test_profile_without_state_file_is_registered_but_not_started(
    tmp_path: Path,
) -> None:
    """A profile with no ``gateway_state.json`` (created but never
    started) gets a slot in the down state with ``prior_state=None``."""
    scan_root = tmp_path / "run-service"
    scan_root.mkdir()
    _make_profile(tmp_path, "fresh", state=None)

    result = reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scan_root, dry_run=False,
    )

    expected = ReconcileAction(
        profile="fresh", prior_state=None, action="registered",
    )
    assert _named_actions(result) == [expected]
    down_marker = scan_root / "gateway-fresh" / "down"
    assert down_marker.exists()


def test_directory_without_marker_file_is_skipped(tmp_path: Path) -> None:
    """A directory under ``profiles/`` that lacks ``SOUL.md`` (the marker
    the reconciler keys on) is not a profile: no action, no slot."""
    scan_root = tmp_path / "run-service"
    scan_root.mkdir()
    # Bare directory only; deliberately no SOUL.md inside it.
    stray_dir = tmp_path / "profiles" / "stray"
    stray_dir.mkdir(parents=True)

    result = reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scan_root, dry_run=False,
    )

    assert _named_actions(result) == []
    assert not (scan_root / "gateway-stray").exists()


def test_corrupt_state_file_treated_as_no_prior_state(tmp_path: Path) -> None:
    """Malformed JSON in ``gateway_state.json`` must not abort the
    reconciliation; the slot is registered in the down state instead."""
    scan_root = tmp_path / "run-service"
    scan_root.mkdir()
    profile_dir = _make_profile(tmp_path, "junk", state="running")
    # Replace the valid state file with something json cannot parse.
    state_file = profile_dir / "gateway_state.json"
    state_file.write_text("{ not valid json")

    result = reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scan_root, dry_run=False,
    )

    first_named = _named_actions(result)[0]
    assert first_named.action == "registered"  # i.e. not "started"
    down_marker = scan_root / "gateway-junk" / "down"
    assert down_marker.exists()


def test_reconcile_log_is_written(tmp_path: Path) -> None:
    """Reconciliation writes ``logs/container-boot.log`` mentioning each
    profile and the action taken for it."""
    scan_root = tmp_path / "run-service"
    scan_root.mkdir()
    _make_profile(tmp_path, "a", state="running")
    _make_profile(tmp_path, "b", state="stopped")

    reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scan_root, dry_run=False,
    )

    boot_log = tmp_path / "logs" / "container-boot.log"
    log_text = boot_log.read_text()
    for fragment in (
        "profile=a",
        "action=started",
        "profile=b",
        "action=registered",
    ):
        assert fragment in log_text


def test_reconcile_log_rotates_when_size_exceeded(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """An existing ``container-boot.log`` larger than ``_LOG_ROTATE_BYTES``
    is moved to ``.1`` and the new entries start a fresh log file."""
    from hermes_cli import container_boot

    # Shrink the threshold so a 300-byte file already counts as oversized.
    monkeypatch.setattr(container_boot, "_LOG_ROTATE_BYTES", 200)

    logs_dir = tmp_path / "logs"
    current_log = logs_dir / "container-boot.log"
    logs_dir.mkdir()
    oversized = "X" * 300
    current_log.write_text(oversized)

    scan_root = tmp_path / "run-service"
    scan_root.mkdir()
    _make_profile(tmp_path, "coder", state="running")

    reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scan_root, dry_run=False,
    )

    rotated_log = logs_dir / "container-boot.log.1"
    assert rotated_log.exists(), "expected previous log to be rotated to .1"
    assert rotated_log.read_text().startswith(oversized)
    # The live log holds only the new entries: none of the old filler.
    fresh_text = current_log.read_text()
    assert "X" not in fresh_text
    assert "profile=coder" in fresh_text


def test_reconcile_log_does_not_rotate_below_threshold(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """An existing log well under the threshold is appended to in place
    and no ``.1`` file appears."""
    from hermes_cli import container_boot

    monkeypatch.setattr(container_boot, "_LOG_ROTATE_BYTES", 10_000_000)

    logs_dir = tmp_path / "logs"
    current_log = logs_dir / "container-boot.log"
    logs_dir.mkdir()
    current_log.write_text("previous entry\n")

    scan_root = tmp_path / "run-service"
    scan_root.mkdir()
    _make_profile(tmp_path, "coder", state="running")

    reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scan_root, dry_run=False,
    )

    rotated_log = logs_dir / "container-boot.log.1"
    assert not rotated_log.exists()
    # Old content is still at the front, new entries follow it.
    log_text = current_log.read_text()
    assert log_text.startswith("previous entry\n")
    assert "profile=coder" in log_text


def test_reconcile_log_rotation_overwrites_existing_dot1(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A rotation replaces any earlier ``.1`` file, so only a single
    rotated generation is kept alongside the live log."""
    from hermes_cli import container_boot

    monkeypatch.setattr(container_boot, "_LOG_ROTATE_BYTES", 200)

    logs_dir = tmp_path / "logs"
    logs_dir.mkdir()
    rotated_log = logs_dir / "container-boot.log.1"
    rotated_log.write_text("OLD ROTATION")
    oversized = "Y" * 300
    (logs_dir / "container-boot.log").write_text(oversized)

    scan_root = tmp_path / "run-service"
    scan_root.mkdir()
    _make_profile(tmp_path, "coder", state="running")

    reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scan_root, dry_run=False,
    )

    # The .1 file must now hold the former live log, not the old rotation.
    rotated_text = rotated_log.read_text()
    assert "OLD ROTATION" not in rotated_text
    assert rotated_text.startswith(oversized)


def test_dry_run_makes_no_filesystem_changes(tmp_path: Path) -> None:
    """With ``dry_run=True`` the action list is still computed, but no
    stale runtime file is removed, no service slot is created, and no
    boot log is written."""
    scandir = tmp_path / "run-service"
    scandir.mkdir()
    profile = _make_profile(tmp_path, "coder", state="running", with_pid=True)

    actions = reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scandir, dry_run=True,
    )

    # The action list is still produced...
    assert _named_actions(actions) == [ReconcileAction(
        profile="coder", prior_state="running", action="started",
    )]
    # ...but nothing on disk was touched. Both runtime files seeded by
    # with_pid=True must survive, not just the pid file.
    assert (profile / "gateway.pid").exists()  # not removed under dry_run
    assert (profile / "processes.json").exists()
    assert not (scandir / "gateway-coder").exists()
    assert not (tmp_path / "logs" / "container-boot.log").exists()


def test_missing_profiles_root_still_registers_default_slot(
    tmp_path: Path,
) -> None:
    """With no ``$HERMES_HOME/profiles`` directory at all (fresh
    install), reconciliation must not raise and must still register a
    ``gateway-default`` slot for the root profile.

    An earlier behavior returned an empty list here; the default slot is
    now always present so `hermes gateway start` (no -p) has somewhere
    to land.
    """
    scan_root = tmp_path / "run-service"
    scan_root.mkdir()

    result = reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scan_root, dry_run=False,
    )

    only_default = ReconcileAction(
        profile="default", prior_state=None, action="registered",
    )
    assert result == [only_default]
    default_slot = scan_root / "gateway-default"
    assert default_slot.is_dir()
    assert (default_slot / "down").exists()


def test_invalid_profile_name_in_directory_raises(tmp_path: Path) -> None:
    """A profile directory with a name that fails profile-name
    validation (here: uppercase letters) makes reconciliation raise
    ``ValueError`` instead of quietly creating a bad s6 service dir."""
    scan_root = tmp_path / "run-service"
    scan_root.mkdir()
    _make_profile(tmp_path, "BadName", state="running")

    reconcile_kwargs = dict(
        hermes_home=tmp_path, scandir=scan_root, dry_run=False,
    )
    with pytest.raises(ValueError):
        reconcile_profile_gateways(**reconcile_kwargs)


def test_register_service_publishes_atomically(tmp_path: Path) -> None:
    """Service dirs are meant to be built in a sibling tmp directory and
    renamed into place, so a concurrent s6-svscan rescan never sees a
    half-populated slot.

    The rename itself is not observable from here, so the invariant is
    checked indirectly: after a clean reconcile the target directory
    holds every required file and no ``*.tmp`` sibling is left in the
    scan directory.
    """
    scan_root = tmp_path / "run-service"
    scan_root.mkdir()
    _make_profile(tmp_path, "coder", state="running")

    reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scan_root, dry_run=False,
    )

    # Nothing temporary may remain next to the published slot.
    leftover = list(scan_root.glob("*.tmp"))
    assert leftover == [], f"leftover tmp directories: {leftover}"

    # Every file the slot needs is present.
    slot = scan_root / "gateway-coder"
    for required in (slot / "type", slot / "run", slot / "log" / "run"):
        assert required.exists()


def test_register_service_overwrites_existing_slot(tmp_path: Path) -> None:
    """A second reconciliation pass cleanly replaces an existing
    slot (the tmp+rename publication overwrites the previous one).

    The profile goes from ``running`` to ``stopped`` between the two
    passes, so the slot must gain a down-marker while its run script
    stays the same and no ``*.tmp`` directory is left behind.
    """
    scandir = tmp_path / "run-service"
    scandir.mkdir()
    profile = _make_profile(tmp_path, "coder", state="running")
    svc = scandir / "gateway-coder"

    # First pass.
    reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scandir, dry_run=False,
    )
    first_run = (svc / "run").read_text()
    # Precondition: a running profile has no down-marker yet. Without
    # this check the final assertion could not prove that the second
    # pass actually replaced the slot.
    assert not (svc / "down").exists()

    # Flip the persisted state so the second pass has to publish a
    # different slot (down-marker added). The run script itself does not
    # depend on the state, so it is expected to be unchanged.
    (profile / "gateway_state.json").write_text(
        '{"gateway_state": "stopped"}',
    )
    reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scandir, dry_run=False,
    )

    # Slot still exists with the same run script, no .tmp remnants.
    assert (svc / "run").read_text() == first_run
    assert list(scandir.glob("*.tmp")) == []
    # Down marker now present (state went from running → stopped).
    assert (svc / "down").exists()


def test_register_service_cleans_up_stale_tmp_dir(tmp_path: Path) -> None:
    """A ``gateway-<name>.tmp`` directory left by an interrupted earlier
    run is removed by the next reconcile instead of making it fail."""
    scan_root = tmp_path / "run-service"
    scan_root.mkdir()

    # Plant a non-empty leftover where the tmp build dir would go.
    leftover_dir = scan_root / "gateway-coder.tmp"
    leftover_dir.mkdir()
    (leftover_dir / "stale-file").write_text("garbage")

    _make_profile(tmp_path, "coder", state="running")
    reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scan_root, dry_run=False,
    )

    assert not leftover_dir.exists()
    published_run = scan_root / "gateway-coder" / "run"
    assert published_run.exists()


# ---------------------------------------------------------------------------
# Default-profile slot — always registered (PR #30136 review item I1)
# ---------------------------------------------------------------------------


def test_default_slot_always_registered_on_empty_home(tmp_path: Path) -> None:
    """An otherwise empty HERMES_HOME still yields exactly one action:
    a ``gateway-default`` slot registered in the down state."""
    scan_root = tmp_path / "run-service"
    scan_root.mkdir()

    result = reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scan_root, dry_run=False,
    )

    only_default = ReconcileAction(
        profile="default", prior_state=None, action="registered",
    )
    assert result == [only_default]

    default_slot = scan_root / "gateway-default"
    assert default_slot.is_dir()
    assert (default_slot / "run").exists()
    assert (default_slot / "down").exists()


def test_default_slot_run_script_omits_profile_flag(tmp_path: Path) -> None:
    """The default slot's run script calls `hermes gateway run` with no
    `-p default`: that flag would point at
    $HERMES_HOME/profiles/default/ rather than the root profile."""
    scan_root = tmp_path / "run-service"
    scan_root.mkdir()

    reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scan_root, dry_run=False,
    )

    script_text = (scan_root / "gateway-default" / "run").read_text()
    assert "hermes gateway run" in script_text
    # Neither the bare nor the shell-quoted spelling may appear.
    for forbidden in ("-p default", "-p 'default'"):
        assert forbidden not in script_text


def test_default_slot_autostarts_when_root_state_running(tmp_path: Path) -> None:
    """A root-level ``gateway_state.json`` with state ``running`` makes
    the default slot auto-start: action ``started``, no down-marker."""
    scan_root = tmp_path / "run-service"
    scan_root.mkdir()
    _seed_default_root(tmp_path, state="running")

    result = reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scan_root, dry_run=False,
    )

    default_entry = next(
        entry for entry in result if entry.profile == "default"
    )
    assert default_entry.prior_state == "running"
    assert default_entry.action == "started"
    down_marker = scan_root / "gateway-default" / "down"
    assert not down_marker.exists()


@pytest.mark.parametrize(
    "container_argv",
    [
        ("gateway", "run"),
        ("/init", "/opt/hermes/docker/main-wrapper.sh", "gateway", "run"),
    ],
)
def test_legacy_gateway_run_cmd_seeds_default_running_state(
    tmp_path: Path,
    container_argv: tuple[str, ...],
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Pre-s6 Docker users often ran `gateway run` as the container
    command. With no persisted gateway_state.json yet, s6 reconciliation
    must migrate that legacy intent into a running default gateway slot.

    Both the bare command and the form wrapped by ``/init`` plus the
    main wrapper script are covered via ``container_argv``.
    """
    # HERMES_GATEWAY_NO_SUPERVISE is an opt-out from this migration (see
    # the env opt-out test in this module). Clear it so a value inherited
    # from the developer's shell or CI cannot make this test fail.
    monkeypatch.delenv("HERMES_GATEWAY_NO_SUPERVISE", raising=False)

    scandir = tmp_path / "run-service"
    scandir.mkdir()

    actions = reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=scandir,
        dry_run=False,
        container_argv=container_argv,
    )

    default_action = next(a for a in actions if a.profile == "default")
    assert default_action.prior_state == "running"
    assert default_action.action == "started"
    assert not (scandir / "gateway-default" / "down").exists()
    # The migrated intent is persisted, tagged with where it came from.
    state = json.loads((tmp_path / "gateway_state.json").read_text())
    assert state["gateway_state"] == "running"
    assert state["migrated_from"] == "legacy-container-cmd"


@pytest.mark.parametrize(
    "container_argv",
    [
        ("gateway", "run", "--no-supervise"),
        ("/init", "/opt/hermes/docker/main-wrapper.sh", "gateway", "run", "--no-supervise"),
    ],
)
def test_legacy_gateway_run_no_supervise_does_not_seed_s6_state(
    tmp_path: Path,
    container_argv: tuple[str, ...],
) -> None:
    """Passing ``--no-supervise`` to `gateway run` opts out of the s6
    migration: the default slot stays down and no state file is seeded."""
    scan_root = tmp_path / "run-service"
    scan_root.mkdir()

    result = reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=scan_root,
        dry_run=False,
        container_argv=container_argv,
    )

    default_entry = next(
        entry for entry in result if entry.profile == "default"
    )
    assert default_entry.prior_state is None
    assert default_entry.action == "registered"
    down_marker = scan_root / "gateway-default" / "down"
    assert down_marker.exists()
    seeded_state = tmp_path / "gateway_state.json"
    assert not seeded_state.exists()


def test_legacy_gateway_run_env_no_supervise_does_not_seed_s6_state(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Setting ``HERMES_GATEWAY_NO_SUPERVISE=1`` opts out of the s6
    migration in the same way the CLI `--no-supervise` flag does."""
    scan_root = tmp_path / "run-service"
    scan_root.mkdir()
    monkeypatch.setenv("HERMES_GATEWAY_NO_SUPERVISE", "1")

    result = reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=scan_root,
        dry_run=False,
        container_argv=("gateway", "run"),
    )

    default_entry = next(
        entry for entry in result if entry.profile == "default"
    )
    assert default_entry.prior_state is None
    assert default_entry.action == "registered"
    down_marker = scan_root / "gateway-default" / "down"
    assert down_marker.exists()
    seeded_state = tmp_path / "gateway_state.json"
    assert not seeded_state.exists()


def test_default_slot_does_not_autostart_when_root_state_stopped(
    tmp_path: Path,
) -> None:
    """A persisted ``stopped`` state at the root keeps the default slot
    down, even when the container command is the legacy `gateway run`,
    and the state file still reads ``stopped`` afterwards."""
    scan_root = tmp_path / "run-service"
    scan_root.mkdir()
    _seed_default_root(tmp_path, state="stopped")

    result = reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=scan_root,
        dry_run=False,
        container_argv=("gateway", "run"),
    )

    default_entry = next(
        entry for entry in result if entry.profile == "default"
    )
    assert default_entry.action == "registered"
    down_marker = scan_root / "gateway-default" / "down"
    assert down_marker.exists()
    persisted = json.loads((tmp_path / "gateway_state.json").read_text())
    assert persisted["gateway_state"] == "stopped"


def test_default_slot_does_not_autostart_when_root_state_startup_failed(
    tmp_path: Path,
) -> None:
    """Crash-loop guard applies to the default slot too.

    A root-level ``startup_failed`` state must leave the default slot
    registered with a down-marker, the same outcome asserted for named
    profiles in that state.
    """
    scandir = tmp_path / "run-service"
    scandir.mkdir()
    _seed_default_root(tmp_path, state="startup_failed")

    actions = reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scandir, dry_run=False,
    )

    default_action = next(a for a in actions if a.profile == "default")
    assert default_action.action == "registered"
    # The guard is only effective if the slot is actually marked down.
    assert (scandir / "gateway-default" / "down").exists()


def test_default_slot_cleans_up_stale_runtime_files_at_root(
    tmp_path: Path,
) -> None:
    """Stale ``gateway.pid`` and ``processes.json`` sitting at the
    HERMES_HOME root (from the previous container's default gateway) are
    removed just as they are for named profiles."""
    scan_root = tmp_path / "run-service"
    scan_root.mkdir()
    _seed_default_root(tmp_path, state="running", with_pid=True)
    pid_file = tmp_path / "gateway.pid"
    processes_file = tmp_path / "processes.json"
    # Sanity check that the seeding helper wrote the pid file.
    assert pid_file.exists()

    reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scan_root, dry_run=False,
    )

    assert not pid_file.exists()
    assert not processes_file.exists()


def test_default_slot_appears_before_named_profiles(tmp_path: Path) -> None:
    """The action list starts with ``default`` and the named profiles
    follow in a stable order, which operators and readers of the boot
    log rely on.

    The profiles are created in reverse alphabetical order on purpose;
    the expected result lists them sorted by name.
    """
    scan_root = tmp_path / "run-service"
    scan_root.mkdir()
    for profile_name in ("z-last-alphabetically", "a-first-alphabetically"):
        _make_profile(tmp_path, profile_name, state="stopped")

    result = reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scan_root, dry_run=False,
    )

    reported_order = [entry.profile for entry in result]
    assert reported_order == [
        "default",
        "a-first-alphabetically",
        "z-last-alphabetically",
    ]


def test_profiles_default_subdir_is_skipped_with_warning(
    tmp_path: Path,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """A user-made ``profiles/default/`` clashes with the reserved
    root-profile slot: the named entry is skipped, so ``gateway-default``
    is registered once, and a log message mentions the skipped path."""
    import logging

    caplog.set_level(logging.WARNING)
    scan_root = tmp_path / "run-service"
    scan_root.mkdir()
    _make_profile(tmp_path, "default", state="running")

    result = reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scan_root, dry_run=False,
    )

    # Exactly one "default" action: the root profile, not the colliding
    # named one.
    default_count = sum(1 for entry in result if entry.profile == "default")
    assert default_count == 1
    # Operators need to be told that the named profile was ignored.
    captured = [record.message for record in caplog.records]
    assert any("profiles/default/" in text for text in captured)
