# RT-bench baseline: pre-RT-4..8 cache-locality work.
# Captured: 8500db0 (RT-2 + bn_gc).
# Machine: w64devkit gcc 12.2.0 / Win64 / Intel i7-12700H class.
# Methodology: median of 3 steady-state runs; expect ±10-15% run-to-run
# variance even with the bench-setup-windows.ps1 mitigations.
#
# NOTE: the first capture was taken on a cold/throttled CPU and
# showed numbers ~30% higher than the steady-state median.  The
# values below are the re-measured median of 3 runs at steady
# state, which is the meaningful before-point for
# "did RT-8a help?".
#
# Goal: track cache-locality work (RT-4 / RT-5 / RT-6 / RT-7 / RT-8)
# against these reference points.  Each RT item's "after" baseline
# goes in its own file (baseline-post-rt8a.txt, etc).

[bn_loop]
count    30000000:  ~17 ns/op
xor      30000000:  ~17 ns/op

[bn_arith]
mix1      5000000:  ~21 ns/op
mix2      5000000:  ~24 ns/op
bits      5000000:  ~19 ns/op

[bn_branch]
alt       5000000:  ~26 ns/op
tri3      5000000:  ~45 ns/op

[bn_call]
1arg      2000000:  ~38 ns/op
2arg      2000000:  ~40 ns/op

[bn_mcall]
negneg    1000000:  ~43 ns/op
abs       1000000:  ~40 ns/op

[bn_list]
iget      2000000:  ~36 ns/op
iset      2000000:  ~18 ns/op
lsize     2000000:  ~26 ns/op

[bn_gc.wbarrier]   # RT-4 target
wbarrier   500000:  ~48 ns/op

[bn_gc.megamcall]  # RT-6/7 target
megamcall  500000:  ~60 ns/op

[bn_gc.closure]    # RT-5 target
closure    500000:  ~395 ns/op

[bn_gc.deepgc]     # RT-8b target
deepgc        200:  ~0.9-1.2 ms per 64-frame gc()
