# RT benchmark baseline: computed-goto threaded dispatch (-DSBC_THREADED_DISPATCH).
# Captured: dd78c53 + RT-1 work-in-progress (May 2026)
# Machine: w64devkit gcc 12.2.0 / Win64 / Intel i7-12700H class
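# Methodology: 5 reps per benchmark; the fastest and slowest are
# dropped and the mean of the middle 3 is reported.
# Variance: ±5-15% run-to-run.
# Note: ns/op is not always exactly s/N (mix1, tri3 and 1arg read
# 1 ns lower) -- presumably the two columns are rounded separately
# from the raw timings; TODO confirm against the harness.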
#
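# Finding: threaded dispatch is *not* a win on this codebase.  Most
# benchmarks come out 3-33% slower than the switch path; a couple
# (bits, alt) come out marginally faster.  Mean degradation: ~8%.
# Caveat: ~8% is inside the ±5-15% run-to-run variance, so only the
# larger per-benchmark slowdowns are clearly above the noise.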
# Root cause inferred from disassembly: GCC re-emits the dispatch
# table address (lea dt(%rip),%reg) at every per-opcode dispatch
# site instead of pinning it into a register, inflating the per-site
# dispatch cost.  The classical expectation that threaded dispatch
# beats a switch doesn't hold here -- modern GCC's switch optimizer
# plus a strong CPU indirect-branch predictor close the gap.  The
# SBC_THREADED_DISPATCH compile-time toggle stays in place for future
# experimentation (e.g. retry on Linux with -fno-pic or with a
# different CPU class), but the default remains the switch path.

[bn_loop]
# N=30000000 warmup=100000
count 30000000: 0.48 s, 16 ns/op
xor   30000000: 0.44 s, 14 ns/op

[bn_arith]
# N=5000000 warmup=50000
mix1  5000000: 0.090 s, 17 ns/op
mix2  5000000: 0.100 s, 20 ns/op
bits  5000000: 0.070 s, 14 ns/op

[bn_branch]
# N=5000000 warmup=50000
alt   5000000: 0.110 s, 22 ns/op
tri3  5000000: 0.205 s, 40 ns/op

[bn_call]
# N=2000000 warmup=20000
1arg  2000000: 0.090 s, 44 ns/op
2arg  2000000: 0.105 s, 52 ns/op

[bn_mcall]
# N=1000000 warmup=10000
negneg 1000000: 0.054 s, 54 ns/op
abs    1000000: 0.044 s, 44 ns/op

[bn_list]
# N=2000000 warmup=20000
iget  2000000: 0.092 s, 46 ns/op
iset  2000000: 0.048 s, 24 ns/op
lsize 2000000: 0.068 s, 34 ns/op
