Ztorch Benchmarking
Performance measurement and validation strategy.
Philosophy
- Napkin math first - Estimate before measuring
- Prove every gain - Optimizations must show measurable improvement
- Track regressions - Continuous benchmarking catches slowdowns
- Document results - Publish benchmarks for transparency
Napkin Math
Before implementing any operation, estimate its performance.
Example: MatMul
Operation: C = A @ B
Shapes: A(M, K), B(K, N), C(M, N)
Example: (1024, 1024) @ (1024, 1024)
FLOPs:
2 * M * K * N = 2 * 1024^3 = 2,147,483,648 FLOPs ≈ 2.1 GFLOPs
Memory:
Read: M*K + K*N = 1024*1024 + 1024*1024 = 2*1024^2 = 2,097,152 elements
Write: M*N = 1024*1024 = 1,048,576 elements
Total: 3 * 1024^2 * 4 bytes = 12 MB
GPU: RTX 4090
Peak compute: 82 TFLOPS (FP32)
Peak bandwidth: 1 TB/s
Compute bound if: FLOPs/byte > Peak FLOPs / Peak bandwidth
2.1 GFLOPs / 12 MB = 175 FLOPs/byte
82 TFLOPS / 1 TB/s = 82 FLOPs/byte
175 > 82, so compute bound ✓
Expected time (compute bound):
2.1 GFLOPs / 82 TFLOPS = 25.6 µs
Expected time (memory bound):
12 MB / 1 TB/s = 12 µs
Realistically: ~25-50 µs (accounting for overhead)
CPU: Single core, 5 GHz
Peak (optimistic): ~100 GFLOPS (AVX2)
Realistic: ~20 GFLOPS
Expected time:
2.1 GFLOPs / 20 GFLOPS = 105 ms
The estimate gives us:
- Performance targets
- Expected speedup ratios
- Sanity checks
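The same arithmetic can be scripted so every new operation gets a target before any kernel exists. A minimal sketch of a roofline estimator; the file name, the Roofline struct, and the peak numbers are illustrative, not part of Ztorch:
// bench/napkin.zig (hypothetical helper, not part of the framework below)
const std = @import("std");

const Roofline = struct {
    peak_gflops: f64, // peak compute in GFLOP/s, e.g. 82_000 for RTX 4090 FP32
    peak_gbps: f64, // peak memory bandwidth in GB/s, e.g. 1000 for RTX 4090

    /// Estimated runtime in microseconds for C = A @ B with f32 operands.
    fn matmulEstimateUs(self: Roofline, m: u64, k: u64, n: u64) f64 {
        const flops = 2.0 * @as(f64, @floatFromInt(m * k * n));
        const bytes = 4.0 * @as(f64, @floatFromInt(m * k + k * n + m * n));
        const compute_us = flops / self.peak_gflops / 1e3; // FLOPs at GFLOP/s -> µs
        const memory_us = bytes / self.peak_gbps / 1e3; // bytes at GB/s -> µs
        return @max(compute_us, memory_us); // roofline: the slower side wins
    }
};

pub fn main() void {
    const rtx4090 = Roofline{ .peak_gflops = 82_000, .peak_gbps = 1000 };
    // Prints ~26 µs for the 1024^3 example above, matching the hand estimate.
    std.debug.print("estimate: {d:.1} us\n", .{rtx4090.matmulEstimateUs(1024, 1024, 1024)});
}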
Benchmark Framework
Implementation
// bench/framework.zig
const std = @import("std");
const ztorch = @import("ztorch");
const duration = ztorch.util.duration;
pub const BenchResult = struct {
name: []const u8,
iterations: usize,
total_ns: u64,
mean_ns: u64,
median_ns: u64,
min_ns: u64,
max_ns: u64,
p99_ns: u64,
pub fn print(self: BenchResult) !void {
std.debug.print("=== Benchmark: {s} ===\n", .{self.name});
std.debug.print("Iterations: {}\n", .{self.iterations});
try printStat("Mean", self.mean_ns);
try printStat("Median", self.median_ns);
try printStat("Min", self.min_ns);
try printStat("Max", self.max_ns);
try printStat("P99", self.p99_ns);
}
};
pub fn benchmark(
comptime name: []const u8,
comptime iterations: usize,
func: anytype,
args: anytype,
) !BenchResult {
var times = try std.heap.page_allocator.alloc(u64, iterations);
defer std.heap.page_allocator.free(times);
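// Warm up: run a few untimed iterations first so caches and branch predictors settle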
const warmup_iterations: usize = if (iterations < 10) iterations else 10;
for (0..warmup_iterations) |_| {
try @call(.auto, func, args);
}
// Measure
for (0..iterations) |i| {
var timer = try std.time.Timer.start();
try @call(.auto, func, args);
times[i] = timer.read();
}
// Sort for median and percentiles
std.sort.pdq(u64, times, {}, comptime std.sort.asc(u64));
// Compute statistics
var total: u64 = 0;
for (times) |t| total += t;
return BenchResult{
.name = name,
.iterations = iterations,
.total_ns = total,
.mean_ns = total / iterations,
.median_ns = times[iterations / 2],
.min_ns = times[0],
.max_ns = times[iterations - 1],
.p99_ns = times[(iterations * 99) / 100],
};
}
fn printStat(label: []const u8, ns: u64) !void {
const text = try duration.formatDuration(std.heap.page_allocator, ns);
defer std.heap.page_allocator.free(text);
std.debug.print("{s}: {s}\n", .{ label, text });
}
The duration formatter is shared with the custom test runner, ensuring benchmarks and tests report timings in a consistent style.
Usage
const std = @import("std");
const framework = @import("../framework.zig");
fn step(allocator: std.mem.Allocator) !void {
// code-under-test goes here
_ = allocator;
}
pub fn main() !void {
var gpa = std.heap.GeneralPurposeAllocator(.{}){};
defer _ = gpa.deinit();
const result = try framework.benchmark(
"demo.step",
100,
step,
.{ gpa.allocator() },
);
try result.print();
}
Operation Benchmarks
MatMul Benchmarks
$ zig build bench
...
=== Benchmark: matmul.cpu_scalar ===
Iterations: 10
Mean: 52.52 ms
Median: 52.49 ms
Min: 52.27 ms
Max: 53.38 ms
P99: 53.38 ms
Size: 256x256, GFLOPS: 0.64
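The GFLOPS line is not produced by BenchResult.print; each benchmark derives it from mean_ns. A sketch of the helper a matmul benchmark might use (matmulGflops is a hypothetical name, and result is the BenchResult returned by framework.benchmark):
const framework = @import("../framework.zig");

/// GFLOP/s for an (M, K) @ (K, N) matmul. FLOPs / ns is already GFLOP/s,
/// since both the giga prefix and the nanosecond carry a factor of 1e9.
/// For the run above: 2 * 256^3 = 33,554,432 FLOPs / 52,520,000 ns ≈ 0.64.
fn matmulGflops(result: framework.BenchResult, m: u64, k: u64, n: u64) f64 {
    const flops = 2 * m * k * n;
    return @as(f64, @floatFromInt(flops)) / @as(f64, @floatFromInt(result.mean_ns));
}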
Activation Benchmarks
$ zig build bench
...
=== Benchmark: relu.cpu_scalar.forward/1M_f32 ===
Iterations: 200
Mean: 1.78 ms
Median: 1.79 ms
Min: 1.75 ms
Max: 1.84 ms
P99: 1.83 ms
Elements: 1000000, GFLOPS: 0.56
Comparison Benchmarks
Compare against established libraries.
// bench/comparison/vs_numpy.zig
const std = @import("std");

fn measureNumPyMs() !f64 {
    // TODO: run NumPy matmul via a Python subprocess and parse its time
    return error.NotImplemented;
}

fn measureZtorchMs() !f64 {
    // TODO: run the Ztorch matmul and time it
    return error.NotImplemented;
}

pub fn benchmarkVsNumPy() !void {
    // Generate test data, then time both implementations
    const numpy_time = try measureNumPyMs();
    const ztorch_time = try measureZtorchMs();
    // Compare times
    std.debug.print("NumPy: {d:.2} ms\n", .{numpy_time});
    std.debug.print("Ztorch: {d:.2} ms\n", .{ztorch_time});
    std.debug.print("Speedup: {d:.2}x\n", .{numpy_time / ztorch_time});
}
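Getting the NumPy number typically means spawning a Python helper and parsing the time it prints. A hedged sketch, assuming Zig 0.12+ (std.process.Child.run; older releases spell it std.ChildProcess.exec) and a hypothetical scripts/numpy_matmul.py that prints elapsed milliseconds on stdout:
const std = @import("std");

/// Spawns the (hypothetical) scripts/numpy_matmul.py helper and parses the
/// milliseconds it prints on stdout.
fn numpyMatmulMs(allocator: std.mem.Allocator, size: usize) !f64 {
    var buf: [32]u8 = undefined;
    const size_arg = try std.fmt.bufPrint(&buf, "{}", .{size});
    const run = try std.process.Child.run(.{
        .allocator = allocator,
        .argv = &.{ "python3", "scripts/numpy_matmul.py", size_arg },
    });
    defer allocator.free(run.stdout);
    defer allocator.free(run.stderr);
    return std.fmt.parseFloat(f64, std.mem.trim(u8, run.stdout, " \t\r\n"));
}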
Continuous Benchmarking
Regression Detection
Run benchmarks on every commit and compare against baseline.
# Save baseline
zig build bench --save-baseline main.json
# Compare current against baseline
zig build bench --compare main.json
# Output
=== Regression Report ===
MatMul 1024x1024:
Baseline: 12.5 ms
Current: 15.3 ms
Change: +22.4% ✗ REGRESSION
ReLU 1M elements:
Baseline: 2.5 ms
Current: 2.4 ms
Change: -4.0% ✓ Improvement
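The report itself is plain arithmetic over two result sets. A minimal sketch of the check, assuming the files have already been deserialized into name/mean_ns pairs; BenchEntry and the threshold are illustrative:
const std = @import("std");

const BenchEntry = struct { name: []const u8, mean_ns: u64 };

/// Flags every benchmark whose mean time grew by more than threshold_pct
/// relative to the baseline entry of the same name.
fn reportRegressions(baseline: []const BenchEntry, current: []const BenchEntry, threshold_pct: f64) bool {
    var regressed = false;
    for (current) |cur| {
        for (baseline) |base| {
            if (!std.mem.eql(u8, base.name, cur.name)) continue;
            const b = @as(f64, @floatFromInt(base.mean_ns));
            const c = @as(f64, @floatFromInt(cur.mean_ns));
            const change_pct = (c - b) / b * 100.0;
            const flag: []const u8 = if (change_pct > threshold_pct) "✗ REGRESSION" else "✓";
            std.debug.print("{s}: {d:.1}% {s}\n", .{ cur.name, change_pct, flag });
            if (change_pct > threshold_pct) regressed = true;
        }
    }
    return regressed;
}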
CI Integration
# .github/workflows/bench.yml
name: Benchmark

on:
  pull_request:
    branches: [main]

jobs:
  benchmark:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0 # Need history for comparison
      - name: Setup Zig
        uses: goto-bus-stop/setup-zig@v2
      - name: Download baseline
        run: |
          gh run download --name benchmark-baseline --repo ${{ github.repository }}
        env:
          GH_TOKEN: ${{ github.token }}
      - name: Run benchmarks
        run: zig build bench --compare baseline.json --output results.json
      - name: Comment PR
        uses: actions/github-script@v6
        with:
          script: |
            const fs = require('fs');
            const results = JSON.parse(fs.readFileSync('results.json', 'utf8'));
            let comment = '## Benchmark Results\n\n';
            for (const result of results) {
              const change = (result.current - result.baseline) / result.baseline * 100;
              const emoji = change > 5 ? '⚠️' : change < -5 ? '✅' : '➖';
              comment += `${emoji} ${result.name}: ${change.toFixed(1)}%\n`;
            }
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: comment
            });
Performance Targets
Minimum Requirements (v0.1)
| Operation | Size | CPU Scalar | CPU SIMD | CUDA (RTX 4090) |
|---|---|---|---|---|
| MatMul | 1024² | 5 GFLOPS | >10 GFLOPS | >1000 GFLOPS |
| ReLU | 1M | 0.4 GFLOPS | >2 GFLOPS | >10 GFLOPS |
| Softmax | 1M | 0.07 GFLOPS | >0.2 GFLOPS | >5 GFLOPS |
Stretch Goals (v0.2)
- Match or exceed PyTorch performance on CPU
- Reach 50%+ of theoretical peak on GPU
- Sub-millisecond inference for small models
Publishing Results
All benchmark results are published in the repository.
benchmarks/results/
├── cpu-scalar/
│ ├── matmul.json
│ ├── relu.json
│ └── ...
├── cpu-avx2/
│ └── ...
├── cuda-rtx4090/
│ └── ...
└── comparisons/
├── vs-pytorch.md
└── vs-numpy.md
Example: benchmarks/results/cpu-scalar/matmul.json
{
  "operation": "matmul",
  "backend": "cpu-scalar",
  "hardware": "Intel i9-13980HX",
  "date": "2024-11-06",
  "results": [
    {
      "size": [1024, 1024, 1024],
      "iterations": 100,
      "mean_ns": 12500000000,
      "gflops": 0.17
    }
  ]
}
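Mirroring that schema in a Zig struct lets the bench runner write result files and the comparison tooling read them back through std.json with one shared definition. A sketch under that assumption; the type and function names are illustrative:
const std = @import("std");

// Mirrors the JSON layout above so the tooling can read result files back.
const ResultEntry = struct {
    size: [3]u64,
    iterations: u64,
    mean_ns: u64,
    gflops: f64,
};

const ResultFile = struct {
    operation: []const u8,
    backend: []const u8,
    hardware: []const u8,
    date: []const u8,
    results: []ResultEntry,
};

fn loadResults(allocator: std.mem.Allocator, json_text: []const u8) !std.json.Parsed(ResultFile) {
    // Caller releases the parsed tree with .deinit().
    return std.json.parseFromSlice(ResultFile, allocator, json_text, .{});
}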
Tools
# Run all benchmarks
zig build bench
# Run specific operation
zig build bench-matmul
zig build bench-activations
# Compare backends
zig build bench-compare-backends
# Generate report
zig build bench-report --output report.md
# Profile (Linux only)
zig build bench-matmul --profile
# Generates flamegraph.svg