Ztorch Testing Strategy

Comprehensive testing approach following Tiger Style principles.

Testing Philosophy

  1. TDD from day 0 - Write tests before implementation
  2. Test all platforms - Linux, macOS, Windows on x86_64 and aarch64
  3. Reference-based verification - CPU scalar is ground truth
  4. No untested code - Every line exercised by tests
  5. Fail fast - Assertions in code, strict validation in tests (sketched below)
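
For example, an op implementation can assert its preconditions before doing any work. A minimal sketch, assuming the relu_cpu signature used in the tests below (the real implementation may differ):

const std = @import("std");

// Hypothetical implementation; shown only to illustrate fail-fast assertions.
pub fn relu_cpu(input: []const f32, output: []f32) void {
    // Tiger Style: validate every precondition up front.
    std.debug.assert(input.len == output.len);
    std.debug.assert(input.len > 0);

    for (input, output) |x, *y| {
        y.* = @max(x, 0);
    }
}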

Test Categories

1. Unit Tests

Test individual operations with known inputs/outputs.

Location: test/ops/

Example:

// test/ops/matmul_test.zig
const std = @import("std");
const ztorch = @import("ztorch");
const testing = std.testing;

test "matmul: 2x2 identity matrix" {
    // I @ A = A
    const identity = [_]f32{ 1, 0, 0, 1 };
    const matrix = [_]f32{ 5, 6, 7, 8 };
    var result: [4]f32 = undefined;

    ztorch.ops.matmul_cpu(.{2, 2}, &identity, &matrix, &result);

    try testing.expectEqual(@as(f32, 5), result[0]);
    try testing.expectEqual(@as(f32, 6), result[1]);
    try testing.expectEqual(@as(f32, 7), result[2]);
    try testing.expectEqual(@as(f32, 8), result[3]);
}

test "matmul: 2x2 known result" {
    // [[1, 2], [3, 4]] @ [[5, 6], [7, 8]]
    // = [[1*5+2*7, 1*6+2*8], [3*5+4*7, 3*6+4*8]]
    // = [[19, 22], [43, 50]]
    const a = [_]f32{ 1, 2, 3, 4 };
    const b = [_]f32{ 5, 6, 7, 8 };
    var c: [4]f32 = undefined;

    ztorch.ops.matmul_cpu(.{2, 2}, &a, &b, &c);

    const epsilon = 1e-5;
    try testing.expectApproxEqAbs(@as(f32, 19), c[0], epsilon);
    try testing.expectApproxEqAbs(@as(f32, 22), c[1], epsilon);
    try testing.expectApproxEqAbs(@as(f32, 43), c[2], epsilon);
    try testing.expectApproxEqAbs(@as(f32, 50), c[3], epsilon);
}

test "matmul: non-square matrices" {
    // (2, 3) @ (3, 4) = (2, 4)
    const a = [_]f32{ 1, 2, 3, 4, 5, 6 };
    const b = [_]f32{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 };
    var c: [8]f32 = undefined;

    ztorch.ops.matmul_cpu(.{2, 3, 4}, &a, &b, &c);

    // Verify all elements (computed manually)
    const expected = [_]f32{ 38, 44, 50, 56, 83, 98, 113, 128 };
    const epsilon = 1e-5;
    for (expected, c) |exp, act| {
        try testing.expectApproxEqAbs(exp, act, epsilon);
    }
}

test "matmul: large matrix stress test" {
    const allocator = testing.allocator;
    const n = 1024;

    var a = try allocator.alloc(f32, n * n);
    defer allocator.free(a);
    var b = try allocator.alloc(f32, n * n);
    defer allocator.free(b);
    var c = try allocator.alloc(f32, n * n);
    defer allocator.free(c);

    // Fill with known pattern
    for (0..n*n) |i| {
        a[i] = @floatFromInt(i % 100);
        b[i] = @floatFromInt((i * 7) % 100);
    }

    // Should complete without error
    ztorch.ops.matmul_cpu(.{n, n}, a, b, c);

    // Spot check a few values (not full verification)
    try testing.expect(c[0] != 0);
    try testing.expect(!std.math.isNan(c[n*n - 1]));
}

Requirements:

  • At least 3 tests per operation (simple, known result, edge cases)
  • Cover edge cases (zeros, negatives, large values, etc.) - see the sketch below
  • Test various sizes (small, medium, large)
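
An edge-case test might look like this (a sketch; relu_cpu as assumed above, values illustrative):

test "relu: edge cases (zero, negatives, large values)" {
    const input = [_]f32{ 0, -0.0, -1e30, 1e30, -1 };
    var output: [5]f32 = undefined;

    ztorch.ops.relu_cpu(&input, &output);

    try testing.expectEqual(@as(f32, 0), output[0]); // zero stays zero
    try testing.expectEqual(@as(f32, 0), output[1]); // negative zero clamps to zero
    try testing.expectEqual(@as(f32, 0), output[2]); // large negative clamps
    try testing.expectEqual(@as(f32, 1e30), output[3]); // large positive passes through
    try testing.expectEqual(@as(f32, 0), output[4]); // small negative clamps
}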

2. Gradient Check Tests

Verify autograd correctness using numerical differentiation.
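
The reference gradient is the central difference quotient: for each input element,

    grad[i] ≈ (f(x + h·e_i) - f(x - h·e_i)) / (2h)

where e_i is the i-th unit vector and h is a small step (the epsilon parameter in the helper below).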

Location: test/autograd/

Example:

// test/autograd/gradient_check_test.zig
const std = @import("std");
const ztorch = @import("ztorch");
const testing = std.testing;

fn numericalGradient(
    comptime f: anytype,
    x: []f32,
    epsilon: f32,
) ![]f32 {
    const allocator = testing.allocator;
    var grad = try allocator.alloc(f32, x.len);
    errdefer allocator.free(grad); // avoid leaking grad if f() fails below

    for (0..x.len) |i| {
        // f(x + h)
        x[i] += epsilon;
        const f_plus = try f(x);

        // f(x - h)
        x[i] -= 2 * epsilon;
        const f_minus = try f(x);

        // (f(x+h) - f(x-h)) / 2h
        grad[i] = (f_plus - f_minus) / (2 * epsilon);

        // Restore
        x[i] += epsilon;
    }

    return grad;
}

test "matmul: gradient check" {
    const allocator = testing.allocator;

    // Small matrices for numerical stability
    const a = [_]f32{ 1, 2, 3, 4 };
    const b = [_]f32{ 5, 6, 7, 8 };

    // Forward pass
    var c: [4]f32 = undefined;
    ztorch.ops.matmul_cpu(.{2, 2}, &a, &b, &c);

    // Backward pass (autograd)
    const d_c = [_]f32{ 1, 1, 1, 1 }; // dL/dC = 1 for loss L = sum(C)
    var d_a: [4]f32 = undefined;
    var d_b: [4]f32 = undefined;
    ztorch.ops.matmul_backward_cpu(.{2, 2}, &d_c, &a, &b, &d_a, &d_b);

    // Numerical gradient
    var a_copy = a;
    const num_grad_a = try numericalGradient(
        struct {
            fn f(x: []f32) !f32 {
                var tmp: [4]f32 = undefined;
                ztorch.ops.matmul_cpu(.{2, 2}, x, &b, &tmp);
                return tmp[0] + tmp[1] + tmp[2] + tmp[3]; // sum
            }
        }.f,
        &a_copy,
        1e-4,
    );
    defer allocator.free(num_grad_a);

    // Compare autograd vs numerical
    const epsilon = 1e-3; // Numerical gradients are approximate
    for (d_a, num_grad_a) |auto_grad, num_grad| {
        try testing.expectApproxEqAbs(auto_grad, num_grad, epsilon);
    }
}

test "relu: gradient check" {
    const input = [_]f32{ -2, -1, 0, 1, 2 };
    var output: [5]f32 = undefined;

    // Forward
    ztorch.ops.relu_cpu(&input, &output);

    // Backward
    const d_output = [_]f32{ 1, 1, 1, 1, 1 };
    var d_input: [5]f32 = undefined;
    ztorch.ops.relu_backward_cpu(&d_output, &input, &d_input);

    // Expected gradients: ReLU'(x) = x > 0 ? 1 : 0
    const expected = [_]f32{ 0, 0, 0, 1, 1 };

    for (expected, d_input) |exp, act| {
        try testing.expectEqual(exp, act);
    }
}

Requirements:

  • Every differentiable operation must have gradient check
  • Use numerical differentiation as reference
  • Epsilon tolerance based on operation complexity (see the relative-tolerance sketch below)
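
When expected values span several orders of magnitude, a relative tolerance scales better than a fixed absolute one (though it breaks down near zero). std.testing provides expectApproxEqRel; a sketch of picking between the two:

// Sketch: choose the comparison by magnitude of the expected value.
fn expectClose(expected: f32, actual: f32) !void {
    if (@abs(expected) < 1e-6) {
        // Near zero, relative error is meaningless; use absolute.
        try testing.expectApproxEqAbs(expected, actual, 1e-6);
    } else {
        try testing.expectApproxEqRel(expected, actual, 1e-4);
    }
}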

3. Backend Parity Tests

Verify all backends produce identical results.

Location: test/backends/

Example:

// test/backends/parity_test.zig
const std = @import("std");
const ztorch = @import("ztorch");
const testing = std.testing;

test "matmul: cpu scalar vs cpu simd" {
    if (!ztorch.cpu.hasSimd()) return error.SkipZigTest;

    const allocator = testing.allocator;
    const n = 256;

    // Random input
    var a = try allocator.alloc(f32, n * n);
    defer allocator.free(a);
    var b = try allocator.alloc(f32, n * n);
    defer allocator.free(b);

    var rng = std.Random.DefaultPrng.init(42);
    for (0..n*n) |i| {
        a[i] = rng.random().float(f32) * 2 - 1; // [-1, 1]
        b[i] = rng.random().float(f32) * 2 - 1;
    }

    // CPU scalar
    var c_scalar = try allocator.alloc(f32, n * n);
    defer allocator.free(c_scalar);
    ztorch.ops.matmul_cpu_scalar(.{n, n}, a, b, c_scalar);

    // CPU SIMD
    var c_simd = try allocator.alloc(f32, n * n);
    defer allocator.free(c_simd);
    ztorch.ops.matmul_cpu_simd(.{n, n}, a, b, c_simd);

    // Compare
    const epsilon = 1e-4; // Allow small numerical differences
    for (c_scalar, c_simd) |scalar, simd| {
        try testing.expectApproxEqAbs(scalar, simd, epsilon);
    }
}

test "matmul: cpu vs cuda" {
    if (!ztorch.cuda.isAvailable()) return error.SkipZigTest;

    const allocator = testing.allocator;
    const n = 1024;

    // Random input
    var a = try allocator.alloc(f32, n * n);
    defer allocator.free(a);
    var b = try allocator.alloc(f32, n * n);
    defer allocator.free(b);

    var rng = std.Random.DefaultPrng.init(42);
    for (0..n*n) |i| {
        a[i] = rng.random().float(f32) * 2 - 1;
        b[i] = rng.random().float(f32) * 2 - 1;
    }

    // CPU result
    var c_cpu = try allocator.alloc(f32, n * n);
    defer allocator.free(c_cpu);
    ztorch.ops.matmul_cpu(.{n, n}, a, b, c_cpu);

    // CUDA result
    const a_gpu = try ztorch.cuda.allocAndCopy(a);
    defer ztorch.cuda.free(a_gpu);
    const b_gpu = try ztorch.cuda.allocAndCopy(b);
    defer ztorch.cuda.free(b_gpu);
    const c_gpu = try ztorch.cuda.alloc(n * n * @sizeOf(f32));
    defer ztorch.cuda.free(c_gpu);

    try ztorch.ops.matmul_cuda(.{n, n}, a_gpu, b_gpu, c_gpu);

    var c_cuda = try allocator.alloc(f32, n * n);
    defer allocator.free(c_cuda);
    try ztorch.cuda.copyToHost(c_gpu, c_cuda);

    // Compare
    const epsilon = 1e-3; // GPU may have slightly different rounding
    var max_diff: f32 = 0;
    for (c_cpu, c_cuda) |cpu, cuda| {
        const diff = @abs(cpu - cuda);
        max_diff = @max(max_diff, diff);
        try testing.expectApproxEqAbs(cpu, cuda, epsilon);
    }

    std.debug.print("Max difference: {d:.6}\n", .{max_diff});
}

Requirements:

  • Test all backend combinations
  • Use random inputs to catch edge cases
  • Report maximum difference for debugging (a helper for this is sketched below)
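
A small comparison helper keeps parity tests uniform and always reports the worst-case difference. A sketch (hypothetical helper, not part of ztorch):

fn expectSlicesClose(expected: []const f32, actual: []const f32, epsilon: f32) !void {
    std.debug.assert(expected.len == actual.len);
    var max_diff: f32 = 0;
    for (expected, actual) |e, a| {
        max_diff = @max(max_diff, @abs(e - a));
    }
    std.debug.print("max difference: {d:.6}\n", .{max_diff});
    if (max_diff > epsilon) return error.BackendMismatch;
}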

4. Integration Tests

Test complete workflows (model training, inference).

Location: test/integration/

Example:

// test/integration/mnist_test.zig
const std = @import("std");
const ztorch = @import("ztorch");
const testing = std.testing;

test "integration: train simple MLP on synthetic data" {
    const allocator = testing.allocator;

    // Define model
    const Model = ztorch.Sequential(.{
        ztorch.Linear(10, 20),
        ztorch.ReLU(),
        ztorch.Linear(20, 2),
    });

    var model = try Model.compile(.cpu, allocator);
    defer model.deinit();

    // Synthetic data (linearly separable)
    const batch_size = 32;
    var input = try ztorch.Tensor.zeros(.{batch_size, 10}, .f32, .cpu);
    defer input.deinit();
    var labels = try ztorch.Tensor.zeros(.{batch_size}, .i32, .cpu);
    defer labels.deinit();

    // Fill with pattern
    for (0..batch_size) |i| {
        const label: i32 = if (i < batch_size / 2) 0 else 1;
        labels.data[i] = label;

        for (0..10) |j| {
            input.data[i * 10 + j] = if (label == 0) -1.0 else 1.0;
        }
    }

    // Train for a few steps
    var initial_loss: f32 = 0;
    var final_loss: f32 = 0;

    for (0..100) |step| {
        const output = try model.forward(input);
        defer output.deinit();

        const loss = try ztorch.crossEntropy(output, labels);
        defer loss.deinit();

        if (step == 0) initial_loss = loss.item();
        if (step == 99) final_loss = loss.item();

        try model.backward(loss);
        try model.step(.{ .sgd = .{ .lr = 0.01 } });
    }

    // Loss should decrease
    try testing.expect(final_loss < initial_loss);

    // Should converge to low loss on this simple problem
    try testing.expect(final_loss < 0.1);
}

test "integration: save and load model" {
    // TODO: Implement serialization
    return error.SkipZigTest;
}

Requirements:

  • Test complete training loops
  • Verify loss decreases
  • Test inference (see the sketch below)
  • Test model serialization (future)
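
A standalone inference test might take this shape (a sketch; the Sequential/forward API is assumed from the training example above, and the shape field is hypothetical):

test "integration: inference produces correctly shaped output" {
    const allocator = testing.allocator;

    const Model = ztorch.Sequential(.{
        ztorch.Linear(10, 2),
    });
    var model = try Model.compile(.cpu, allocator);
    defer model.deinit();

    var input = try ztorch.Tensor.zeros(.{1, 10}, .f32, .cpu);
    defer input.deinit();

    // Forward only: no backward() call, no optimizer step.
    const output = try model.forward(input);
    defer output.deinit();

    try testing.expectEqual(@as(usize, 1), output.shape[0]);
    try testing.expectEqual(@as(usize, 2), output.shape[1]);
}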

5. Property-Based Tests

Test properties that should hold for all inputs.

Example:

test "property: matmul associativity" {
    // (A @ B) @ C = A @ (B @ C)
    const allocator = testing.allocator;

    var rng = std.Random.DefaultPrng.init(42);

    // Small matrices for speed
    const n = 16;
    var a = try allocator.alloc(f32, n * n);
    defer allocator.free(a);
    var b = try allocator.alloc(f32, n * n);
    defer allocator.free(b);
    var c = try allocator.alloc(f32, n * n);
    defer allocator.free(c);

    // Random values
    for (0..n*n) |i| {
        a[i] = rng.random().float(f32);
        b[i] = rng.random().float(f32);
        c[i] = rng.random().float(f32);
    }

    // (A @ B) @ C
    var ab = try allocator.alloc(f32, n * n);
    defer allocator.free(ab);
    ztorch.ops.matmul_cpu(.{n, n}, a, b, ab);
    var abc_left = try allocator.alloc(f32, n * n);
    defer allocator.free(abc_left);
    ztorch.ops.matmul_cpu(.{n, n}, ab, c, abc_left);

    // A @ (B @ C)
    var bc = try allocator.alloc(f32, n * n);
    defer allocator.free(bc);
    ztorch.ops.matmul_cpu(.{n, n}, b, c, bc);
    var abc_right = try allocator.alloc(f32, n * n);
    defer allocator.free(abc_right);
    ztorch.ops.matmul_cpu(.{n, n}, a, bc, abc_right);

    // Should be approximately equal
    const epsilon = 1e-3; // Numerical error accumulates
    for (abc_left, abc_right) |left, right| {
        try testing.expectApproxEqAbs(left, right, epsilon);
    }
}

test "property: softmax sums to 1" {
    const allocator = testing.allocator;
    var rng = std.Random.DefaultPrng.init(42);

    const n = 100;
    var input = try allocator.alloc(f32, n);
    defer allocator.free(input);
    var output = try allocator.alloc(f32, n);
    defer allocator.free(output);

    for (0..10) |_| {
        // Random input
        for (0..n) |i| {
            input[i] = rng.random().float(f32) * 20 - 10; // [-10, 10]
        }

        ztorch.ops.softmax_cpu(input, output);

        // Sum should be 1
        var sum: f32 = 0;
        for (output) |val| sum += val;

        try testing.expectApproxEqAbs(@as(f32, 1.0), sum, 1e-5);
    }
}
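
Other cheap invariants follow the same pattern; for example, ReLU is idempotent, so applying it twice must change nothing. A sketch:

test "property: relu is idempotent" {
    var rng = std.Random.DefaultPrng.init(42);

    var input: [64]f32 = undefined;
    var once: [64]f32 = undefined;
    var twice: [64]f32 = undefined;

    for (&input) |*x| x.* = rng.random().float(f32) * 20 - 10;

    ztorch.ops.relu_cpu(&input, &once);
    ztorch.ops.relu_cpu(&once, &twice);

    // relu(relu(x)) == relu(x), exactly (no rounding involved).
    for (once, twice) |a, b| {
        try testing.expectEqual(a, b);
    }
}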

CI Configuration

.github/workflows/ci.yml:

name: CI

on:
  push:
    branches: [main, dev]
  pull_request:
    branches: [main]

jobs:
  test:
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
        arch: [x86_64, aarch64]
        exclude:
          - os: windows-latest
            arch: aarch64

    runs-on: ${{ matrix.os }}

    steps:
      - uses: actions/checkout@v4

      - name: Setup Zig
        uses: goto-bus-stop/setup-zig@v2
        with:
          version: master

      - name: Check formatting
        run: zig fmt --check .

      - name: Build
        run: zig build --summary all

      - name: Run unit tests
        run: zig build test --summary all

      - name: Run integration tests
        run: zig build test-integration --summary all

      - name: Run benchmarks (smoke test)
        run: zig build bench --summary all
        env:
          BENCH_ITERATIONS: 10 # Quick smoke test

  test-cuda:
    runs-on: ubuntu-latest
    # Requires self-hosted runner with GPU
    # if: github.event_name == 'push'

    steps:
      - uses: actions/checkout@v4

      - name: Setup Zig
        uses: goto-bus-stop/setup-zig@v2
        with:
          version: master

      - name: Setup CUDA
        uses: Jimver/cuda-toolkit@v0.2.11
        with:
          cuda: "12.1.0"

      - name: Run CUDA tests
        run: zig build test-cuda --summary all

      - name: Run backend parity tests
        run: zig build test-parity --summary all

  coverage:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - name: Setup Zig
        uses: goto-bus-stop/setup-zig@v2
        with:
          version: master

      - name: Run tests with coverage
        run: zig build test -Dcoverage

      - name: Upload coverage
        uses: codecov/codecov-action@v3
        with:
          files: ./zig-out/coverage.txt

Test Execution

# Run all tests
zig build test

# Run specific test file
zig build test -- test/ops/matmul_test.zig

# Run with verbose output
zig build test --summary all

# Run benchmarks
zig build bench

# Run only fast tests (no GPU, no integration)
zig build test-fast

# Run GPU tests only
zig build test-cuda

# Run backend parity tests
zig build test-parity
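
The named steps above (test-fast, test-cuda, test-parity) would be declared in build.zig; a minimal sketch, assuming a conventional layout (not the actual build script):

// build.zig (sketch)
const std = @import("std");

pub fn build(b: *std.Build) void {
    const target = b.standardTargetOptions(.{});
    const optimize = b.standardOptimizeOption(.{});

    const fast_tests = b.addTest(.{
        .root_source_file = b.path("test/ops/matmul_test.zig"),
        .target = target,
        .optimize = optimize,
    });

    const test_fast = b.step("test-fast", "Run fast tests (no GPU, no integration)");
    test_fast.dependOn(&b.addRunArtifact(fast_tests).step);

    // test-cuda and test-parity are wired up the same way,
    // pointing at their respective test roots.
}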

Test Coverage Requirements

  • Minimum: 90% line coverage (one way to measure this is sketched below)
  • Target: 95%+ line coverage
  • Every public API must be tested
  • Every backend implementation must be tested
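
Zig has no built-in source-coverage instrumentation at the time of writing, so the -Dcoverage option in CI implies external tooling. One common approach is to build the test binary without running it, then execute it under kcov (a sketch, assuming kcov is installed and tests live under src/):

# Build the test binary only, then collect line coverage with kcov.
zig test src/main.zig --test-no-exec -femit-bin=zig-out/bin/tests
kcov --include-path=src zig-out/coverage zig-out/bin/tests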

Continuous Benchmarking

Track performance over time to catch regressions.

# Run benchmarks and save results
zig build bench --output bench-results.json

# Compare against baseline
zig build bench-compare --baseline main

Example output:

=== Benchmark Comparison ===
MatMul 1024x1024:
  main:    12.3 ms (baseline)
  current: 11.8 ms (4.1% faster ✓)

ReLU 1M elements:
  main:    0.5 ms (baseline)
  current: 0.6 ms (20% slower ✗)  <-- REGRESSION!

Test Writing Guidelines

  1. Name tests clearly
   test "matmul: 2x2 identity matrix"  // ✓ Good
   test "test1"                        // ✗ Bad
  2. One concept per test

    • Test identity matrix separately from known result
    • Makes failures easier to diagnose
  3. Use descriptive assertions

   try testing.expectEqual(@as(f32, 19), result[0]);  // ✓ Shows expected
   try testing.expect(result[0] == 19);               // ✗ Less clear
  4. Clean up resources
   var tensor = try Tensor.zeros(.{100}, .f32, .cpu);
   defer tensor.deinit();  // Always cleanup
  5. Document complex setups
   // Testing matmul with non-square matrices
   // A: (2, 3), B: (3, 4), Expected C: (2, 4)

Performance Testing

See benchmarking.md for full details.