Ztorch Testing Strategy
Comprehensive testing approach following Tiger Style principles.
Testing Philosophy
- TDD from day 0 - Write tests before implementation
- Test all platforms - Linux, macOS, Windows on x86_64 and aarch64
- Reference-based verification - CPU scalar is ground truth
- No untested code - Every line exercised by tests
- Fail fast - Assertions in code, strict validation in tests
Test Categories
1. Unit Tests
Test individual operations with known inputs/outputs.
Location: test/ops/
Example:
// test/ops/matmul_test.zig
const std = @import("std");
const ztorch = @import("ztorch");
const testing = std.testing;
test "matmul: 2x2 identity matrix" {
// I @ A = A
const identity = [_]f32{ 1, 0, 0, 1 };
const matrix = [_]f32{ 5, 6, 7, 8 };
var result: [4]f32 = undefined;
ztorch.ops.matmul_cpu(.{2, 2}, &identity, &matrix, &result);
try testing.expectEqual(@as(f32, 5), result[0]);
try testing.expectEqual(@as(f32, 6), result[1]);
try testing.expectEqual(@as(f32, 7), result[2]);
try testing.expectEqual(@as(f32, 8), result[3]);
}
test "matmul: 2x2 known result" {
// [[1, 2], [3, 4]] @ [[5, 6], [7, 8]]
// = [[1*5+2*7, 1*6+2*8], [3*5+4*7, 3*6+4*8]]
// = [[19, 22], [43, 50]]
const a = [_]f32{ 1, 2, 3, 4 };
const b = [_]f32{ 5, 6, 7, 8 };
var c: [4]f32 = undefined;
ztorch.ops.matmul_cpu(.{2, 2}, &a, &b, &c);
const epsilon = 1e-5;
try testing.expectApproxEqAbs(@as(f32, 19), c[0], epsilon);
try testing.expectApproxEqAbs(@as(f32, 22), c[1], epsilon);
try testing.expectApproxEqAbs(@as(f32, 43), c[2], epsilon);
try testing.expectApproxEqAbs(@as(f32, 50), c[3], epsilon);
}
test "matmul: non-square matrices" {
// (2, 3) @ (3, 4) = (2, 4)
const a = [_]f32{ 1, 2, 3, 4, 5, 6 };
const b = [_]f32{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 };
var c: [8]f32 = undefined;
ztorch.ops.matmul_cpu(.{2, 3, 4}, &a, &b, &c);
// Verify all elements (computed manually)
const expected = [_]f32{ 38, 44, 50, 56, 83, 98, 113, 128 };
const epsilon = 1e-5;
for (expected, c) |exp, act| {
try testing.expectApproxEqAbs(exp, act, epsilon);
}
}
test "matmul: large matrix stress test" {
const allocator = testing.allocator;
const n = 1024;
var a = try allocator.alloc(f32, n * n);
defer allocator.free(a);
var b = try allocator.alloc(f32, n * n);
defer allocator.free(b);
var c = try allocator.alloc(f32, n * n);
defer allocator.free(c);
// Fill with known pattern
for (0..n*n) |i| {
a[i] = @floatFromInt(i % 100);
b[i] = @floatFromInt((i * 7) % 100);
}
// Should complete without error
ztorch.ops.matmul_cpu(.{n, n}, a, b, c);
// Spot check a few values (not full verification)
try testing.expect(c[0] != 0);
try testing.expect(!std.math.isNan(c[n*n - 1]));
}
Requirements:
- At least 3 tests per operation (simple, known result, edge cases)
- Cover edge cases (zeros, negatives, large values, etc.)
- Test various sizes (small, medium, large)
2. Gradient Check Tests
Verify autograd correctness using numerical differentiation.
Location: test/autograd/
Example:
// test/autograd/gradient_check_test.zig
const std = @import("std");
const ztorch = @import("ztorch");
const testing = std.testing;
fn numericalGradient(
comptime f: anytype,
x: []f32,
epsilon: f32,
) ![]f32 {
const allocator = testing.allocator;
var grad = try allocator.alloc(f32, x.len);
for (0..x.len) |i| {
// f(x + h)
x[i] += epsilon;
const f_plus = try f(x);
// f(x - h)
x[i] -= 2 * epsilon;
const f_minus = try f(x);
// (f(x+h) - f(x-h)) / 2h
grad[i] = (f_plus - f_minus) / (2 * epsilon);
// Restore
x[i] += epsilon;
}
return grad;
}
test "matmul: gradient check" {
const allocator = testing.allocator;
// Small matrices for numerical stability
const a = [_]f32{ 1, 2, 3, 4 };
const b = [_]f32{ 5, 6, 7, 8 };
// Forward pass
var c: [4]f32 = undefined;
ztorch.ops.matmul_cpu(.{2, 2}, &a, &b, &c);
// Backward pass (autograd)
const d_c = [_]f32{ 1, 1, 1, 1 }; // Gradient of loss
var d_a: [4]f32 = undefined;
var d_b: [4]f32 = undefined;
ztorch.ops.matmul_backward_cpu(.{2, 2}, &d_c, &a, &b, &d_a, &d_b);
// Numerical gradient
var a_copy = a;
const num_grad_a = try numericalGradient(
struct {
fn f(x: []f32) !f32 {
var tmp: [4]f32 = undefined;
ztorch.ops.matmul_cpu(.{2, 2}, x, &b, &tmp);
return tmp[0] + tmp[1] + tmp[2] + tmp[3]; // sum
}
}.f,
&a_copy,
1e-4,
);
defer allocator.free(num_grad_a);
// Compare autograd vs numerical
const epsilon = 1e-3; // Numerical gradients are approximate
for (d_a, num_grad_a) |auto_grad, num_grad| {
try testing.expectApproxEqAbs(auto_grad, num_grad, epsilon);
}
}
test "relu: gradient check" {
const input = [_]f32{ -2, -1, 0, 1, 2 };
var output: [5]f32 = undefined;
// Forward
ztorch.ops.relu_cpu(&input, &output);
// Backward
const d_output = [_]f32{ 1, 1, 1, 1, 1 };
var d_input: [5]f32 = undefined;
ztorch.ops.relu_backward_cpu(&d_output, &input, &d_input);
// Expected gradients: ReLU'(x) = x > 0 ? 1 : 0
const expected = [_]f32{ 0, 0, 0, 1, 1 };
for (expected, d_input) |exp, act| {
try testing.expectEqual(exp, act);
}
}
Requirements:
- Every differentiable operation must have gradient check
- Use numerical differentiation as reference
- Epsilon tolerance based on operation complexity
3. Backend Parity Tests
Verify all backends produce identical results.
Location: test/backends/
Example:
// test/backends/parity_test.zig
const std = @import("std");
const ztorch = @import("ztorch");
const testing = std.testing;
test "matmul: cpu scalar vs cpu simd" {
if (!ztorch.cpu.hasSimd()) return error.SkipZigTest;
const allocator = testing.allocator;
const n = 256;
// Random input
var a = try allocator.alloc(f32, n * n);
defer allocator.free(a);
var b = try allocator.alloc(f32, n * n);
defer allocator.free(b);
var rng = std.rand.DefaultPrng.init(42);
for (0..n*n) |i| {
a[i] = rng.random().float(f32) * 2 - 1; // [-1, 1]
b[i] = rng.random().float(f32) * 2 - 1;
}
// CPU scalar
var c_scalar = try allocator.alloc(f32, n * n);
defer allocator.free(c_scalar);
ztorch.ops.matmul_cpu_scalar(.{n, n}, a, b, c_scalar);
// CPU SIMD
var c_simd = try allocator.alloc(f32, n * n);
defer allocator.free(c_simd);
ztorch.ops.matmul_cpu_simd(.{n, n}, a, b, c_simd);
// Compare
const epsilon = 1e-4; // Allow small numerical differences
for (c_scalar, c_simd) |scalar, simd| {
try testing.expectApproxEqAbs(scalar, simd, epsilon);
}
}
test "matmul: cpu vs cuda" {
if (!ztorch.cuda.isAvailable()) return error.SkipZigTest;
const allocator = testing.allocator;
const n = 1024;
// Random input
var a = try allocator.alloc(f32, n * n);
defer allocator.free(a);
var b = try allocator.alloc(f32, n * n);
defer allocator.free(b);
var rng = std.rand.DefaultPrng.init(42);
for (0..n*n) |i| {
a[i] = rng.random().float(f32) * 2 - 1;
b[i] = rng.random().float(f32) * 2 - 1;
}
// CPU result
var c_cpu = try allocator.alloc(f32, n * n);
defer allocator.free(c_cpu);
ztorch.ops.matmul_cpu(.{n, n}, a, b, c_cpu);
// CUDA result
const a_gpu = try ztorch.cuda.allocAndCopy(a);
defer ztorch.cuda.free(a_gpu);
const b_gpu = try ztorch.cuda.allocAndCopy(b);
defer ztorch.cuda.free(b_gpu);
const c_gpu = try ztorch.cuda.alloc(n * n * @sizeOf(f32));
defer ztorch.cuda.free(c_gpu);
try ztorch.ops.matmul_cuda(.{n, n}, a_gpu, b_gpu, c_gpu);
var c_cuda = try allocator.alloc(f32, n * n);
defer allocator.free(c_cuda);
try ztorch.cuda.copyToHost(c_gpu, c_cuda);
// Compare
const epsilon = 1e-3; // GPU may have slightly different rounding
var max_diff: f32 = 0;
for (c_cpu, c_cuda) |cpu, cuda| {
const diff = @abs(cpu - cuda);
max_diff = @max(max_diff, diff);
try testing.expectApproxEqAbs(cpu, cuda, epsilon);
}
std.debug.print("Max difference: {d:.6}\n", .{max_diff});
}
Requirements:
- Test all backend combinations
- Use seeded random inputs — broad value coverage with reproducible failures
- Report maximum difference for debugging
4. Integration Tests
Test complete workflows (model training, inference).
Location: test/integration/
Example:
// test/integration/mnist_test.zig
const std = @import("std");
const ztorch = @import("ztorch");
const testing = std.testing;
test "integration: train simple MLP on synthetic data" {
const allocator = testing.allocator;
// Define model
const Model = ztorch.Sequential(.{
ztorch.Linear(10, 20),
ztorch.ReLU(),
ztorch.Linear(20, 2),
});
var model = try Model.compile(.cpu, allocator);
defer model.deinit();
// Synthetic data (linearly separable)
const batch_size = 32;
var input = try ztorch.Tensor.zeros(.{batch_size, 10}, .f32, .cpu);
defer input.deinit();
var labels = try ztorch.Tensor.zeros(.{batch_size}, .i32, .cpu);
defer labels.deinit();
// Fill with pattern
for (0..batch_size) |i| {
const label: i32 = if (i < batch_size / 2) 0 else 1;
labels.data[i] = label;
for (0..10) |j| {
input.data[i * 10 + j] = if (label == 0) -1.0 else 1.0;
}
}
// Train for a few steps
var initial_loss: f32 = 0;
var final_loss: f32 = 0;
for (0..100) |step| {
const output = try model.forward(input);
defer output.deinit();
const loss = try ztorch.crossEntropy(output, labels);
defer loss.deinit();
if (step == 0) initial_loss = loss.item();
if (step == 99) final_loss = loss.item();
try model.backward(loss);
try model.step(.{ .sgd = .{ .lr = 0.01 } });
}
// Loss should decrease
try testing.expect(final_loss < initial_loss);
// Should converge to low loss on this simple problem
try testing.expect(final_loss < 0.1);
}
test "integration: save and load model" {
// TODO: Implement serialization
return error.SkipZigTest;
}
Requirements:
- Test complete training loops
- Verify loss decreases
- Test inference
- Test model serialization (future)
5. Property-Based Tests
Test properties that should hold for all inputs.
Example:
test "property: matmul associativity" {
// (A @ B) @ C = A @ (B @ C)
const allocator = testing.allocator;
var rng = std.rand.DefaultPrng.init(42);
// Small matrices for speed
const n = 16;
var a = try allocator.alloc(f32, n * n);
defer allocator.free(a);
var b = try allocator.alloc(f32, n * n);
defer allocator.free(b);
var c = try allocator.alloc(f32, n * n);
defer allocator.free(c);
// Random values
for (0..n*n) |i| {
a[i] = rng.random().float(f32);
b[i] = rng.random().float(f32);
c[i] = rng.random().float(f32);
}
// (A @ B) @ C
var ab = try allocator.alloc(f32, n * n);
defer allocator.free(ab);
ztorch.ops.matmul_cpu(.{n, n}, a, b, ab);
var abc_left = try allocator.alloc(f32, n * n);
defer allocator.free(abc_left);
ztorch.ops.matmul_cpu(.{n, n}, ab, c, abc_left);
// A @ (B @ C)
var bc = try allocator.alloc(f32, n * n);
defer allocator.free(bc);
ztorch.ops.matmul_cpu(.{n, n}, b, c, bc);
var abc_right = try allocator.alloc(f32, n * n);
defer allocator.free(abc_right);
ztorch.ops.matmul_cpu(.{n, n}, a, bc, abc_right);
// Should be approximately equal
const epsilon = 1e-3; // Numerical error accumulates
for (abc_left, abc_right) |left, right| {
try testing.expectApproxEqAbs(left, right, epsilon);
}
}
test "property: softmax sums to 1" {
const allocator = testing.allocator;
var rng = std.rand.DefaultPrng.init(42);
const n = 100;
var input = try allocator.alloc(f32, n);
defer allocator.free(input);
var output = try allocator.alloc(f32, n);
defer allocator.free(output);
for (0..10) |_| {
// Random input
for (0..n) |i| {
input[i] = rng.random().float(f32) * 20 - 10; // [-10, 10]
}
ztorch.ops.softmax_cpu(input, output);
// Sum should be 1
var sum: f32 = 0;
for (output) |val| sum += val;
try testing.expectApproxEqAbs(@as(f32, 1.0), sum, 1e-5);
}
}
CI Configuration
.github/workflows/ci.yml:
name: CI
on:
push:
branches: [main, dev]
pull_request:
branches: [main]
jobs:
test:
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
arch: [x86_64, aarch64]
exclude:
- os: windows-latest
arch: aarch64
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
- name: Setup Zig
uses: goto-bus-stop/setup-zig@v2
with:
version: master
- name: Check formatting
run: zig fmt --check .
- name: Build
run: zig build --summary all
- name: Run unit tests
run: zig build test --summary all
- name: Run integration tests
run: zig build test-integration --summary all
- name: Run benchmarks (smoke test)
run: zig build bench --summary all
env:
BENCH_ITERATIONS: 10 # Quick smoke test
test-cuda:
runs-on: [self-hosted, gpu]
# GPU tests require a self-hosted runner with an NVIDIA GPU;
# GitHub-hosted ubuntu-latest runners have no GPU.
# if: github.event_name == 'push'
steps:
- uses: actions/checkout@v4
- name: Setup Zig
uses: goto-bus-stop/setup-zig@v2
with:
version: master
- name: Setup CUDA
uses: Jimver/cuda-toolkit@v0.2.11
with:
cuda: "12.1.0"
- name: Run CUDA tests
run: zig build test-cuda --summary all
- name: Run backend parity tests
run: zig build test-parity --summary all
coverage:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Setup Zig
uses: goto-bus-stop/setup-zig@v2
with:
version: master
- name: Run tests with coverage
run: zig build test -Dcoverage
- name: Upload coverage
uses: codecov/codecov-action@v3
with:
files: ./zig-out/coverage.txt
Test Execution
# Run all tests
zig build test
# Run specific test file
zig build test -- test/ops/matmul_test.zig
# Run with verbose output
zig build test --summary all
# Run benchmarks
zig build bench
# Run only fast tests (no GPU, no integration)
zig build test-fast
# Run GPU tests only
zig build test-cuda
# Run backend parity tests
zig build test-parity
Test Coverage Requirements
- Minimum: 90% line coverage
- Target: 95%+ line coverage
- Every public API must be tested
- Every backend implementation must be tested
Continuous Benchmarking
Track performance over time to catch regressions.
# Run benchmarks and save results
zig build bench --output bench-results.json
# Compare against baseline
zig build bench-compare --baseline main
Example output:
=== Benchmark Comparison ===
MatMul 1024x1024:
main: 12.3 ms (baseline)
current: 11.8 ms (4.1% faster ✓)
ReLU 1M elements:
main: 0.5 ms (baseline)
current: 0.6 ms (20% slower ✗) <-- REGRESSION!
Test Writing Guidelines
- Name tests clearly
test "matmul: 2x2 identity matrix" // ✓ Good
test "test1" // ✗ Bad
- One concept per test
- Test identity matrix separately from known result
- Makes failures easier to diagnose
- Use descriptive assertions
try testing.expectEqual(@as(f32, 19), result[0]); // ✓ Shows expected
try testing.expect(result[0] == 19); // ✗ Less clear
- Clean up resources
var tensor = try Tensor.zeros(.{100}, .f32, .cpu);
defer tensor.deinit(); // Always cleanup
- Document complex setups
// Testing matmul with non-square matrices
// A: (2, 3), B: (3, 4), Expected C: (2, 4)
Performance Testing
See benchmarking.md for full details on performance testing.