Ztorch Testing Strategy
Comprehensive testing approach following Tiger Style principles.
Testing Philosophy
- TDD from day 0 - Write tests before implementation
- Test all platforms - Linux, macOS, Windows on x86_64 and aarch64
- Reference-based verification - CPU scalar is ground truth
- No untested code - Every line exercised by tests
- Fail fast - Assertions in code, strict validation in tests
Test Categories
1. Unit Tests
Test individual operations with known inputs/outputs.
Location: test/ops/
Example:
// test/ops/matmul_test.zig
const std = @import("std");
const ztorch = @import("ztorch");
const testing = std.testing;
test "matmul: 2x2 identity matrix" {
// I @ A = A
const identity = [_]f32{ 1, 0, 0, 1 };
const matrix = [_]f32{ 5, 6, 7, 8 };
var result: [4]f32 = undefined;
ztorch.ops.matmul_cpu(.{2, 2}, &identity, &matrix, &result);
try testing.expectEqual(@as(f32, 5), result[0]);
try testing.expectEqual(@as(f32, 6), result[1]);
try testing.expectEqual(@as(f32, 7), result[2]);
try testing.expectEqual(@as(f32, 8), result[3]);
}
test "matmul: 2x2 known result" {
// [[1, 2], [3, 4]] @ [[5, 6], [7, 8]]
// = [[1*5+2*7, 1*6+2*8], [3*5+4*7, 3*6+4*8]]
// = [[19, 22], [43, 50]]
const a = [_]f32{ 1, 2, 3, 4 };
const b = [_]f32{ 5, 6, 7, 8 };
var c: [4]f32 = undefined;
ztorch.ops.matmul_cpu(.{2, 2}, &a, &b, &c);
const epsilon = 1e-5;
try testing.expectApproxEqAbs(@as(f32, 19), c[0], epsilon);
try testing.expectApproxEqAbs(@as(f32, 22), c[1], epsilon);
try testing.expectApproxEqAbs(@as(f32, 43), c[2], epsilon);
try testing.expectApproxEqAbs(@as(f32, 50), c[3], epsilon);
}
test "matmul: non-square matrices" {
// (2, 3) @ (3, 4) = (2, 4)
const a = [_]f32{ 1, 2, 3, 4, 5, 6 };
const b = [_]f32{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 };
var c: [8]f32 = undefined;
ztorch.ops.matmul_cpu(.{2, 3, 4}, &a, &b, &c);
// Verify all elements (computed manually)
const expected = [_]f32{ 38, 44, 50, 56, 83, 98, 113, 128 };
const epsilon = 1e-5;
for (expected, c) |exp, act| {
try testing.expectApproxEqAbs(exp, act, epsilon);
}
}
test "matmul: large matrix stress test" {
const allocator = testing.allocator;
const n = 1024;
var a = try allocator.alloc(f32, n * n);
defer allocator.free(a);
var b = try allocator.alloc(f32, n * n);
defer allocator.free(b);
var c = try allocator.alloc(f32, n * n);
defer allocator.free(c);
// Fill with known pattern
for (0..n*n) |i| {
a[i] = @floatFromInt(i % 100);
b[i] = @floatFromInt((i * 7) % 100);
}
// Should complete without error
ztorch.ops.matmul_cpu(.{n, n}, a, b, c);
// Spot check a few values (not full verification)
try testing.expect(c[0] != 0);
try testing.expect(!std.math.isNan(c[n*n - 1]));
}
Requirements:
- At least 3 tests per operation (simple, known result, edge cases)
- Cover edge cases (zeros, negatives, large values, etc.)
- Test various sizes (small, medium, large)
2. Gradient Check Tests
Verify autograd correctness using numerical differentiation.
Location: test/autograd/
Example:
// test/autograd/gradient_check_test.zig
const std = @import("std");
const ztorch = @import("ztorch");
const testing = std.testing;
fn numericalGradient(
comptime f: anytype,
x: []f32,
epsilon: f32,
) ![]f32 {
const allocator = testing.allocator;
var grad = try allocator.alloc(f32, x.len);
for (0..x.len) |i| {
// f(x + h)
x[i] += epsilon;
const f_plus = try f(x);
// f(x - h)
x[i] -= 2 * epsilon;
const f_minus = try f(x);
// (f(x+h) - f(x-h)) / 2h
grad[i] = (f_plus - f_minus) / (2 * epsilon);
// Restore
x[i] += epsilon;
}
return grad;
}
test "matmul: gradient check" {
const allocator = testing.allocator;
// Small matrices for numerical stability
const a = [_]f32{ 1, 2, 3, 4 };
const b = [_]f32{ 5, 6, 7, 8 };
// Forward pass
var c: [4]f32 = undefined;
ztorch.ops.matmul_cpu(.{2, 2}, &a, &b, &c);
// Backward pass (autograd)
const d_c = [_]f32{ 1, 1, 1, 1 }; // Gradient of loss
var d_a: [4]f32 = undefined;
var d_b: [4]f32 = undefined;
ztorch.ops.matmul_backward_cpu(.{2, 2}, &d_c, &a, &b, &d_a, &d_b);
// Numerical gradient
var a_copy = a;
const num_grad_a = try numericalGradient(
struct {
fn f(x: []f32) !f32 {
var tmp: [4]f32 = undefined;
ztorch.ops.matmul_cpu(.{2, 2}, x, &b, &tmp);
return tmp[0] + tmp[1] + tmp[2] + tmp[3]; // sum
}
}.f,
&a_copy,
1e-4,
);
defer allocator.free(num_grad_a);
// Compare autograd vs numerical
const epsilon = 1e-3; // Numerical gradients are approximate
for (d_a, num_grad_a) |auto_grad, num_grad| {
try testing.expectApproxEqAbs(auto_grad, num_grad, epsilon);
}
}
test "relu: gradient check" {
const input = [_]f32{ -2, -1, 0, 1, 2 };
var output: [5]f32 = undefined;
// Forward
ztorch.ops.relu_cpu(&input, &output);
// Backward
const d_output = [_]f32{ 1, 1, 1, 1, 1 };
var d_input: [5]f32 = undefined;
ztorch.ops.relu_backward_cpu(&d_output, &input, &d_input);
// Expected gradients: ReLU'(x) = x > 0 ? 1 : 0
const expected = [_]f32{ 0, 0, 0, 1, 1 };
for (expected, d_input) |exp, act| {
try testing.expectEqual(exp, act);
}
}
Requirements:
- Every differentiable operation must have gradient check
- Use numerical differentiation as reference
- Epsilon tolerance based on operation complexity
3. Backend Parity Tests
Verify all backends produce identical results.
Location: test/backends/
Example:
// test/backends/parity_test.zig
const std = @import("std");
const ztorch = @import("ztorch");
const testing = std.testing;
test "matmul: cpu scalar vs cpu simd" {
if (!ztorch.cpu.hasSimd()) return error.SkipZigTest;
const allocator = testing.allocator;
const n = 256;
// Random input
var a = try allocator.alloc(f32, n * n);
defer allocator.free(a);
var b = try allocator.alloc(f32, n * n);
defer allocator.free(b);
var rng = std.rand.DefaultPrng.init(42);
for (0..n*n) |i| {
a[i] = rng.random().float(f32) * 2 - 1; // [-1, 1]
b[i] = rng.random().float(f32) * 2 - 1;
}
// CPU scalar
var c_scalar = try allocator.alloc(f32, n * n);
defer allocator.free(c_scalar);
ztorch.ops.matmul_cpu_scalar(.{n, n}, a, b, c_scalar);
// CPU SIMD
var c_simd = try allocator.alloc(f32, n * n);
defer allocator.free(c_simd);
ztorch.ops.matmul_cpu_simd(.{n, n}, a, b, c_simd);
// Compare
const epsilon = 1e-4; // Allow small numerical differences
for (c_scalar, c_simd) |scalar, simd| {
try testing.expectApproxEqAbs(scalar, simd, epsilon);
}
}
test "matmul: cpu vs cuda" {
if (!ztorch.cuda.isAvailable()) return error.SkipZigTest;
const allocator = testing.allocator;
const n = 1024;
// Random input
var a = try allocator.alloc(f32, n * n);
defer allocator.free(a);
var b = try allocator.alloc(f32, n * n);
defer allocator.free(b);
var rng = std.rand.DefaultPrng.init(42);
for (0..n*n) |i| {
a[i] = rng.random().float(f32) * 2 - 1;
b[i] = rng.random().float(f32) * 2 - 1;
}
// CPU result
var c_cpu = try allocator.alloc(f32, n * n);
defer allocator.free(c_cpu);
ztorch.ops.matmul_cpu(.{n, n}, a, b, c_cpu);
// CUDA result
const a_gpu = try ztorch.cuda.allocAndCopy(a);
defer ztorch.cuda.free(a_gpu);
const b_gpu = try ztorch.cuda.allocAndCopy(b);
defer ztorch.cuda.free(b_gpu);
const c_gpu = try ztorch.cuda.alloc(n * n * @sizeOf(f32));
defer ztorch.cuda.free(c_gpu);
try ztorch.ops.matmul_cuda(.{n, n}, a_gpu, b_gpu, c_gpu);
var c_cuda = try allocator.alloc(f32, n * n);
defer allocator.free(c_cuda);
try ztorch.cuda.copyToHost(c_gpu, c_cuda);
// Compare
const epsilon = 1e-3; // GPU may have slightly different rounding
var max_diff: f32 = 0;
for (c_cpu, c_cuda) |cpu, cuda| {
const diff = @abs(cpu - cuda);
max_diff = @max(max_diff, diff);
try testing.expectApproxEqAbs(cpu, cuda, epsilon);
}
std.debug.print("Max difference: {d:.6}\n", .{max_diff});
}
Requirements:
- Test all backend combinations
- Use seeded random inputs — broad value coverage with reproducible failures
- Report maximum difference for debugging
4. Integration Tests
Test complete workflows (model training, inference).
Location: test/integration/
Example:
// test/integration/mnist_test.zig
const std = @import("std");
const ztorch = @import("ztorch");
const testing = std.testing;
test "integration: train simple MLP on synthetic data" {
const allocator = testing.allocator;
// Define model
const Model = ztorch.Sequential(.{
ztorch.Linear(10, 20),
ztorch.ReLU(),
ztorch.Linear(20, 2),
});
var model = try Model.compile(.cpu, allocator);
defer model.deinit();
// Synthetic data (linearly separable)
const batch_size = 32;
var input = try ztorch.Tensor.zeros(.{batch_size, 10}, .f32, .cpu);
defer input.deinit();
var labels = try ztorch.Tensor.zeros(.{batch_size}, .i32, .cpu);
defer labels.deinit();
// Fill with pattern
for (0..batch_size) |i| {
const label: i32 = if (i < batch_size / 2) 0 else 1;
labels.data[i] = label;
for (0..10) |j| {
input.data[i * 10 + j] = if (label == 0) -1.0 else 1.0;
}
}
// Train for a few steps
var initial_loss: f32 = 0;
var final_loss: f32 = 0;
for (0..100) |step| {
const output = try model.forward(input);
defer output.deinit();
const loss = try ztorch.crossEntropy(output, labels);
defer loss.deinit();
if (step == 0) initial_loss = loss.item();
if (step == 99) final_loss = loss.item();
try model.backward(loss);
try model.step(.{ .sgd = .{ .lr = 0.01 } });
}
// Loss should decrease
try testing.expect(final_loss < initial_loss);
// Should converge to low loss on this simple problem
try testing.expect(final_loss < 0.1);
}
test "integration: save and load model" {
// TODO: Implement serialization
return error.SkipZigTest;
}
Requirements:
- Test complete training loops
- Verify loss decreases
- Test inference
- Test model serialization (future)
5. Property-Based Tests
Test properties that should hold for all inputs.
Example:
test "property: matmul associativity" {
// (A @ B) @ C = A @ (B @ C)
const allocator = testing.allocator;
var rng = std.rand.DefaultPrng.init(42);
// Small matrices for speed
const n = 16;
var a = try allocator.alloc(f32, n * n);
defer allocator.free(a);
var b = try allocator.alloc(f32, n * n);
defer allocator.free(b);
var c = try allocator.alloc(f32, n * n);
defer allocator.free(c);
// Random values
for (0..n*n) |i| {
a[i] = rng.random().float(f32);
b[i] = rng.random().float(f32);
c[i] = rng.random().float(f32);
}
// (A @ B) @ C
var ab = try allocator.alloc(f32, n * n);
defer allocator.free(ab);
ztorch.ops.matmul_cpu(.{n, n}, a, b, ab);
var abc_left = try allocator.alloc(f32, n * n);
defer allocator.free(abc_left);
ztorch.ops.matmul_cpu(.{n, n}, ab, c, abc_left);
// A @ (B @ C)
var bc = try allocator.alloc(f32, n * n);
defer allocator.free(bc);
ztorch.ops.matmul_cpu(.{n, n}, b, c, bc);
var abc_right = try allocator.alloc(f32, n * n);
defer allocator.free(abc_right);
ztorch.ops.matmul_cpu(.{n, n}, a, bc, abc_right);
// Should be approximately equal
const epsilon = 1e-3; // Numerical error accumulates
for (abc_left, abc_right) |left, right| {
try testing.expectApproxEqAbs(left, right, epsilon);
}
}
test "property: softmax sums to 1" {
const allocator = testing.allocator;
var rng = std.rand.DefaultPrng.init(42);
const n = 100;
var input = try allocator.alloc(f32, n);
defer allocator.free(input);
var output = try allocator.alloc(f32, n);
defer allocator.free(output);
for (0..10) |_| {
// Random input
for (0..n) |i| {
input[i] = rng.random().float(f32) * 20 - 10; // [-10, 10]
}
ztorch.ops.softmax_cpu(input, output);
// Sum should be 1
var sum: f32 = 0;
for (output) |val| sum += val;
try testing.expectApproxEqAbs(@as(f32, 1.0), sum, 1e-5);
}
}
CI Configuration
.github/workflows/ci.yml:
name: CI
on:
push:
branches: [main, dev]
pull_request:
branches: [main]
jobs:
test:
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
arch: [x86_64, aarch64]
exclude:
- os: windows-latest
arch: aarch64
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
- name: Setup Zig
uses: goto-bus-stop/setup-zig@v2
with:
version: master
- name: Check formatting
run: zig fmt --check .
- name: Build
run: zig build --summary all
- name: Run unit tests
run: zig build test --summary all
- name: Run integration tests
run: zig build test-integration --summary all
- name: Run benchmarks (smoke test)
run: zig build bench --summary all
env:
BENCH_ITERATIONS: 10 # Quick smoke test
test-cuda:
runs-on: [self-hosted, gpu]
# GPU tests require a self-hosted runner with an NVIDIA GPU;
# GitHub-hosted ubuntu-latest runners have no GPU.
# if: github.event_name == 'push'
steps:
- uses: actions/checkout@v4
- name: Setup Zig
uses: goto-bus-stop/setup-zig@v2
with:
version: master
- name: Setup CUDA
uses: Jimver/cuda-toolkit@v0.2.11
with:
cuda: "12.1.0"
- name: Run CUDA tests
run: zig build test-cuda --summary all
- name: Run backend parity tests
run: zig build test-parity --summary all
coverage:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Setup Zig
uses: goto-bus-stop/setup-zig@v2
with:
version: master
- name: Run tests with coverage
run: zig build test -Dcoverage
- name: Upload coverage
uses: codecov/codecov-action@v3
with:
files: ./zig-out/coverage.txt
Test Execution
# Run all tests
zig build test
# Run specific test file
zig build test -- test/ops/matmul_test.zig
# Run with verbose output
zig build test --summary all
# Run benchmarks
zig build bench
# Run only fast tests (no GPU, no integration)
zig build test-fast
# Run GPU tests only
zig build test-cuda
# Run backend parity tests
zig build test-parity
Test Coverage Requirements
- Minimum: 90% line coverage
- Target: 95%+ line coverage
- Every public API must be tested
- Every backend implementation must be tested
Continuous Benchmarking
Track performance over time to catch regressions.
# Run benchmarks and save results
zig build bench --output bench-results.json
# Compare against baseline
zig build bench-compare --baseline main
Example output:
=== Benchmark Comparison ===
MatMul 1024x1024:
main: 12.3 ms (baseline)
current: 11.8 ms (4.1% faster ✓)
ReLU 1M elements:
main: 0.5 ms (baseline)
current: 0.6 ms (20% slower ✗) <-- REGRESSION!
Test Writing Guidelines
- Name tests clearly
test "matmul: 2x2 identity matrix" // ✓ Good
test "test1" // ✗ Bad
- One concept per test
- Test identity matrix separately from known result
- Makes failures easier to diagnose
- Use descriptive assertions
try testing.expectEqual(@as(f32, 19), result[0]); // ✓ Shows expected
try testing.expect(result[0] == 19); // ✗ Less clear
- Clean up resources
var tensor = try Tensor.zeros(.{100}, .f32, .cpu);
defer tensor.deinit(); // Always cleanup
- Document complex setups
// Testing matmul with non-square matrices
// A: (2, 3), B: (3, 4), Expected C: (2, 4)
Performance Testing
See benchmarking.md for full details on performance testing.