Tiger Style for Ztorch

Ztorch follows the Tiger Style development philosophy, adapted from TigerBeetle.

Core Principles

1. Safety First

No Undefined Behavior

// ✓ Good: Explicit size
const n: u32 = 1024;

// ✗ Bad: Architecture-dependent
const n: usize = 1024;

Fixed Limits

// ✓ Good: Bounded
const MAX_DIMS = 8;
for (0..@min(ndim, MAX_DIMS)) |i| { ... }

// ✗ Bad: Unbounded
while (has_more_dims()) { ... }

Static Allocation

// ✓ Good: Allocate at init
pub fn init(allocator: Allocator) !Model {
    const weights = try allocator.alloc(f32, WEIGHT_SIZE);
    return Model{ .weights = weights };
}

// ✗ Bad: Dynamic allocation in hot path
pub fn forward(self: *Model) !Tensor {
    const temp = try self.allocator.alloc(f32, runtime_size); // ✗
    // ...
}

Explicit Error Handling

// ✓ Good: Handle all errors
const result = matmul(a, b) catch |err| {
    log.err("MatMul failed: {}", .{err});
    return err;
};

// ✗ Bad: Ignore errors
const result = matmul(a, b) catch unreachable; // ✗

2. Performance

Napkin Math First

// Before implementing, calculate:
// MatMul (1024, 1024, 1024):
//   FLOPs: 2 * 1024^3 = 2.1 GFLOPs
//   Memory: 12 MB
//   Expected time: ~50µs on RTX 4090
//
// Then implement and measure actual vs expected.
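
As a sanity check on that estimate: an RTX 4090 peaks at roughly 82 TFLOPS fp32, so 2.1 GFLOPs / 82 TFLOPS ≈ 26µs at peak; the ~50µs figure assumes about 50% efficiency, a reasonable target for a well-tuned kernel.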

Batch Operations

// ✓ Good: Process multiple items
pub fn forward_batch(inputs: []Tensor) ![]Tensor {
    // Single kernel launch for all inputs
}

// ✗ Bad: One at a time
for (inputs) |input| {
    _ = try forward(input); // Multiple kernel launches
}

Optimize Resources in Order

  1. Network (if distributed)
  2. Disk (if I/O bound)
  3. Memory (usually the bottleneck for ML)
  4. CPU/GPU (optimize last)
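
A quick arithmetic-intensity estimate explains this ordering; the numbers below are illustrative napkin math, not measurements:

// Arithmetic intensity = FLOPs / bytes moved.
// MatMul (1024, 1024, 1024): 2.1 GFLOPs / 12 MB ≈ 170 FLOPs/byte → compute-bound.
// ReLU on 1M floats: 1 MFLOP / 8 MB = 0.125 FLOPs/byte → memory-bound.
// Many ops in an ML graph look like ReLU, not like a large MatMul,
// which is why memory traffic is usually the first thing worth optimizing.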

3. Developer Experience

Clear Naming

// ✓ Good: Descriptive
pub fn matmul_cpu_scalar(a: Tensor, b: Tensor) Tensor { ... }
const latency_ms_max: f32 = 100.0;

// ✗ Bad: Abbreviated
pub fn mm(a: T, b: T) T { ... }
const max_lat: f32 = 100.0;

Document the Why

// ✓ Good: Explains reason
// We subtract max before exp() to prevent overflow.
// For input [1000, 1001], exp() would overflow, but
// exp([0, 1]) / sum(exp([0, 1])) gives the same result.
const max_val = max(input);
for (input, output) |val, *out| {
    out.* = @exp(val - max_val);
}

// ✗ Bad: Just describes what
// Subtract max
const max_val = max(input);

Organize Logically

// ✓ Good: Grouped by domain
src/
├── ops/           # Operations
│   ├── matmul.zig
│   ├── relu.zig
│   └── softmax.zig
├── backends/      # Backend implementations
│   ├── cpu.zig
│   └── cuda.zig
└── ir/            # Internal representation

// ✗ Bad: Mixed concerns
src/
├── stuff.zig
├── more_stuff.zig
└── utils.zig

Specific Guidelines for Ztorch

Tensor Operations

Always Check Shapes

pub fn matmul(a: Tensor, b: Tensor) !Tensor {
    // Tiger Style: Validate inputs
    if (a.shape.dims[a.shape.ndim - 1] != b.shape.dims[0]) {
        return error.ShapeMismatch;
    }
    // ...
}
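
A usage sketch as a unit test; Tensor.zeros follows the call shape used later in this guide, and the exact API is illustrative:

test "matmul: shape mismatch" {
    const a = try Tensor.zeros(.{ 2, 3 }, .f32, .cpu);
    defer a.deinit();
    const b = try Tensor.zeros(.{ 4, 5 }, .f32, .cpu);
    defer b.deinit();
    // Inner dimensions disagree (3 vs 4), so matmul must fail loudly.
    try std.testing.expectError(error.ShapeMismatch, matmul(a, b));
}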

Use Comptime When Possible

// ✓ Good: Shapes known at compile time
const Model = ztorch.Sequential(.{
    ztorch.Linear(784, 128),  // comptime validation
    ztorch.ReLU(),
    ztorch.Linear(128, 10),   // 128 matches!
});

// ✗ Bad: Shapes only checked at runtime
var model = Model.init();
model.addLayer(Linear.init(784, 128));
model.addLayer(ReLU.init());
model.addLayer(Linear.init(64, 10));  // Oops, 64 != 128!
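
A minimal sketch of how the comptime validation can be wired up, ignoring shape-preserving layers like ReLU for brevity; IN/OUT are illustrative names, not Ztorch's actual API:

pub fn Linear(comptime in_features: u32, comptime out_features: u32) type {
    return struct {
        pub const IN = in_features;
        pub const OUT = out_features;
        // weights, bias, forward() ...
    };
}

pub fn Sequential(comptime layers: anytype) type {
    comptime {
        // Any dimension mismatch fails the build, not the training run.
        var i: usize = 0;
        while (i + 1 < layers.len) : (i += 1) {
            if (layers[i].OUT != layers[i + 1].IN) {
                @compileError("adjacent layer dimensions do not match");
            }
        }
    }
    return struct {
        // chained forward() ...
    };
}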

Backend Implementation

Reference Implementation First

// Step 1: CPU scalar (obviously correct)
pub fn relu_cpu_scalar(input: []f32, output: []f32) void {
    for (input, output) |in, *out| {
        out.* = @max(0, in);
    }
}

// Step 2: Test thoroughly
test "relu: cpu scalar" { ... }

// Step 3: Optimize (SIMD)
pub fn relu_cpu_simd(input: []f32, output: []f32) void {
    // ...
}

// Step 4: Verify against reference
test "relu: cpu simd vs scalar" {
    try expectEqualSlices(f32, scalar_result, simd_result);
}
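
A minimal sketch of what Step 3 could look like using @Vector; the vector width and tail handling are illustrative:

pub fn relu_cpu_simd(input: []f32, output: []f32) void {
    const V = 8; // vector width; tune for the target CPU
    const Vec = @Vector(V, f32);
    const zero: Vec = @splat(0.0);
    var i: usize = 0;
    while (i + V <= input.len) : (i += V) {
        const v: Vec = input[i..][0..V].*;
        output[i..][0..V].* = @max(zero, v);
    }
    while (i < input.len) : (i += 1) { // scalar tail
        output[i] = @max(0, input[i]);
    }
}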

Prove GPU Optimizations

# Before optimization
MatMul 1024x1024: 100µs (21 TFLOPS)

# After tiling optimization
MatMul 1024x1024: 25µs (84 TFLOPS)  # 4x speedup ✓

# Document in code:
// Tiled implementation achieves 84 TFLOPS vs 21 TFLOPS naive (4x).
// Uses 32x32 tiles to maximize shared memory reuse.
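
A small helper sketch for producing those throughput numbers from a timed run (the name and signature are illustrative, not part of Ztorch):

fn tflops(flop_count: f64, elapsed_ns: u64) f64 {
    const seconds = @as(f64, @floatFromInt(elapsed_ns)) / 1e9;
    return flop_count / seconds / 1e12;
}

// tflops(2.0 * 1024 * 1024 * 1024, 100_000) ≈ 21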

Error Handling

Fail Fast on Programmer Errors

pub fn forward(self: *Model, input: Tensor) !Tensor {
    // Programmer error: wrong input shape
    if (input.shape.dims[1] != self.input_size) {
        // Tiger Style: This is a bug, not a runtime error
        std.debug.panic(
            "Input shape mismatch: expected {}, got {}",
            .{self.input_size, input.shape.dims[1]}
        );
    }

    // Runtime error: GPU out of memory
    const output = self.backend.alloc(output_size) catch |err| {
        // This can happen, return error
        return err;
    };

    return output;
}

Memory Management

Explicit Lifetimes

// ✓ Good: Clear ownership
pub fn forward(self: *Model, input: Tensor) !Tensor {
    const output = try Tensor.zeros(.{32, 10}, .f32, .cpu);
    // Caller owns output, caller must free
    return output;
}

pub fn example() !void {
    const output = try model.forward(input);
    defer output.deinit();  // Explicit cleanup
}

// ✗ Bad: Unclear ownership
pub fn forward(self: *Model, input: Tensor) !Tensor {
    // Who frees this? Model? Caller?
    const output = try self.allocator.create(Tensor);
    // ...
}

Anti-Patterns to Avoid

Magic Numbers

// ✗ Bad
if (size > 1024) { ... }

// ✓ Good
const MAX_BATCH_SIZE = 1024;
if (size > MAX_BATCH_SIZE) { ... }

Premature Abstraction

// ✗ Bad: Over-engineered
pub const BackendFactory = struct {
    pub fn create(comptime T: type) Backend(T) { ... }
};

// ✓ Good: Simple
pub fn createCpuBackend() Backend { ... }
pub fn createCudaBackend() Backend { ... }

Hidden Allocations

// ✗ Bad: Surprise allocation
pub fn concat(a: Tensor, b: Tensor) Tensor {
    const result = Tensor.alloc(...);  // Hidden!
    // ...
}

// ✓ Good: Explicit
pub fn concat(allocator: Allocator, a: Tensor, b: Tensor) !Tensor {
    const result = try Tensor.alloc(allocator, ...);
    // ...
}

Ignoring Errors

// ✗ Bad
const result = riskyOperation() catch unreachable;

// ✓ Good
const result = riskyOperation() catch |err| {
    log.err("Operation failed: {}", .{err});
    return err;
};

Checklist for PRs

Before submitting, verify:

  • All tests pass on all platforms
  • Code formatted with zig fmt
  • No undefined behavior (checked with assertions; see the sketch after this list)
  • Fixed resource limits where applicable
  • Error handling is explicit
  • Napkin math documented for performance code
  • Benchmarks prove optimizations
  • Public APIs documented
  • Complex logic has comments explaining "why"
  • No magic numbers
  • Memory ownership is clear
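
For the assertion item above, a minimal sketch of the pattern using std.debug.assert:

const assert = std.debug.assert;

pub fn relu_cpu_scalar(input: []f32, output: []f32) void {
    // Precondition: mismatched lengths are a programmer error, so crash early.
    assert(input.len == output.len);
    for (input, output) |in, *out| {
        out.* = @max(0, in);
    }
}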

Questions?

If you're unsure whether something follows Tiger Style, ask in a PR or issue. We're here to help!