# Tiger Style for Ztorch

Ztorch follows the Tiger Style development philosophy, adapted from TigerBeetle.
## Core Principles

### 1. Safety First
#### No Undefined Behavior

```zig
// ✓ Good: Explicit size
const n: u32 = 1024;

// ✗ Bad: Architecture-dependent
const n: usize = 1024;
```
#### Fixed Limits

```zig
// ✓ Good: Bounded
const MAX_DIMS = 8;
for (0..@min(ndim, MAX_DIMS)) |i| { ... }

// ✗ Bad: Unbounded
while (has_more_dims()) { ... }
```
#### Static Allocation

```zig
// ✓ Good: Allocate at init
pub fn init(allocator: Allocator) !Model {
    const weights = try allocator.alloc(f32, WEIGHT_SIZE);
    return Model{ .weights = weights };
}

// ✗ Bad: Dynamic allocation in hot path
pub fn forward(self: *Model) !Tensor {
    const temp = try self.allocator.alloc(f32, runtime_size); // ✗
    // ...
}
```
#### Explicit Error Handling

```zig
// ✓ Good: Handle all errors
const result = matmul(a, b) catch |err| {
    log.err("MatMul failed: {}", .{err});
    return err;
};

// ✗ Bad: Ignore errors
const result = matmul(a, b) catch unreachable; // ✗
```
### 2. Performance

#### Napkin Math First

```zig
// Before implementing, calculate:
// MatMul (1024, 1024, 1024):
//   FLOPs: 2 * 1024^3 ≈ 2.1 GFLOPs
//   Memory: 3 matrices * 1024^2 * 4 bytes = 12 MB
//   Expected time: ~50µs on an RTX 4090
//
// Then implement and measure actual vs. expected.
```
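To make the comparison concrete, wrap the kernel in a timer and convert the reading into the same units as the estimate. A minimal sketch (it reuses the `relu_cpu_scalar` reference kernel shown later under Backend Implementation; the buffer size and the bandwidth framing are illustrative assumptions):

```zig
const std = @import("std");

test "relu: measured vs napkin estimate" {
    const n = 1 << 20; // 1M elements => 8 MB of memory traffic
    const allocator = std.testing.allocator;
    const input = try allocator.alloc(f32, n);
    defer allocator.free(input);
    const output = try allocator.alloc(f32, n);
    defer allocator.free(output);
    @memset(input, 1.0);

    var timer = try std.time.Timer.start();
    relu_cpu_scalar(input, output);
    const ns = timer.read();

    // Bytes per nanosecond equals GB/s, so the napkin comparison is direct.
    const bytes: f64 = @floatFromInt(2 * n * @sizeOf(f32));
    const gbps = bytes / @as(f64, @floatFromInt(ns));
    std.debug.print("relu {d} elems: {d} ns ({d:.1} GB/s)\n", .{ n, ns, gbps });
}
```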
#### Batch Operations

```zig
// ✓ Good: Process multiple items
pub fn forward_batch(inputs: []Tensor) ![]Tensor {
    // Single kernel launch for all inputs
}

// ✗ Bad: One at a time
for (inputs) |input| {
    _ = try forward(input); // Multiple kernel launches
}
```
#### Optimize Resources in Order

1. Network (if distributed)
2. Disk (if I/O bound)
3. Memory (usually the bottleneck for ML; see the napkin sketch below)
4. CPU/GPU (optimize last)
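As a rough illustration of why memory usually dominates, compare compute time with memory time for an elementwise op (all numbers here are illustrative napkin math, not measurements):

```zig
// ReLU over 1M f32 elements:
//   Compute: ~1M comparisons
//   Memory:  4 MB read + 4 MB write = 8 MB of traffic
//   At ~100 GFLOPS compute and ~20 GB/s effective bandwidth:
//     compute ≈ 10µs, memory ≈ 400µs
//   => memory-bound by roughly 40x; optimizing the arithmetic first
//      would be wasted effort.
```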
### 3. Developer Experience

#### Clear Naming

```zig
// ✓ Good: Descriptive
pub fn matmul_cpu_scalar(a: Tensor, b: Tensor) Tensor { ... }
const latency_ms_max: f32 = 100.0;

// ✗ Bad: Abbreviated
pub fn mm(a: T, b: T) T { ... }
const max_lat: f32 = 100.0;
```
#### Document the Why

```zig
// ✓ Good: Explains reason
// We subtract max before exp() to prevent overflow.
// For input [1000, 1001], exp() would overflow, but
// exp([0, 1]) / sum(exp([0, 1])) gives the same result.
const max_val = max(input);
for (input, output) |val, *out| {
    out.* = @exp(val - max_val);
}

// ✗ Bad: Just describes what
// Subtract max
const max_val = max(input);
```
#### Organize Logically

```
// ✓ Good: Grouped by domain
src/
├── ops/        # Operations
│   ├── matmul.zig
│   ├── relu.zig
│   └── softmax.zig
├── backends/   # Backend implementations
│   ├── cpu.zig
│   └── cuda.zig
└── ir/         # Internal representation

// ✗ Bad: Mixed concerns
src/
├── stuff.zig
├── more_stuff.zig
└── utils.zig
```
## Specific Guidelines for Ztorch

### Tensor Operations
#### Always Check Shapes

```zig
pub fn matmul(a: Tensor, b: Tensor) !Tensor {
    // Tiger Style: Validate inputs
    if (a.shape.dims[a.shape.ndim - 1] != b.shape.dims[0]) {
        return error.ShapeMismatch;
    }
    // ...
}
```
#### Use Comptime When Possible

```zig
// ✓ Good: Shapes known at compile time
const Model = ztorch.Sequential(.{
    ztorch.Linear(784, 128), // comptime validation
    ztorch.ReLU(),
    ztorch.Linear(128, 10), // 128 matches!
});

// ✗ Bad: Shapes only checked at runtime
var model = Model.init();
model.addLayer(Linear.init(784, 128));
model.addLayer(ReLU.init());
model.addLayer(Linear.init(64, 10)); // Oops, 64 != 128!
```
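A rough sketch of how such a comptime check can be written (`LinearSpec` and `validateShapes` are illustrative stand-ins, not Ztorch's actual API):

```zig
// Illustrative stand-in for a layer's shape metadata.
const LinearSpec = struct { in_features: usize, out_features: usize };

// Reject mismatched adjacent layers while the model type is being built.
fn validateShapes(comptime layers: []const LinearSpec) void {
    comptime {
        for (0..layers.len -| 1) |i| {
            if (layers[i].out_features != layers[i + 1].in_features) {
                @compileError("adjacent layer shape mismatch");
            }
        }
    }
}

test "comptime shape validation" {
    // Compiles because 128 matches 128; swapping in 64 would be a
    // compile error, not a runtime crash.
    comptime validateShapes(&.{
        .{ .in_features = 784, .out_features = 128 },
        .{ .in_features = 128, .out_features = 10 },
    });
}
```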
### Backend Implementation

#### Reference Implementation First

```zig
// Step 1: CPU scalar (obviously correct)
pub fn relu_cpu_scalar(input: []f32, output: []f32) void {
    for (input, output) |in, *out| {
        out.* = @max(0, in);
    }
}

// Step 2: Test thoroughly
test "relu: cpu scalar" { ... }

// Step 3: Optimize (SIMD); one possible shape is sketched below
pub fn relu_cpu_simd(input: []f32, output: []f32) void {
    // ...
}

// Step 4: Verify against reference
test "relu: cpu simd vs scalar" {
    try std.testing.expectEqualSlices(f32, scalar_result, simd_result);
}
```
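One possible shape for step 3, using Zig's built-in `@Vector` (a sketch: the lane width of 8 is an assumption to tune per target, and a real kernel would sit behind the same verification test as above):

```zig
pub fn relu_cpu_simd(input: []f32, output: []f32) void {
    const V = @Vector(8, f32);
    const zero: V = @splat(0.0);
    var i: usize = 0;
    // Vectorized main loop: 8 lanes at a time.
    while (i + 8 <= input.len) : (i += 8) {
        const v: V = input[i..][0..8].*;
        output[i..][0..8].* = @max(zero, v);
    }
    // Scalar tail for the leftover elements.
    while (i < input.len) : (i += 1) {
        output[i] = @max(0, input[i]);
    }
}
```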
#### Prove GPU Optimizations

```
# Before optimization
MatMul 1024x1024: 100µs (21 TFLOPS)

# After tiling optimization
MatMul 1024x1024: 25µs (84 TFLOPS)  # 4x speedup ✓
```

Document the result in code:

```zig
// Tiled implementation achieves 84 TFLOPS vs 21 TFLOPS naive (4x).
// Uses 32x32 tiles to maximize shared memory reuse.
```
### Error Handling

#### Fail Fast on Programmer Errors

```zig
pub fn forward(self: *Model, input: Tensor) !Tensor {
    // Programmer error: wrong input shape.
    if (input.shape.dims[1] != self.input_size) {
        // Tiger Style: this is a bug, not a runtime error.
        std.debug.panic(
            "Input shape mismatch: expected {}, got {}",
            .{ self.input_size, input.shape.dims[1] },
        );
    }

    // Runtime error: GPU out of memory.
    const output = self.backend.alloc(output_size) catch |err| {
        // This can happen in practice, so return the error.
        return err;
    };
    return output;
}
```
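For invariants internal to a module, as opposed to validation at an API boundary, `std.debug.assert` keeps the same fail-fast behavior with less ceremony; this is what the PR checklist's "checked with assertions" refers to. A minimal sketch with a hypothetical helper:

```zig
const std = @import("std");

// Hypothetical internal helper; the name and invariant are illustrative.
fn scaleInPlace(values: []f32, factor: f32) void {
    // Internal invariant, not input validation: callers in this module
    // never pass an empty slice, so an empty slice here is a bug.
    std.debug.assert(values.len > 0);
    for (values) |*v| v.* *= factor;
}
```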
### Memory Management

#### Explicit Lifetimes

```zig
// ✓ Good: Clear ownership
pub fn forward(self: *Model, input: Tensor) !Tensor {
    const output = try Tensor.zeros(.{ 32, 10 }, .f32, .cpu);
    // Caller owns output, caller must free.
    return output;
}

pub fn example() !void {
    const output = try model.forward(input);
    defer output.deinit(); // Explicit cleanup
}

// ✗ Bad: Unclear ownership
pub fn forward(self: *Model, input: Tensor) !Tensor {
    // Who frees this? Model? Caller?
    const output = try self.allocator.create(Tensor);
    // ...
}
```
## Anti-Patterns to Avoid

### Magic Numbers

```zig
// ✗ Bad
if (size > 1024) { ... }

// ✓ Good
const MAX_BATCH_SIZE = 1024;
if (size > MAX_BATCH_SIZE) { ... }
```
### Premature Abstraction

```zig
// ✗ Bad: Over-engineered
pub const BackendFactory = struct {
    pub fn create(comptime T: type) Backend(T) { ... }
};

// ✓ Good: Simple
pub fn createCpuBackend() Backend { ... }
pub fn createCudaBackend() Backend { ... }
```
### Hidden Allocations

```zig
// ✗ Bad: Surprise allocation
pub fn concat(a: Tensor, b: Tensor) Tensor {
    const result = Tensor.alloc(...); // Hidden!
    // ...
}

// ✓ Good: Explicit
pub fn concat(allocator: Allocator, a: Tensor, b: Tensor) !Tensor {
    const result = try Tensor.alloc(allocator, ...);
    // ...
}
```
### Ignoring Errors

```zig
// ✗ Bad
const result = riskyOperation() catch unreachable;

// ✓ Good
const result = riskyOperation() catch |err| {
    log.err("Operation failed: {}", .{err});
    return err;
};
```
## Checklist for PRs

Before submitting, verify:

- All tests pass on all platforms
- Code formatted with `zig fmt`
- No undefined behavior (checked with assertions)
- Fixed resource limits where applicable
- Error handling is explicit
- Napkin math documented for performance code
- Benchmarks prove optimizations
- Public APIs documented
- Complex logic has comments explaining "why"
- No magic numbers
- Memory ownership is clear
## Questions?

If you're unsure whether something follows Tiger Style, ask in a PR or issue. We're here to help!