Skip to content

Commit

Permalink
feat: more nested AD rules (#1151)
Browse files Browse the repository at this point in the history
* feat: softmax and logsoftmax jvp rules

* feat: add pooling rules

* test: logsoftmax and softmax forwarddiff rules

* fix: patch meanpool

* test: more tests fixed
  • Loading branch information
avik-pal authored Jan 1, 2025
1 parent 3c3a432 commit 63d3434
Show file tree
Hide file tree
Showing 7 changed files with 158 additions and 29 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ ComponentArrays = "0.15.18"
ConcreteStructs = "0.2.3"
DispatchDoctor = "0.4.12"
Enzyme = "0.13.16"
EnzymeCore = "0.8.6"
EnzymeCore = "0.8.8"
FastClosures = "0.3.2"
Flux = "0.15, 0.16"
ForwardDiff = "0.10.36"
Expand Down
4 changes: 2 additions & 2 deletions lib/LuxLib/Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "LuxLib"
uuid = "82251201-b29d-42c6-8e01-566dec8acb11"
authors = ["Avik Pal <avikpal@mit.edu> and contributors"]
version = "1.3.11"
version = "1.4.0"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
Expand Down Expand Up @@ -66,7 +66,7 @@ Compat = "4.16"
CpuId = "0.3"
DispatchDoctor = "0.4.12"
Enzyme = "0.13.16"
EnzymeCore = "0.8.6"
EnzymeCore = "0.8.8"
FastClosures = "0.3.2"
ForwardDiff = "0.10.36"
Hwloc = "3.2"
Expand Down
14 changes: 12 additions & 2 deletions lib/LuxLib/ext/LuxLibCUDAExt/LuxLibCUDAExt.jl
Original file line number Diff line number Diff line change
@@ -1,12 +1,22 @@
module LuxLibCUDAExt

using CUDA: CUDA, CUBLAS, StridedCuMatrix, StridedCuVector, CuPtr
using CUDA: CUDA, CUBLAS, CuArray, StridedCuMatrix, StridedCuVector, CuPtr
using ForwardDiff: ForwardDiff
using LinearAlgebra: LinearAlgebra, Transpose, Adjoint
using LuxLib: LuxLib, Optional
using LuxLib: LuxLib, Impl, Optional
using LuxLib.Utils: ofeltype_array
using NNlib: NNlib
using Static: True, False

# Hacky Type Piracy for ForwardDiff rules
# Each NNlib entry point is paired with the `Impl` function that handles dual numbers;
# the generated method simply forwards `x` and `dims` to that implementation.
for (nnlib_fn, impl_fn) in ((:logsoftmax, :logsoftmax_dual), (:softmax, :softmax_dual))
    @eval function NNlib.$(nnlib_fn)(
            x::CuArray{<:ForwardDiff.Dual{Tag, T, P}}; dims=1) where {Tag, T, P}
        return Impl.$(impl_fn)(x; dims)
    end
end

# Low level functions
include("cublaslt.jl")

Expand Down
81 changes: 80 additions & 1 deletion lib/LuxLib/src/impl/forward_diff.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
for op in [:conv, :depthwiseconv, :∇conv_data, :∇conv_filter]
for op in (:conv, :depthwiseconv, :∇conv_data, :∇conv_filter)
patched_op = op !== :depthwiseconv ? eval(op) : getfield(NNlib, op)

@eval function NNlib.$(op)(x1::AbstractArray{<:ForwardDiff.Dual{Tag, V, P}, N},
Expand Down Expand Up @@ -48,3 +48,82 @@ for op in [:conv, :depthwiseconv, :∇conv_data, :∇conv_filter]
return ForwardDiff.Dual{Tag, eltype(y), P}.(y, partials)
end
end

# Generate `NNlib.logsoftmax` / `NNlib.softmax` methods for arrays of ForwardDiff duals.
# Each generated method forwards to the matching `Impl.<op>_dual` function.
for (nnlib_fn, impl_fn) in ((:logsoftmax, :logsoftmax_dual), (:softmax, :softmax_dual))
    @eval function NNlib.$(nnlib_fn)(
            x::AbstractArray{<:ForwardDiff.Dual{Tag, T, P}}; dims=1) where {Tag, T, P}
        return Impl.$(impl_fn)(x; dims)
    end
end

# Forward-mode rule for softmax on an array of duals.
#
# The primal is `NNlib.softmax` of the dual values. For each of the `P` partial
# directions `v`, the tangent is `y .* (v .- sum(y .* v; dims))`, where `y` is the primal
# output. The result is re-wrapped as duals carrying the same `Tag` and `P` partials.
function softmax_dual(
        x::AbstractArray{<:ForwardDiff.Dual{Tag, T, P}}; dims=1) where {Tag, T, P}
    primal_of(d) = ForwardDiff.value(Tag, d)
    tangent_of(d, k) = ForwardDiff.partials(Tag, d, k)

    probs = NNlib.softmax(primal_of.(x); dims)

    # Tangent of the output along the k-th partial direction of the input.
    function directional(k)
        v = tangent_of.(x, k)
        return probs .* (v .- sum(probs .* v; dims))
    end
    tangents = ntuple(directional, P)

    # Zip the P tangent arrays elementwise into one `Partials` per output entry.
    packed = ForwardDiff.Partials.(tuple.(tangents...))
    return ForwardDiff.Dual{Tag, eltype(probs), P}.(probs, packed)
end

# Forward-mode rule for logsoftmax on an array of duals.
#
# The primal is `NNlib.logsoftmax` of the dual values. For each of the `P` partial
# directions `v`, the tangent is `v .- sum(softmax(x) .* v; dims)`. The result is
# re-wrapped as duals carrying the same `Tag` and `P` partials.
function logsoftmax_dual(
        x::AbstractArray{<:ForwardDiff.Dual{Tag, T, P}}; dims=1) where {Tag, T, P}
    value_fn(d) = ForwardDiff.value(Tag, d)
    partial_fn(d, i) = ForwardDiff.partials(Tag, d, i)

    x_data = value_fn.(x)

    # The returned primal must be the *log*-softmax; the plain softmax is only needed as
    # the weighting inside the tangent, so recover it from the log-probabilities.
    logy = NNlib.logsoftmax(x_data; dims)
    y = exp.(logy)

    dysᵢ = ntuple(P) do i
        v = partial_fn.(x, i)
        return v .- sum(y .* v; dims)
    end

    # Zip the P tangent arrays elementwise into one `Partials` per output entry.
    partials = ForwardDiff.Partials.(tuple.(dysᵢ...))
    return ForwardDiff.Dual{Tag, eltype(logy), P}.(logy, partials)
end

# Forward-mode rule for `NNlib.meanpool` on an array of duals.
#
# The pooling is applied separately to the dual values and to each of the `P` partial
# components, and the results are recombined into duals with the same `Tag`.
# Defined as a plain method: nothing is interpolated, so no `@eval` is required.
function NNlib.meanpool(
        x::AbstractArray{<:ForwardDiff.Dual{Tag, T, P}}, pdims::NNlib.PoolDims;
        kwargs...) where {Tag, T, P}
    value_fn(d) = ForwardDiff.value(Tag, d)
    partial_fn(d, i) = ForwardDiff.partials(Tag, d, i)

    y = NNlib.meanpool(value_fn.(x), pdims; kwargs...)
    dysᵢ = ntuple(P) do i
        return NNlib.meanpool(partial_fn.(x, i), pdims; kwargs...)
    end

    # Zip the P pooled partial arrays elementwise into one `Partials` per output entry.
    partials = ForwardDiff.Partials.(tuple.(dysᵢ...))
    return ForwardDiff.Dual{Tag, eltype(y), P}.(y, partials)
end

# Forward-mode rule for `NNlib.∇meanpool` when `dy`, `y` and `x` are all arrays of duals
# sharing the same `Tag` and number of partials `P`.
#
# Mean pooling is a linear map (the forward rule for `NNlib.meanpool` on duals pools the
# partials directly), so its pullback is linear in `dy` and does not vary with the values
# of `y` or `x`. The tangent of the result along each partial direction is therefore the
# pullback applied to the corresponding partials of `dy`; the partials of `y` and `x`
# contribute nothing. `y_data` and `x_data` are still forwarded because the underlying
# `∇meanpool` call takes them as arguments.
function NNlib.∇meanpool(
        dy::AbstractArray{<:ForwardDiff.Dual{Tag, T1, P}},
        y::AbstractArray{<:ForwardDiff.Dual{Tag, T1, P}},
        x::AbstractArray{<:ForwardDiff.Dual{Tag, T2, P}},
        pdims::NNlib.PoolDims; kwargs...) where {Tag, T1, T2, P}
    value_fn(d) = ForwardDiff.value(Tag, d)
    partial_fn(d, i) = ForwardDiff.partials(Tag, d, i)

    dy_data, y_data, x_data = value_fn.(dy), value_fn.(y), value_fn.(x)

    dx = NNlib.∇meanpool(dy_data, y_data, x_data, pdims; kwargs...)
    dxsᵢ = ntuple(P) do i
        return NNlib.∇meanpool(partial_fn.(dy, i), y_data, x_data, pdims; kwargs...)
    end

    # Zip the P tangent arrays elementwise into one `Partials` per output entry.
    partials = ForwardDiff.Partials.(tuple.(dxsᵢ...))
    return ForwardDiff.Dual{Tag, eltype(dx), P}.(dx, partials)
end
2 changes: 1 addition & 1 deletion lib/LuxLib/test/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ BenchmarkTools = "1.5"
ChainRulesCore = "1.24"
ComponentArrays = "0.15.18"
Enzyme = "0.13.16"
EnzymeCore = "0.8.6"
EnzymeCore = "0.8.8"
ExplicitImports = "1.9.0"
ForwardDiff = "0.10.36"
Hwloc = "3.2"
Expand Down
58 changes: 54 additions & 4 deletions lib/LuxLib/test/others/forwarddiff_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,20 @@

# Cross-check the ForwardDiff JVP of `f` at `x` along `u` against other JVP paths.
#
# - `ongpu`: when true and `x` is a `ComponentArray`, the concrete ForwardDiff comparison
#   is skipped.
# - `nested`: when true, the Zygote comparison is skipped.
#
# Each comparison lives in its own named testset and is evaluated exactly once.
function test_jvp_computation(f::F, x, u, ongpu, nested=false) where {F}
    jvp₁ = jvp_forwarddiff(f, x, u)

    if !(x isa ComponentArray && ongpu)
        # ComponentArray + ForwardDiff on GPU don't play nice
        @testset "JVP ForwardDiff Concrete" begin
            jvp₂ = jvp_forwarddiff_concrete(f, x, u)
            @test check_approx(jvp₁, jvp₂; atol=1e-5, rtol=1e-5)
        end
    end

    if !nested
        @testset "JVP Zygote" begin
            jvp₃ = jvp_zygote(f, x, u)
            @test check_approx(jvp₁, jvp₃; atol=1e-5, rtol=1e-5)
        end
    end
end

Expand Down Expand Up @@ -89,6 +94,51 @@
true)
end
end

# JVP checks for `logsoftmax` / `softmax` over several input sizes and reduction dims.
@testset for op in (logsoftmax, softmax)
    # Input sizes and the dimension reduced over, paired up positionally.
    input_dims = (
        (2, 3), (2, 3), (2, 3, 4, 5),
        (2, 3, 4, 5), (2, 3, 4, 5), (2, 3, 4, 5)
    )
    reduce_dims = (1, 2, 1, 2, 3, 4)

    @testset for (input_dim, dim) in zip(input_dims, reduce_dims)
        x = randn(Float32, input_dim) |> aType
        u = randn(Float32, input_dim) |> aType

        # First-order JVP on a plain array and on a ComponentArray wrapper.
        test_jvp_computation(z -> op(z; dims=dim), x, u, ongpu)
        test_jvp_computation(
            z -> op(z; dims=dim), ComponentArray(; x), u, ongpu)

        # Nested AD: ForwardDiff over a Zygote gradient.
        test_jvp_computation(
            z -> only(Zygote.gradient(w -> sum(op(w; dims=dim)), z)),
            x, u, ongpu, true
        )
    end
end

# JVP checks for pooling ops over 1D and 2D windows with varying stride and padding.
@testset for op in (meanpool,)
    # Each entry is (input size, kernel size, stride, pad).
    pool_cases = (
        ((8, 3, 2), (4,), (2,), (0,)),
        ((8, 3, 2), (4,), (3,), (0,)),
        ((8, 3, 2), (4,), (3,), (1,)),
        ((8, 8, 3, 2), (4, 4), (2, 2), (0, 0)),
        ((8, 8, 3, 2), (4, 4), (3, 3), (0, 0)),
        ((8, 8, 3, 2), (4, 4), (3, 3), (1, 1))
    )

    @testset for (input_dim, kernel_size, stride, pad) in pool_cases
        x = randn(Float32, input_dim) |> aType
        u = randn(Float32, input_dim) |> aType

        # First-order JVP through the pooling op.
        test_jvp_computation(
            z -> op(z, kernel_size; stride, pad), x, u, ongpu)

        # Nested AD: ForwardDiff over a Zygote gradient.
        test_jvp_computation(
            z -> only(Zygote.gradient(
                w -> sum(op(w, kernel_size; stride, pad)), z)),
            x, u, ongpu, true
        )
    end
end
end
end

Expand Down
26 changes: 8 additions & 18 deletions test/layers/normalize_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,6 @@ end
@jet __f(z)
end

broken_backends = VERSION v"1.11-" ? Any[AutoEnzyme()] : []

@testset "Conv" begin
c = Conv((3, 3), 3 => 3; init_bias=Lux.ones32)

Expand All @@ -165,35 +163,31 @@ end
x = randn(rng, Float32, 3, 3, 3, 1) |> aType

@jet wn(x, ps, st)
@test_gradients(sumabs2first, wn, x, ps, st; atol=1.0f-3, rtol=1.0f-3,
broken_backends)
@test_gradients(sumabs2first, wn, x, ps, st; atol=1.0f-3, rtol=1.0f-3)

wn = WeightNorm(c, (:weight,))
display(wn)
ps, st = Lux.setup(rng, wn) |> dev
x = randn(rng, Float32, 3, 3, 3, 1) |> aType

@jet wn(x, ps, st)
@test_gradients(sumabs2first, wn, x, ps, st; atol=1.0f-3, rtol=1.0f-3,
broken_backends)
@test_gradients(sumabs2first, wn, x, ps, st; atol=1.0f-3, rtol=1.0f-3)

wn = WeightNorm(c, (:weight, :bias), (2, 2))
display(wn)
ps, st = Lux.setup(rng, wn) |> dev
x = randn(rng, Float32, 3, 3, 3, 1) |> aType

@jet wn(x, ps, st)
@test_gradients(sumabs2first, wn, x, ps, st; atol=1.0f-3, rtol=1.0f-3,
broken_backends)
@test_gradients(sumabs2first, wn, x, ps, st; atol=1.0f-3, rtol=1.0f-3)

wn = WeightNorm(c, (:weight,), (2,))
display(wn)
ps, st = Lux.setup(rng, wn) |> dev
x = randn(rng, Float32, 3, 3, 3, 1) |> aType

@jet wn(x, ps, st)
@test_gradients(sumabs2first, wn, x, ps, st; atol=1.0f-3, rtol=1.0f-3,
broken_backends)
@test_gradients(sumabs2first, wn, x, ps, st; atol=1.0f-3, rtol=1.0f-3)
end

@testset "Dense" begin
Expand All @@ -205,35 +199,31 @@ end
x = randn(rng, Float32, 3, 1) |> aType

@jet wn(x, ps, st)
@test_gradients(sumabs2first, wn, x, ps, st; atol=1.0f-3, rtol=1.0f-3,
broken_backends)
@test_gradients(sumabs2first, wn, x, ps, st; atol=1.0f-3, rtol=1.0f-3)

wn = WeightNorm(d, (:weight,))
display(wn)
ps, st = Lux.setup(rng, wn) |> dev
x = randn(rng, Float32, 3, 1) |> aType

@jet wn(x, ps, st)
@test_gradients(sumabs2first, wn, x, ps, st; atol=1.0f-3, rtol=1.0f-3,
broken_backends)
@test_gradients(sumabs2first, wn, x, ps, st; atol=1.0f-3, rtol=1.0f-3)

wn = WeightNorm(d, (:weight, :bias), (2, 2))
display(wn)
ps, st = Lux.setup(rng, wn) |> dev
x = randn(rng, Float32, 3, 1) |> aType

@jet wn(x, ps, st)
@test_gradients(sumabs2first, wn, x, ps, st; atol=1.0f-3, rtol=1.0f-3,
broken_backends)
@test_gradients(sumabs2first, wn, x, ps, st; atol=1.0f-3, rtol=1.0f-3)

wn = WeightNorm(d, (:weight,), (2,))
display(wn)
ps, st = Lux.setup(rng, wn) |> dev
x = randn(rng, Float32, 3, 1) |> aType

@jet wn(x, ps, st)
@test_gradients(sumabs2first, wn, x, ps, st; atol=1.0f-3, rtol=1.0f-3,
broken_backends)
@test_gradients(sumabs2first, wn, x, ps, st; atol=1.0f-3, rtol=1.0f-3)
end

# See https://github.com/LuxDL/Lux.jl/issues/95
Expand Down

3 comments on commit 63d3434

@avik-pal
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register subdir=lib/LuxLib

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/122233

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a LuxLib-v1.4.0 -m "<description of version>" 63d343452fb184fb54f49dd7ebc02e4adb396f18
git push origin LuxLib-v1.4.0

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lux Benchmarks

Benchmark suite Current: 63d3434 Previous: ac2879b Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4083.5 ns 3625 ns 1.13
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 4042 ns 4541 ns 0.89
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 4917 ns 5125 ns 0.96
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3833 ns 3791 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 59941 ns 61743 ns 0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 11250 ns 10125 ns 1.11
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10500 ns 10875 ns 0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 11541 ns 10334 ns 1.12
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10958 ns 10417 ns 1.05
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 421187 ns 430910 ns 0.98
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1167 ns 1209 ns 0.97
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 1250 ns 1209 ns 1.03
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 1417 ns 1500 ns 0.94
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 1167 ns 1042 ns 1.12
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 17939 ns 18223.5 ns 0.98
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 4125 ns 4000 ns 1.03
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 3958 ns 4042 ns 0.98
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4292 ns 4334 ns 0.99
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 4062.5 ns 3875 ns 1.05
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 108432 ns 110886 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57333 ns 56709 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46250 ns 38334 ns 1.21
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47041 ns 46917 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82125 ns 81750 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 36736 ns 37932 ns 0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1991000.5 ns 2043708.5 ns 0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2094313 ns 2096520.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2094167 ns 2096437.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1997041.5 ns 1991167 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 194384.5 ns 197294.5 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 143854.5 ns 144625 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 143125 ns 145667 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 147041 ns 144916 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 144750 ns 144854.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 165602 ns 166157.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1114896 ns 1116791 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1128937.5 ns 1150459 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1128792 ns 1128083 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1114542 ns 1121458 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 526049 ns 535998 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 3458 ns 3417 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 3416 ns 4042 ns 0.85
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4145.5 ns 4459 ns 0.93
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 3584 ns 3187.5 ns 1.12
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 70040 ns 72464.5 ns 0.97
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8917 ns 9417 ns 0.95
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9042 ns 9458 ns 0.96
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9459 ns 9750 ns 0.97
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8917 ns 8708 ns 1.02
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 447136 ns 469472 ns 0.95
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 15041 ns 14375 ns 1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17541.5 ns 16208 ns 1.08
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 17625 ns 18750 ns 0.94
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 15917 ns 16875 ns 0.94
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 54471 ns 54038 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 217417 ns 213375 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 213417 ns 220000 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 214979.5 ns 217250 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 225771 ns 213916 ns 1.06
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 270355 ns 270771 ns 1.00
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 791 ns 541 ns 1.46
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 625 ns 542 ns 1.15
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 708 ns 708 ns 1
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 667 ns 667 ns 1
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 17190 ns 17308 ns 0.99
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1500 ns 1417 ns 1.06
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1500 ns 1375 ns 1.09
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1666 ns 1541 ns 1.08
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1500 ns 1417 ns 1.06
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 101385 ns 101606.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7208 ns 7083 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5916 ns 5250 ns 1.13
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5917 ns 5958 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9875 ns 10084 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 23163 ns 23383 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 223083 ns 221709 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 228500 ns 229750 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 230208 ns 229125 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 217000 ns 214125 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 166961 ns 167775.5 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 3958 ns 4000 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 3958 ns 3917 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 23600 ns 23070 ns 1.02
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16792 ns 17083 ns 0.98
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16750 ns 16625 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 17041 ns 17083 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 17000 ns 16833 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 161078 ns 162035 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 577750 ns 575083 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 572709 ns 571792 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 574833 ns 570750 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 575625 ns 577208 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 112893 ns 113295 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1420292 ns 1418250 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1425209 ns 1422875 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1426583 ns 1422500 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 1429020.5 ns 1425750 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 211317.5 ns 211866.5 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s) 1077500 ns 1081041.5 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s) 960792 ns 946916.5 ns 1.01
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s) 1350854.5 ns 1353229.5 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s) 1298750 ns 1292458 ns 1.00
lenet(28, 28, 1, 64)/forward/GPU/CUDA 273506 ns 269913.5 ns 1.01
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s) 6004937.5 ns 6001958 ns 1.00
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s) 4547292 ns 4632042 ns 0.98
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s) 4929708.5 ns 4929041.5 ns 1.00
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s) 5555333 ns 5549750.5 ns 1.00
lenet(28, 28, 1, 64)/zygote/GPU/CUDA 1074648 ns 1070564 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 542 ns 542 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 500 ns 542 ns 0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 583 ns 542 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 500 ns 542 ns 0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 23430 ns 23780 ns 0.99
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2167 ns 2209 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2084 ns 2209 ns 0.94
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2167 ns 2208 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2084 ns 2084 ns 1
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 173597 ns 170642 ns 1.02
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 4292 ns 3667 ns 1.17
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 3750 ns 4750 ns 0.79
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 4917 ns 5208 ns 0.94
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 3958 ns 4041 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 65160 ns 65525 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 11209 ns 11084 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11250 ns 12083 ns 0.93
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 12208 ns 12208 ns 1
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 11125 ns 10834 ns 1.03
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 447745.5 ns 445478.5 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6166 ns 5917 ns 1.04
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6375 ns 6666 ns 0.96
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8125 ns 8167 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6583 ns 6166 ns 1.07
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 52163 ns 52877 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 16750 ns 18250 ns 0.92
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 18209 ns 18458 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 18500 ns 18542 ns 1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 17000 ns 17520.5 ns 0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 298259.5 ns 296963 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 583 ns 583 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 583 ns 625 ns 0.93
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 667 ns 0.94
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 542 ns 542 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 32532 ns 32928.5 ns 0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 8208 ns 9271 ns 0.89
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 8667 ns 9208 ns 0.94
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9333 ns 9354.5 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 8083 ns 8375 ns 0.97
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 158900.5 ns 157633 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 64500 ns 64458 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 64500 ns 64917 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 64458 ns 64583 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 64375 ns 64375 ns 1
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 111633.5 ns 111288 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 274542 ns 278375 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 287042 ns 292291 ns 0.98
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 274708 ns 278833 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 280292 ns 279500 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 186083 ns 186917 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s) 3329333 ns 3287958 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s) 3017229 ns 2909792 ns 1.04
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s) 3024687.5 ns 3017771 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s) 3956250 ns 3935292 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA 577429 ns 579655 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s) 7623958 ns 7602875 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s) 7210334 ns 7372333 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s) 7453270.5 ns 7461313 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s) 8209375 ns 8220167 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA 1359043.5 ns 1357048 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s) 17513124.5 ns 17533125 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s) 17530146 ns 17557125 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s) 17518395.5 ns 17531667 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s) 14128813 ns 9214250 ns 1.53
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23645979.5 ns 23446917 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 33821104.5 ns 43586125 ns 0.78
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 37080041 ns 37247062.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 34888834 ns 35028291.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1866294 ns 1855921.5 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 189046208 ns 189114500 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 164619624.5 ns 178190333 ns 0.92
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 152711479 ns 153393396 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 436948083 ns 434855500 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 13894254.5 ns 13947546 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 289373791 ns 290046875 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 251042625 ns 271392771 ns 0.93
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 296809167 ns 284812041.5 ns 1.04
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 474994229.5 ns 473569708.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 22250 ns 23021 ns 0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 24542 ns 22458 ns 1.09
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 23188 ns 23625 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 22417 ns 22708 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 96027 ns 96516 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 116584 ns 115458.5 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 113125 ns 103250 ns 1.10
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 117833 ns 104375 ns 1.13
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 103854 ns 105042 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 510213 ns 508001.5 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5833 ns 5750 ns 1.01
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 5917 ns 6500 ns 0.91
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 6812.5 ns 6708 ns 1.02
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6292 ns 6125 ns 1.03
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 68158.5 ns 68991.5 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14875 ns 14042 ns 1.06
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14812.5 ns 15500 ns 0.96
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14875 ns 15687.5 ns 0.95
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 15042 ns 14500 ns 1.04
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 478636.5 ns 478721 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3009146 ns 2979083.5 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2061334 ns 2084000 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2279208 ns 2281500 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4871541.5 ns 4814250 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA 589315.5 ns 585630.5 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 23547375 ns 23560375 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 17982875.5 ns 18266583.5 ns 0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 16893209 ns 16959209 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 34849958 ns 34863041.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2772744 ns 2766675 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 33314834 ns 33305667 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 27464208 ns 27994104 ns 0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 27410208 ns 27448959 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 41078500 ns 40756916 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 72375 ns 74000 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 74375 ns 73333 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 75166 ns 74917 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 75167 ns 74500 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 102682 ns 104050 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 286145.5 ns 218083 ns 1.31
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 210021.5 ns 210625 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 315000 ns 296708.5 ns 1.06
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 218458 ns 217792 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 553543 ns 558286.5 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11875 ns 11750 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 11708 ns 12417 ns 0.94
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 13334 ns 12458.5 ns 1.07
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 13125 ns 11834 ns 1.11
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 71259 ns 72847.5 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 26833.5 ns 26125 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 26375 ns 27167 ns 0.97
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 27417 ns 27375 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 25854.5 ns 26458 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 477064.5 ns 484580 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 12041.5 ns 11583 ns 1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 12229.5 ns 12167 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 13958 ns 14000 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 12584 ns 11792 ns 1.07
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 53895.5 ns 55176 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 25875 ns 25542 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 25834 ns 26417 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 26125 ns 28709 ns 0.91
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 25667 ns 26042 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 305285 ns 307604.5 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 179417 ns 179208 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 179417 ns 181042 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 181041 ns 184333.5 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 180042 ns 179416 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 58113 ns 57654 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 590084 ns 590646 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 585083 ns 591479 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 591062.5 ns 593500 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 584333 ns 582749.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 289662.5 ns 291261 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6083 ns 6083.5 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 5500 ns 6375 ns 0.86
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7542 ns 6708 ns 1.12
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6604.5 ns 6292 ns 1.05
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 70599 ns 71643 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14291 ns 14250 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14209 ns 15167 ns 0.94
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14917 ns 15292 ns 0.98
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 13062.5 ns 14042 ns 0.93
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 466681.5 ns 470922.5 ns 0.99
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 1223541.5 ns 1203770.5 ns 1.02
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 1236625 ns 1236645.5 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 1285666.5 ns 1343083 ns 0.96
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 1007959 ns 1024395.5 ns 0.98
batchedmm(512, Bsize=4)/forward/GPU/CUDA 301986 ns 300123 ns 1.01
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 4226959 ns 4091000 ns 1.03
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 4384249.5 ns 4576917 ns 0.96
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 4572312.5 ns 4574875.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 3695104.5 ns 3718250 ns 0.99
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1047036 ns 1038641 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1833 ns 1875 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1792 ns 1875 ns 0.96
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1833 ns 1875 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1875 ns 1875 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 24200 ns 23874.5 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4875 ns 5083 ns 0.96
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4833 ns 5000 ns 0.97
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4875 ns 4959 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4875 ns 4875 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 192268.5 ns 193867 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5458 ns 5500 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5542 ns 5709 ns 0.97
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6791.5 ns 6875 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5792 ns 5416 ns 1.07
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 56595.5 ns 57200 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10500 ns 11042 ns 0.95
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10416 ns 11584 ns 0.90
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 11375 ns 11500 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10875 ns 10625 ns 1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 335979.5 ns 332575 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 334 ns 375 ns 0.89
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 333 ns 333 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 333 ns 334 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 334 ns 334 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 23172 ns 22978 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2833 ns 2834 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2709 ns 2792 ns 0.97
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 3042 ns 3000 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2791 ns 2833 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 162255.5 ns 163496 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 11084 ns 11625 ns 0.95
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 11000 ns 11292 ns 0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 13563 ns 12875 ns 1.05
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 11458 ns 11209 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 58685.5 ns 58225 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 24542 ns 24958 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24542 ns 25208 ns 0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25167 ns 25375 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 25000 ns 25042 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 298266 ns 299318 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4208 ns 4250 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4208 ns 4250 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4250 ns 4250 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4250 ns 4250 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 25307 ns 25190 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16166 ns 16209 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16292 ns 16083 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16334 ns 16625 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16084 ns 16500 ns 0.97
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 199542 ns 202972 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5709 ns 5833 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 5917 ns 5792 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5792 ns 5959 ns 0.97
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5834 ns 5792 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 33833 ns 34611 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 20292 ns 20625 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 20375 ns 21042 ns 0.97
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 20875 ns 21083 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 20250 ns 20125 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 178083 ns 178483.5 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 420500 ns 414125 ns 1.02
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 372625 ns 367771 ns 1.01
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 482833 ns 480813 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 103292 ns 104146 ns 0.99
batchedmm(16, Bsize=512)/forward/GPU/CUDA 67723.5 ns 67750.5 ns 1.00
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 922417 ns 927125 ns 0.99
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 955208.5 ns 964354 ns 0.99
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 1180875 ns 1186833 ns 0.99
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 379083 ns 376584 ns 1.01
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 192988 ns 192974.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 136917 ns 77583 ns 1.76
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 79854.5 ns 79125 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 82750 ns 83542 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 81167 ns 79958 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 194081 ns 193934 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1915042 ns 1917959 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1919750 ns 1933541 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1926125 ns 1931521.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1915750 ns 1860375 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 401908.5 ns 392771 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 333 ns 333 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 22364 ns 22416 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1833 ns 1792 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1792 ns 1875 ns 0.96
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1834 ns 1875 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1834 ns 1875 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 174295 ns 174762 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 6042 ns 6562.5 ns 0.92
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6500 ns 6417 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7812.5 ns 8166 ns 0.96
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6541 ns 6208 ns 1.05
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 61489.5 ns 59227 ns 1.04
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9000 ns 9292 ns 0.97
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8792 ns 9250 ns 0.95
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9375 ns 9375 ns 1
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9459 ns 9083 ns 1.04
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 308375 ns 304901.5 ns 1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 118419979.5 ns 120543687.5 ns 0.98
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 173770000 ns 181954416.5 ns 0.96
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 148397083 ns 148126750 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 104919541 ns 106134709 ns 0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5493586 ns 5492614.5 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 611739750.5 ns 609833750 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 553521958 ns 578593208 ns 0.96
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 449841709 ns 451045708.5 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 631089333.5 ns 627478333.5 ns 1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 38209825 ns 35107131 ns 1.09
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 652096250 ns 652518625 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 661126562.5 ns 683671437.5 ns 0.97
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 580970687.5 ns 587115583.5 ns 0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 848782167 ns 852245209 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58667 ns 58000 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 47500 ns 39209 ns 1.21
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 48250 ns 48208 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83625 ns 85167 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 37628 ns 38635 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1919312.5 ns 1920104 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1980333.5 ns 1988000 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1982541.5 ns 1980667 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1895625 ns 1907896 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 176341 ns 176329 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 266208 ns 267041 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 265334 ns 270500 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 288604 ns 268750 ns 1.07
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 268167 ns 265291 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 130454.5 ns 123893.5 ns 1.05
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 664646 ns 596166 ns 1.11
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 671062.5 ns 698625 ns 0.96
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 665875 ns 702916.5 ns 0.95
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 597542 ns 589292 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 690208 ns 677537.5 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2192312.5 ns 2180187.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2179542 ns 2215229 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2181333.5 ns 2212000 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2207146 ns 2207792 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 134808 ns 133207 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5469791 ns 5497667 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5472958.5 ns 5581500 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5499916 ns 5516125 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5442583.5 ns 5545124.5 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 720984 ns 717120 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 644667 ns 656041 ns 0.98
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 644084 ns 642917 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 642042 ns 637375 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 644167 ns 644167 ns 1
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 47636.5 ns 46463 ns 1.03
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1819917 ns 1822875 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1720500 ns 1668958.5 ns 1.03
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1721792 ns 1723334 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 2100000 ns 2101084 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 224071 ns 222123 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57667 ns 57667 ns 1
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46666 ns 38708 ns 1.21
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46583 ns 46916 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83750 ns 85084 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 28795 ns 28664 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2029583 ns 2028604.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2087375 ns 2097916.5 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2087791.5 ns 2087625 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1991416.5 ns 2005812 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 190320 ns 188609 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 13371041.5 ns 13343604 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 12439187.5 ns 12536250 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 12491875 ns 12547834 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 15195833.5 ns 15250271 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 516777 ns 510611.5 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 47119104.5 ns 47204500 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 41727062.5 ns 41927292 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 41051417 ns 40799666 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 58599458 ns 58864104 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2892052.5 ns 2889030 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 74212666 ns 73523334 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 67877750 ns 91557750 ns 0.74
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 90536499.5 ns 90571250.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 98549792 ns 75976041 ns 1.30
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58375 ns 58083 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46459 ns 38875 ns 1.20
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47708 ns 47709 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83958 ns 82042 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 47165 ns 48950 ns 0.96
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1919583.5 ns 1916542 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1980791 ns 1982083 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1979229.5 ns 1947333 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1886958 ns 1876854 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 193816.5 ns 195268 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 333 ns 333 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 333 ns 375 ns 0.89
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 333 ns 292 ns 1.14
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 32624 ns 32997 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 5833 ns 5834 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6083 ns 6500 ns 0.94
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6416.5 ns 6458.5 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 5833 ns 5958 ns 0.98
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 171378.5 ns 171034 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 291 ns 250 ns 1.16
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 333 ns 292 ns 1.14
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 292 ns 250 ns 1.17
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 32204 ns 32918 ns 0.98
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 2583 ns 2750 ns 0.94
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 2625 ns 2750 ns 0.95
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 2875 ns 2917 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 2625 ns 2625 ns 1
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 159764 ns 161268 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 286393770.5 ns 286917729.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 340253500 ns 347948583.5 ns 0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 313806270.5 ns 314136145.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 268566520.5 ns 267700542 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 7103110 ns 7080984 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 1012043792 ns 1009676125 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 955581708 ns 974877416 ns 0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 855297583 ns 854637270.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1259239875 ns 1260982959 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 33847341 ns 34048271 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1418325958.5 ns 1387098104 ns 1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 1338395020.5 ns 1694333625 ns 0.79
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1636087292 ns 1631003167 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 1775858125 ns 1358038896 ns 1.31
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1409833 ns 1411604.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1414458.5 ns 1409250 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1465562.5 ns 1407354.5 ns 1.04
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1413458.5 ns 1405916 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 127951 ns 128067 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5027250 ns 5023999.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5036354 ns 5051396 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5030437.5 ns 5029104.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5027250.5 ns 5040479 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 479205.5 ns 514176 ns 0.93
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s) 170869291 ns 170919250 ns 1.00
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s) 128735708 ns 183735542 ns 0.70
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s) 105431542 ns 115460229.5 ns 0.91
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s) 167706958 ns 168486416 ns 1.00
vgg16(32, 32, 3, 32)/forward/GPU/CUDA 4877746.5 ns 4853309 ns 1.01
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s) 511068334 ns 627387000 ns 0.81
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s) 490911792 ns 561666625 ns 0.87
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s) 385742875 ns 453969542 ns 0.85
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s) 650161000 ns 654142166 ns 0.99
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA 16340937 ns 17017885 ns 0.96
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 9003042 ns 8912729 ns 1.01
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 8983042 ns 9063708 ns 0.99
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 7909375 ns 7941979 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 9604229.5 ns 9820979.5 ns 0.98
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1611438.5 ns 1590505 ns 1.01
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 36334167 ns 36015084 ns 1.01
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 37265291.5 ns 38799959 ns 0.96
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 33553354 ns 33679959 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 37555333 ns 37936417 ns 0.99
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6454550 ns 6472671 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 47333 ns 47459 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 47500 ns 47708 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 47625 ns 47625 ns 1
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 47417 ns 47209 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 18252 ns 17832 ns 1.02
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 50417 ns 50416 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 50666 ns 50292 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 50625 ns 50458 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 50250 ns 50291 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 164880 ns 162828 ns 1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6417 ns 6208 ns 1.03
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6792 ns 7083 ns 0.96
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7583.5 ns 7562.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6792 ns 6292 ns 1.08
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 76692.5 ns 74130 ns 1.03
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10125 ns 9375 ns 1.08
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9750 ns 10250 ns 0.95
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10250 ns 10375 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9875 ns 9917 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 448214.5 ns 422862.5 ns 1.06
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5666 ns 5666 ns 1
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 5791 ns 6500 ns 0.89
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7583 ns 6916 ns 1.10
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6042 ns 5375 ns 1.12
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 81735 ns 78877.5 ns 1.04
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13208 ns 12875 ns 1.03
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12709 ns 13583 ns 0.94
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13375 ns 13583 ns 0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 13417 ns 13208 ns 1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 399198.5 ns 370972.5 ns 1.08
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 959 ns 1083 ns 0.89
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 1000 ns 1083 ns 0.92
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 1042 ns 1083 ns 0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 1083 ns 1042 ns 1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 32447 ns 33127 ns 0.98
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7666 ns 7792 ns 0.98
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7708 ns 8167 ns 0.94
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7958 ns 8083 ns 0.98
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8166 ns 7792 ns 1.05
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 187787.5 ns 187081.5 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 23167 ns 23333 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 23209 ns 23417 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 23250 ns 23583 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 23292 ns 23084 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 18320.5 ns 18527 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 52917 ns 52042 ns 1.02
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 52167 ns 52750 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 52917 ns 52875 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 52875 ns 52542 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 214503.5 ns 204233 ns 1.05
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1398125 ns 1398875 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1402146 ns 1455625 ns 0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1406437.5 ns 1404042 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1448937.5 ns 1406584 ns 1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 196187.5 ns 196492.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5003458 ns 4999875 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5029708 ns 5037708 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5015042 ns 5003083 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5005729.5 ns 5024916 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 509817 ns 495167 ns 1.03
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3051834 ns 3047396 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2076520.5 ns 2106521 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2302500 ns 2296895.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4658291.5 ns 4962229.5 ns 0.94
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 581685 ns 583841 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 24315708 ns 24384458 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 18877250 ns 19075709 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 17822166 ns 17765562.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 35790999.5 ns 35955916.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2842698 ns 2836787 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 33982916.5 ns 33991937.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 28228208.5 ns 28748917 ns 0.98
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 27940958 ns 28081042 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 41757334 ns 41668854.5 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 143078500 ns 142678458 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 146668125 ns 147270333 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 127355624.5 ns 126985770.5 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 171841729.5 ns 174826021 ns 0.98
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22550146 ns 22556485 ns 1.00
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 1234730083.5 ns 1026522125 ns 1.20
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 1060723417 ns 866022875.5 ns 1.22
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 1027004875 ns 743843334 ns 1.38
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 674561583 ns 682878792 ns 0.99
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 117659213 ns 116543149 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 74125 ns 76083 ns 0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 73146 ns 76250 ns 0.96
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 76000 ns 77625 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 85834 ns 75833.5 ns 1.13
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 175925 ns 163749.5 ns 1.07
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 215750 ns 275437.5 ns 0.78
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 192541.5 ns 283542 ns 0.68
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 284542 ns 275959 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 285708 ns 282375 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 952026.5 ns 882740 ns 1.08
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 35486000 ns 35483000 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 36428646.5 ns 36565000 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 32475229 ns 32543896 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 40408041.5 ns 40679500 ns 0.99
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5831517 ns 5828412 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 146000771 ns 147536708 ns 0.99
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 154808750 ns 157209875 ns 0.98
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 137043083.5 ns 136063312.5 ns 1.01
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 285556542 ns 286255000 ns 1.00
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 34852076.5 ns 34875549.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 121592083 ns 122158104.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 174639125 ns 181447688 ns 0.96
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 148027541 ns 147872917 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 105917833 ns 104774833.5 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5344344 ns 5433572 ns 0.98
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 468650958 ns 468969166 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 466713000 ns 487732687.5 ns 0.96
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 437158458 ns 437061208 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 744371959 ns 745602708 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 35992005 ns 31632434 ns 1.14
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 712765167 ns 708533125.5 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 641204167 ns 662068729.5 ns 0.97
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 624084979.5 ns 625681375 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 856208084 ns 856533500 ns 1.00
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s) 1270583 ns 1243917 ns 1.02
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s) 995709 ns 778625 ns 1.28
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s) 995875 ns 961709 ns 1.04
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s) 2037625 ns 2098041.5 ns 0.97
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA 569478 ns 581626.5 ns 0.98
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s) 2961229.5 ns 2966062.5 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s) 2647792 ns 2513979 ns 1.05
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s) 2621500 ns 2620167 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s) 3709750 ns 3551916 ns 1.04
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA 1587708.5 ns 1532656 ns 1.04
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s) 5785812.5 ns 5803146 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s) 5824083 ns 5896375 ns 0.99
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s) 5785375 ns 5798708 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s) 2904896 ns 2924083 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7250 ns 7083 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6125 ns 5291 ns 1.16
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6042 ns 6208 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10042 ns 10166 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 24479.5 ns 25159 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 223812.5 ns 212500 ns 1.05
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 222667 ns 220625 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 220792 ns 220709 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 240666 ns 213625 ns 1.13
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 212315.5 ns 199491.5 ns 1.06
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s) 296229125 ns 297113041 ns 1.00
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s) 216728584 ns 291058458 ns 0.74
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s) 190254604.5 ns 193310291.5 ns 0.98
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s) 304954521 ns 304396812.5 ns 1.00
vgg16(32, 32, 3, 64)/forward/GPU/CUDA 7671461.5 ns 7678125.5 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s) 1229817167 ns 1231332166.5 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s) 902846291.5 ns 973933875 ns 0.93
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s) 824304209 ns 836913500 ns 0.98
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s) 1157856750.5 ns 1148765416.5 ns 1.01
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA 26996841 ns 26856489.5 ns 1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5292 ns 4792 ns 1.10
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5291.5 ns 5875 ns 0.90
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6375 ns 6354 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5250 ns 4667 ns 1.12
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 112898 ns 93183 ns 1.21
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6875 ns 7000 ns 0.98
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6958 ns 7625 ns 0.91
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7583 ns 7458 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7125 ns 7395.5 ns 0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 535221.5 ns 440751 ns 1.21
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 500 ns 500 ns 1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 584 ns 667 ns 0.88
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 584 ns 584 ns 1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 541 ns 500 ns 1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 23660 ns 24653 ns 0.96
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 8625 ns 8625 ns 1
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9084 ns 9500 ns 0.96
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 9417 ns 9917 ns 0.95
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 8708 ns 8792 ns 0.99
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 195936.5 ns 176547.5 ns 1.11
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 352958.5 ns 353584 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 352792 ns 353833 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 351479 ns 352208 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 356708.5 ns 351500 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 20962 ns 21275 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 775625 ns 807916.5 ns 0.96
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 825833 ns 789854 ns 1.05
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 812229.5 ns 776042 ns 1.05
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 834959 ns 778833 ns 1.07
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 234827 ns 215262.5 ns 1.09
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 341562.5 ns 339229 ns 1.01
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 341958 ns 321000 ns 1.07
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 455917 ns 454187 ns 1.00
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 11083 ns 10916 ns 1.02
batchedmm(16, Bsize=32)/forward/GPU/CUDA 17699 ns 18631 ns 0.95
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 712500 ns 714125 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 739896 ns 731625 ns 1.01
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 1007854 ns 1006333 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 26459 ns 26667 ns 0.99
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 214680.5 ns 196596.5 ns 1.09
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 381042 ns 381833.5 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 346750 ns 330959 ns 1.05
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 449187.5 ns 444916.5 ns 1.01
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 39042 ns 31417 ns 1.24
batchedmm(16, Bsize=128)/forward/GPU/CUDA 22537 ns 23162 ns 0.97
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 733792 ns 727875 ns 1.01
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 788958 ns 783542 ns 1.01
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 1032500 ns 1030146 ns 1.00
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 105583 ns 90750 ns 1.16
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 200835.5 ns 193002.5 ns 1.04
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 3791 ns 3583 ns 1.06
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 3541 ns 3709 ns 0.95
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 3708 ns 3625 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 3708 ns 3375 ns 1.10
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 17542 ns 17634 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 4250 ns 4291 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 4167 ns 4208 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 4250 ns 4333 ns 0.98
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 4250 ns 4125 ns 1.03
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 204574.5 ns 200435.5 ns 1.02
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 3834 ns 3500 ns 1.10
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 3667 ns 4167 ns 0.88
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4250 ns 4375 ns 0.97
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 3625 ns 3583 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 160115.5 ns 151437.5 ns 1.06
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8292 ns 8458 ns 0.98
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8166 ns 8583 ns 0.95
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8458 ns 8333 ns 1.02
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8333 ns 8458 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 989699 ns 927946.5 ns 1.07
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 203375 ns 204583 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 212791 ns 209000 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 210666 ns 210500 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 200834 ns 199084 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 34428 ns 35183 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 652624.5 ns 602833.5 ns 1.08
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 622667 ns 629209 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 631604.5 ns 625584 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 632750 ns 582250 ns 1.09
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 280400.5 ns 266930.5 ns 1.05
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 994229.5 ns 990542 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 1040292 ns 1053625 ns 0.99
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 956020.5 ns 954292 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 853917 ns 901104 ns 0.95
batchedmm(128, Bsize=128)/forward/GPU/CUDA 208023.5 ns 206789.5 ns 1.01
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 4502437.5 ns 4511208 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 4668229.5 ns 4854542 ns 0.96
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 4455084 ns 4490209 ns 0.99
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 4280937 ns 4299083.5 ns 1.00
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 935555 ns 930739 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3292 ns 3084 ns 1.07
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 3458 ns 3500 ns 0.99
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 4042 ns 4083.5 ns 0.99
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3209 ns 3000 ns 1.07
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 159049 ns 144120 ns 1.10
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7291 ns 7250 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7333 ns 7333 ns 1
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7334 ns 7500 ns 0.98
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6833 ns 7041 ns 0.97
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 850635.5 ns 806482 ns 1.05
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1640041 ns 1636250 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1196604.5 ns 1158208.5 ns 1.03
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1383250 ns 1368083 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2417500 ns 2308063 ns 1.05
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA 215018 ns 214505 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12333396 ns 12270583 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9592791.5 ns 9567750 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9267625 ns 9243645.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18011459 ns 18134146 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1959459 ns 1954133 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17332937.5 ns 17281250 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14386792 ns 14453375 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14369396.5 ns 14325333 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21112291.5 ns 21045500 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 87708 ns 85708 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 88542 ns 91520.5 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 92833 ns 93250 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 116000 ns 87833.5 ns 1.32
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 126352.5 ns 126207 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2022959 ns 2017958 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2049666 ns 2050542 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2035562.5 ns 2029834 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2025938 ns 2026959 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 878938 ns 841405 ns 1.04
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 2750 ns 1375 ns 2
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 3209 ns 1917 ns 1.67
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 3417 ns 3583.5 ns 0.95
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 2792 ns 2375 ns 1.18
batchedmm(2, Bsize=4)/forward/GPU/CUDA 16283 ns 16017 ns 1.02
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 2542 ns 2875 ns 0.88
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 2708 ns 2833 ns 0.96
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 2875 ns 2750 ns 1.05
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 2834 ns 2792 ns 1.02
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 176848 ns 165765.5 ns 1.07
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7083 ns 7208 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6000 ns 5333 ns 1.13
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6041 ns 5958 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10042 ns 10084 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 34134 ns 34231 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 221583 ns 214458 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 220000 ns 220042 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 220417 ns 221416 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 215333 ns 235834 ns 0.91
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 285763.5 ns 263066.5 ns 1.09
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3750 ns 3708 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3750 ns 3750 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3750 ns 3750 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3709 ns 3708 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 22875 ns 22879.5 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14500 ns 14459 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14375 ns 14375 ns 1
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14458 ns 14541 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14500 ns 14500 ns 1
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 410580 ns 399546.5 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 92125 ns 94312.5 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 92916 ns 95875 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 96979 ns 97583 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 138000 ns 94354.5 ns 1.46
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 125660 ns 125486.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1923792 ns 1919437.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1935291 ns 1938250 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1932916.5 ns 1927084 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1920500 ns 1803750 ns 1.06
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 861874.5 ns 794850 ns 1.08
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s) 873916 ns 875354.5 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s) 826583 ns 802104.5 ns 1.03
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s) 1222000 ns 1225042 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s) 963750 ns 970374.5 ns 0.99
lenet(28, 28, 1, 32)/forward/GPU/CUDA 276546 ns 273954 ns 1.01
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s) 2791083 ns 2714354 ns 1.03
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s) 2445687.5 ns 2504167 ns 0.98
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s) 3347916 ns 3360375 ns 1.00
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s) 3371375 ns 3360334 ns 1.00
lenet(28, 28, 1, 32)/zygote/GPU/CUDA 1487194.5 ns 1467965 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17250 ns 17542 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17959 ns 16937.5 ns 1.06
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 17875 ns 18708 ns 0.96
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17417 ns 14584 ns 1.19
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 130892 ns 129735 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 218625 ns 214709 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 260667 ns 215958.5 ns 1.21
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 227792 ns 215562.5 ns 1.06
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 256083 ns 217958 ns 1.17
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 584591.5 ns 539139.5 ns 1.08
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 222000 ns 223375 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 222667 ns 220958 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 222312.5 ns 222645.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 220833 ns 219625 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 243596.5 ns 217203.5 ns 1.12
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 501417 ns 495895.5 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 496084 ns 506625 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 508541.5 ns 510958 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 561833 ns 561375 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1202534 ns 1153506.5 ns 1.04
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 3895.5 ns 3917 ns 0.99
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 4270.5 ns 4667 ns 0.92
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 5708 ns 4834 ns 1.18
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 4458.5 ns 4833 ns 0.92
batchedmm(16, Bsize=4)/forward/GPU/CUDA 16584 ns 17326 ns 0.96
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 7208.5 ns 7520.5 ns 0.96
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 7000 ns 7625 ns 0.92
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 7625 ns 7458 ns 1.02
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 7500 ns 7417 ns 1.01
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 179332 ns 176736 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17687 ns 16646 ns 1.06
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17917 ns 18500 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 18625 ns 19625 ns 0.95
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18729 ns 18042 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 135434 ns 133143.5 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 211041 ns 213000 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 220417 ns 212916 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 212542 ns 213667 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 212271 ns 224895.5 ns 0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 847267 ns 820129 ns 1.03
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 3959 ns 4354.5 ns 0.91
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 4209 ns 4625 ns 0.91
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 4875 ns 4917 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 4291 ns 3875 ns 1.11
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 187480.5 ns 175343 ns 1.07
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10459 ns 10208 ns 1.02
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10541.5 ns 10333 ns 1.02
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10042 ns 10834 ns 0.93
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10125 ns 10208 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 955985 ns 980341 ns 0.98
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3145.5 ns 3250 ns 0.97
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 2937.5 ns 3687.5 ns 0.80
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 4000 ns 4292 ns 0.93
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3167 ns 2917 ns 1.09
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 188520.5 ns 215866 ns 0.87
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7375 ns 7166 ns 1.03
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7209 ns 7625 ns 0.95
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7625 ns 7792 ns 0.98
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7333 ns 7375 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 987324 ns 1015020 ns 0.97
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23406938 ns 23687417 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 35765125 ns 42666354 ns 0.84
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 37705500 ns 37344478.5 ns 1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 34946604 ns 34948333.5 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1830206.5 ns 1824017 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 183995333 ns 183871416 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 165575375 ns 182812313 ns 0.91
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 146468292 ns 145975437.5 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 274483625 ns 274277542 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 16521685 ns 16507012 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 276817937 ns 273782791 ns 1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 246377395.5 ns 257949042 ns 0.96
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 231576042 ns 231995083.5 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 325032833.5 ns 323882958.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 182896.5 ns 183541 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 184292 ns 184000 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 184958 ns 185292 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 183167 ns 182542 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 200810.5 ns 191911.5 ns 1.05
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 635333 ns 629458.5 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 633354.5 ns 587334 ns 1.08
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 600291 ns 587125.5 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 597271 ns 649291 ns 0.92
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 958799 ns 963628 ns 0.99
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 3842750 ns 3851750 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 3997500 ns 3983792 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 3542792 ns 3579833 ns 0.99
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 4556625 ns 4612292 ns 0.99
batchedmm(128, Bsize=512)/forward/GPU/CUDA 532425 ns 531156 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 17396104 ns 17385812.5 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 18078958 ns 18439958.5 ns 0.98
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 16589917 ns 16577084 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 19981167 ns 20232667 ns 0.99
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2633170 ns 2638769 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 542 ns 625 ns 0.87
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 542 ns 625 ns 0.87
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 625 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 583 ns 542 ns 1.08
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 32094 ns 32361 ns 0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 8917 ns 9312.5 ns 0.96
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 8750 ns 9604.5 ns 0.91
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9041 ns 9541 ns 0.95
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9042 ns 8750 ns 1.03
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 249030 ns 248738 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s) 652464437.5 ns 650277229.5 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s) 394034604 ns 513797917 ns 0.77
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s) 326393417 ns 364513416 ns 0.90
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s) 748745833 ns 753229708 ns 0.99
vgg16(32, 32, 3, 128)/forward/GPU/CUDA 12466975 ns 11759811 ns 1.06
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s) 1885107791.5 ns 1878034500 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s) 1638827875 ns 1671899375 ns 0.98
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s) 1512914354 ns 1507608416.5 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s) 2208603583.5 ns 2202946667 ns 1.00
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA 49231175.5 ns 49516620 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1616792 ns 1535958.5 ns 1.05
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1200917 ns 1179292 ns 1.02
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1389625 ns 1380729.5 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2477916.5 ns 2368083 ns 1.05
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 215338 ns 215337 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12691834 ns 12730083 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9979354.5 ns 9937625 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9689896 ns 9659583.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18371271 ns 18459917 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1985308 ns 2010689 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17676916 ns 17677292 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14722000 ns 14810083 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14613667 ns 14573229.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21413395.5 ns 21483000 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 26292 ns 26292 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 26250 ns 26250 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 26291 ns 26250 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 26250 ns 26208 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 23721 ns 23665 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 67333 ns 67166 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 67333 ns 66875 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 67209 ns 67250 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 67333 ns 66958 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 367128.5 ns 367986.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 203542 ns 204583 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 208625 ns 209292 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 209584 ns 210500 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 199792 ns 199625 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 25494 ns 26073 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 604625 ns 613125 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 670666.5 ns 625459 ns 1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 632166.5 ns 633583 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 630000 ns 632083 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 321975.5 ns 320857.5 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 639021 ns 592750 ns 1.08
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 643458 ns 647000 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 658750 ns 648834 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 632750 ns 671792 ns 0.94
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 131332 ns 131354 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2244229 ns 2247291 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2277708.5 ns 2303208 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2240167 ns 2243604 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2235458.5 ns 2314875.5 ns 0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1075922 ns 1083962 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17167 ns 16687.5 ns 1.03
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17916 ns 18458 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 18167 ns 19770.5 ns 0.92
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18208 ns 18146 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 130720.5 ns 132087.5 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 258584 ns 229375 ns 1.13
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 227459 ns 262896 ns 0.87
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 232750 ns 231208 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 230791 ns 258624.5 ns 0.89
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 887768.5 ns 885149.5 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 625 ns 625 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 625 ns 625 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 666 ns 667 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 542 ns 542 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 23104 ns 23686 ns 0.98
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9750 ns 8708 ns 1.12
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9250 ns 10000 ns 0.93
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 9208 ns 10000 ns 0.92
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9417 ns 9250 ns 1.02
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 242418 ns 241904 ns 1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5208 ns 5417 ns 0.96
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5125 ns 5583 ns 0.92
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6375 ns 6417 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5375 ns 4770.5 ns 1.13
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 193804 ns 194851.5 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7167 ns 7667 ns 0.93
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7250 ns 7417 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7375 ns 7792 ns 0.95
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7042 ns 7250 ns 0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 706410 ns 705733 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 2125 ns 2167 ns 0.98
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 2250 ns 2208 ns 1.02
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 2209 ns 2542 ns 0.87
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 2208 ns 2208 ns 1
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 17672 ns 17804 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 6458 ns 6541 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 6291 ns 6500 ns 0.97
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 6709 ns 6875 ns 0.98
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 6500 ns 6417 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 300575 ns 294742 ns 1.02
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 749459 ns 746916 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 748959 ns 761333 ns 0.98
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 750854 ns 750541 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 749167 ns 749459 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 20805 ns 20924 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 775208 ns 790875 ns 0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 795916.5 ns 777375 ns 1.02
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 792791 ns 792500 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 792792 ns 778250 ns 1.02
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 274546.5 ns 268681.5 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7208 ns 7375 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5917 ns 5250 ns 1.13
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5959 ns 5875 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10250 ns 10292 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 33244 ns 32725 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 219625 ns 219208 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 240291 ns 230937.5 ns 1.04
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 237583 ns 236625 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 260042 ns 214312.5 ns 1.21
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 337443 ns 332717.5 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 10084 ns 10291 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 9583 ns 10937.5 ns 0.88
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 10750 ns 10625 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 10167 ns 9916 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 223296.5 ns 219475.5 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 25125 ns 24416 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24312.5 ns 25417 ns 0.96
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 24917 ns 24875 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 24667 ns 24354.5 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 1047460.5 ns 1060762 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 106018062.5 ns 106190416 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 118144520.5 ns 126215417 ns 0.94
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 120409292 ns 120200125 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 117468833 ns 117655917 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 2652084 ns 2587994 ns 1.02
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 373672500 ns 395454916.5 ns 0.94
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 359102771.5 ns 372350083.5 ns 0.96
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 356068521.5 ns 355285895.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 543525042 ns 542892500 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 15230726 ns 15209611 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 605345333 ns 607219000 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 584604208 ns 775694542 ns 0.75
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 744606604.5 ns 743546708 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 793208583.5 ns 606917208 ns 1.31
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6500 ns 6729.5 ns 0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6375 ns 7458 ns 0.85
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8062 ns 8791 ns 0.92
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7146 ns 6084 ns 1.17
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 216878 ns 214170 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 13625 ns 14645.5 ns 0.93
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 13625 ns 14167 ns 0.96
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14125 ns 14334 ns 0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14084 ns 13417 ns 1.05
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 1010131 ns 1010027 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5625 ns 6042 ns 0.93
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6000 ns 6708.5 ns 0.89
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7895.5 ns 6958 ns 1.13
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5958 ns 5166.5 ns 1.15
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 211472.5 ns 211003 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12583 ns 12916 ns 0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12333 ns 12979.5 ns 0.95
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 12708 ns 13041 ns 0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12709 ns 12375 ns 1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 725788 ns 725511 ns 1.00
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 5583 ns 5792 ns 0.96
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 5875 ns 6084 ns 0.97
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 6583.5 ns 7166 ns 0.92
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 6167 ns 5979.5 ns 1.03
batchedmm(2, Bsize=128)/forward/GPU/CUDA 17002 ns 16985 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 15916 ns 16375 ns 0.97
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 15250 ns 15917 ns 0.96
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 16125 ns 15750 ns 1.02
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 15834 ns 15750 ns 1.01
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 187784.5 ns 184955.5 ns 1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 292 ns 333 ns 0.88
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 417 ns 0.90
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 417 ns 0.90
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 334 ns 292 ns 1.14
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 23531 ns 23469 ns 1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6167 ns 6375 ns 0.97
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6292 ns 6292 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6459 ns 6458 ns 1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6084 ns 6020.5 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 228744 ns 226513 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5834 ns 5917 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 5916 ns 6000 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 5959 ns 6083 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 5959 ns 5833 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 24273 ns 24637 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 20833 ns 21375 ns 0.97
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 20750 ns 21083 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 21292 ns 21167 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 21041 ns 20875 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 251207.5 ns 248819 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 185375 ns 144938 ns 1.28
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 144625 ns 147666 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 147917 ns 147500 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 144417 ns 144208 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 166909.5 ns 166863.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1321833 ns 1328917 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1350479 ns 1366916.5 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1337166 ns 1323667 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1323625 ns 1330125 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1251196 ns 1231201 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 24833 ns 21917 ns 1.13
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 25041 ns 23250 ns 1.08
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 23958 ns 25417 ns 0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 24271 ns 24583 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 315591 ns 261684.5 ns 1.21
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 131292 ns 126249.5 ns 1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 118396 ns 132125 ns 0.90
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 176916 ns 180458 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 129458 ns 182166 ns 0.71
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1353120 ns 1329052 ns 1.02
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 333 ns 334 ns 1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 417 ns 375 ns 1.11
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 292 ns 292 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 23127 ns 23064 ns 1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6125 ns 6417 ns 0.95
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6459 ns 6500 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6333 ns 6583 ns 0.96
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6125 ns 6083 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 245064.5 ns 241726 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4208 ns 4583 ns 0.92
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4875 ns 4875 ns 1
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 5125 ns 5062.5 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4667 ns 4375 ns 1.07
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 228957.5 ns 230879.5 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9875 ns 9792 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9875 ns 10375 ns 0.95
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10334 ns 10333 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10208 ns 10125 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 1285818.5 ns 1281938 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1584 ns 1584 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1625 ns 1625 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1625 ns 1625 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1625 ns 1583 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 23344 ns 23016.5 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 5750 ns 5709 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 5709 ns 5750 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 6000 ns 6042 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 5666 ns 5625 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 264086.5 ns 260870.5 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 6807541.5 ns 6736854 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 6433375 ns 6358292 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 6489875 ns 6526333 ns 0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 7649521 ns 7511917 ns 1.02
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 214938 ns 214549 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 24073959 ns 24072542 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 21296000 ns 21309271.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 21044062.5 ns 21010584 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 29805771 ns 29840125 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 2104181 ns 2110310.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 37247625 ns 37228250 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 34089791 ns 45827250 ns 0.74
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 45725979.5 ns 45480416 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 49397750 ns 38465479 ns 1.28
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5500 ns 5708 ns 0.96
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5708 ns 5708 ns 1
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6541 ns 6729.5 ns 0.97
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5708 ns 5208.5 ns 1.10
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 208256 ns 215925.5 ns 0.96
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8084 ns 8833 ns 0.92
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8125 ns 8417 ns 0.97
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8375 ns 8625 ns 0.97
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8375 ns 8145.5 ns 1.03
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 991485 ns 1004537.5 ns 0.99
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s) 1509000 ns 1503813 ns 1.00
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s) 1282542 ns 1243541.5 ns 1.03
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s) 1634916.5 ns 1631312.5 ns 1.00
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s) 2162000.5 ns 2004542 ns 1.08
lenet(28, 28, 1, 128)/forward/GPU/CUDA 271116.5 ns 280207 ns 0.97
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s) 7902209 ns 7912062.5 ns 1.00
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s) 6449312.5 ns 6650042 ns 0.97
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s) 7195708 ns 7185875 ns 1.00
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s) 10462229 ns 10076645.5 ns 1.04
lenet(28, 28, 1, 128)/zygote/GPU/CUDA 1752716.5 ns 1812720 ns 0.97
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 371187.5 ns 371770.5 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 374208 ns 359708 ns 1.04
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 461250 ns 457000 ns 1.01
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 22208 ns 27125 ns 0.82
batchedmm(128, Bsize=4)/forward/GPU/CUDA 42428.5 ns 47414 ns 0.89
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 745437.5 ns 728042 ns 1.02
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 815833 ns 792916 ns 1.03
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 1062958 ns 1060625 ns 1.00
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 117396 ns 122625 ns 0.96
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 283256.5 ns 280856 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 397208 ns 397666 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 288667 ns 213417 ns 1.35
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 287875 ns 288291 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 750917 ns 754041 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 43636 ns 44363 ns 0.98
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 667000 ns 669875 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 531375 ns 474875 ns 1.12
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 531417 ns 529792 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 974083 ns 975625 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 188745 ns 194646.5 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 644833 ns 678312.5 ns 0.95
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 648750 ns 642583 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 644479 ns 646625 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 652458.5 ns 638374.5 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 131347.5 ns 132515 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2445334 ns 2433792 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2500021 ns 2525125 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2463250 ns 2458416 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2463375 ns 2464167 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1238313 ns 1286025 ns 0.96
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 3417 ns 4270.5 ns 0.80
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 3625 ns 2791 ns 1.30
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 4250 ns 4334 ns 0.98
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 3437.5 ns 3021 ns 1.14
batchedmm(2, Bsize=32)/forward/GPU/CUDA 16066 ns 17018 ns 0.94
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 5375 ns 5583 ns 0.96
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 5292 ns 5542 ns 0.95
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 5750 ns 5500 ns 1.05
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 5583 ns 5584 ns 1.00
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 182995 ns 187936.5 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1458042 ns 1463042 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1499750 ns 1495875 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1503250 ns 1503458 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1437708 ns 1446334 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 40191 ns 41308.5 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5113291 ns 5127000 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5287958 ns 5300416.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5307041.5 ns 5293458 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4985125 ns 4725667 ns 1.05
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 196599 ns 195229 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3709 ns 3709 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3708 ns 3709 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3709 ns 3709 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3709 ns 3708 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 33557 ns 33264.5 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15125 ns 15250 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15167 ns 15083 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15416 ns 15417 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15208 ns 15125 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 349206 ns 350238 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 71125 ns 71333 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 71542 ns 71417 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 71209 ns 71208 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 71041 ns 71500 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 113114 ns 112408 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 317667 ns 318125 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 324125 ns 327584 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 318292 ns 319500 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 317625 ns 320333 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 193277 ns 194166 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 958 ns 1000 ns 0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 1041 ns 1084 ns 0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 1083 ns 1125 ns 0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 1125 ns 1000 ns 1.13
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 23048 ns 23803 ns 0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7750 ns 8000 ns 0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8270.5 ns 8417 ns 0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8250 ns 8417 ns 0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8041 ns 7708 ns 1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 245757.5 ns 246141 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 502770.5 ns 501979.5 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 484500 ns 480104 ns 1.01
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 561750 ns 566979 ns 0.99
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 219917 ns 220416 ns 1.00
batchedmm(128, Bsize=32)/forward/GPU/CUDA 129178 ns 128980 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 1387645.5 ns 1391667 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 1473958 ns 1479770.5 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 1779041.5 ns 1756604 ns 1.01
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 862917 ns 864792 ns 1.00
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 273950 ns 275170 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 333 ns 375 ns 0.89
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 334 ns 417 ns 0.80
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 416 ns 375 ns 1.11
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 333 ns 292 ns 1.14
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 31657.5 ns 31717 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6125 ns 6625 ns 0.92
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6208 ns 6542 ns 0.95
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6541 ns 6500 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6042 ns 5958 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 251419 ns 248251 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1733792 ns 1776021 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1721208 ns 1733687.5 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1724250 ns 1727458 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1773541 ns 1726125 ns 1.03
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 168671 ns 167904 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4114542 ns 4363208 ns 0.94
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4392834 ns 4382750 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4368208.5 ns 4374000 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4369208.5 ns 4367334 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1291475.5 ns 1079923 ns 1.20
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 6834 ns 6875 ns 0.99
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 6667 ns 6708 ns 0.99
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 7999.5 ns 6792 ns 1.18
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 7041 ns 6666 ns 1.06
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 20138.5 ns 19517 ns 1.03
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 51250 ns 59895.5 ns 0.86
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 32625 ns 49208 ns 0.66
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 73833 ns 52583 ns 1.40
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 51084 ns 32417 ns 1.58
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 340107 ns 267079.5 ns 1.27
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 17833 ns 18084 ns 0.99
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 18083 ns 18292 ns 0.99
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 18875 ns 19709 ns 0.96
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 18208 ns 18292 ns 1.00
batchedmm(2, Bsize=512)/forward/GPU/CUDA 18400 ns 18390 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 53250 ns 53833 ns 0.99
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 53041 ns 53375 ns 0.99
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 53375 ns 53375 ns 1
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 53542 ns 53625 ns 1.00
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 319083.5 ns 319120 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 75166 ns 75333 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 75625 ns 75583 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 75291.5 ns 75250 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 75083 ns 75500 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 47469 ns 46304 ns 1.03
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 324958 ns 324291 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 342000 ns 336479.5 ns 1.02
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 325000 ns 324708 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 324542 ns 327458 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 211595 ns 209708.5 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1484959 ns 1487583 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1526854.5 ns 1522083 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1527250 ns 1529334 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1462542 ns 1471333 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 51799 ns 52335 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5111083.5 ns 5126125 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5312417 ns 5305125 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5299333.5 ns 5295000 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4982354 ns 4684000 ns 1.06
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 204934 ns 202194.5 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 28208 ns 28333 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 28250 ns 28333 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 28187.5 ns 28292 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 28250 ns 28209 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 24742 ns 24238 ns 1.02
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 66500 ns 66500 ns 1
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 66709 ns 66250 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 66500 ns 66416 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 66541 ns 66625 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 484630.5 ns 495044 ns 0.98
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s) 1480583.5 ns 1478812 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s) 1136563 ns 933416.5 ns 1.22
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s) 1136750 ns 1129625 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s) 2265937.5 ns 2267917 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA 579622.5 ns 577563.5 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s) 3074562.5 ns 3095187.5 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s) 2788145.5 ns 2641125 ns 1.06
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s) 2743021 ns 2747417 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s) 3819500.5 ns 3815833.5 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA 1931643 ns 1965829 ns 0.98
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s) 7902458 ns 7798041 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s) 7834062.5 ns 8017625 ns 0.98
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s) 7920375 ns 7904083.5 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s) 4826312.5 ns 4861812 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 77625 ns 119833.5 ns 0.65
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 81167 ns 81604 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 84041.5 ns 82000 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 111396 ns 80604 ns 1.38
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 193746 ns 193857.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2012875 ns 2020000 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2046292 ns 2021083 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2031354 ns 2024292 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2015417 ns 1749917 ns 1.15
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 746361.5 ns 744082.5 ns 1.00

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.