Skip to content

Commit

Permalink
fix: mark kwargs in functor as leaf (#1085)
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal authored Nov 16, 2024
1 parent eecae90 commit 04494b5
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 7 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "Lux"
uuid = "b2108857-7c20-44ae-9111-449ecde12c47"
authors = ["Avik Pal <avikpal@mit.edu> and contributors"]
version = "1.3.0"
version = "1.3.1"

[deps]
ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
Expand Down
7 changes: 4 additions & 3 deletions docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ const dev = gpu_device()

:::

## Want XLA Support?
## Want Reactant (XLA) Support?

Install the following package:

Expand All @@ -134,13 +134,14 @@ using Pkg;
Pkg.add("Reactant")
```

Run the following to access a device:
Run the following to access a device (Reactant automatically selects the best backend by
default):

:::code-group

```julia [CPU Backend]
using Reactant, Lux
Reactant.set_default_backend("cpu") # default
Reactant.set_default_backend("cpu")

const dev = reactant_device()
```
Expand Down
22 changes: 19 additions & 3 deletions src/helpers/compact.jl
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ is useful when using it with SciML tools which require passing in the parameters
If you are passing in kwargs by splatting them, they will be passed as is to the function
body. This means that if your splatted kwargs contain a Lux layer, it won't be registered
in the CompactLuxLayer.
in the CompactLuxLayer. Additionally, all of the device functions treat these kwargs as
leaves.
## Special Syntax
Expand Down Expand Up @@ -314,7 +315,13 @@ function initialstates(rng::AbstractRNG, m::CompactLuxLayer)
initialstates(rng, m.layers)..., initialstates(rng, m.value_storage)...)
length(first(m.stored_kwargs)) == 0 && return base_states
return merge(
base_states, (; ₋₋₋kwargs₋₋₋=NamedTuple{m.stored_kwargs[1]}(m.stored_kwargs[2])))
base_states,
(;
₋₋₋kwargs₋₋₋=CompactMacroImpl.KwargsStorage(
NamedTuple{m.stored_kwargs[1]}(m.stored_kwargs[2])
)
)
)
end

function CompactLuxLayer(dispatch::StaticSymbol, f::F, name::NAME_TYPE,
Expand Down Expand Up @@ -419,6 +426,7 @@ module CompactMacroImpl
using ChainRulesCore: @non_differentiable
using ConcreteStructs: @concrete
using MacroTools: MacroTools, @capture, combinedef, splitdef
using Functors: Functors
using Random: AbstractRNG
using Static: static

Expand Down Expand Up @@ -517,7 +525,9 @@ function supportself(fex::Expr, vars, splatted_kwargs)
end
for var in splatted_kwargs
push!(calls,
:($var = $(safe_getproperty)(getproperty($st, :₋₋₋kwargs₋₋₋), $(Val(var)))))
:($var = $(safe_getproperty)(
getproperty(getproperty($st, :₋₋₋kwargs₋₋₋), :kws), $(Val(var))
)))
end
custom_param && push!(calls, :($(sdef[:args][2]) = $ps))

Expand Down Expand Up @@ -631,6 +641,12 @@ function LuxCore.initialstates(rng::AbstractRNG, v::ValueStorage)
for (n, fn) in pairs(v.st_init_fns)])
end

# Wrapper around the NamedTuple of splatted keyword arguments. `initialstates` stores an
# instance of this under the `₋₋₋kwargs₋₋₋` state entry, and the generated function body
# reads the individual kwargs back through the `kws` field.
@concrete struct KwargsStorage
kws <: NamedTuple
end

# Register the wrapper as a Functors leaf so that Functors-based traversals treat it as a
# single opaque value instead of recursing into the stored kwargs.
Functors.@leaf KwargsStorage

function kwarg_descriptor(val)
val isa NonTrainable && return "@non_trainable($(kwarg_descriptor(val.value)))"
val isa Number && return string(val)
Expand Down

3 comments on commit 04494b5

@avik-pal
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/119558

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v1.3.1 -m "<description of version>" 04494b5184f02ef4986334ca8987cfbfe014f4ab
git push origin v1.3.1

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lux Benchmarks

Benchmark suite Current: 04494b5 Previous: 0c45cf2 Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4125 ns 3917 ns 1.05
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 4375 ns 4125 ns 1.06
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 5084 ns 5292 ns 0.96
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3792 ns 3791.5 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 62298.5 ns 60493 ns 1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10709 ns 10125 ns 1.06
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 12042 ns 10125 ns 1.19
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 11458 ns 10042 ns 1.14
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10375 ns 10041 ns 1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 437714.5 ns 426840 ns 1.03
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1250 ns 1125 ns 1.11
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 1291 ns 1208 ns 1.07
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 1417 ns 1375 ns 1.03
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 1125 ns 1333 ns 0.84
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 18695.5 ns 18326.5 ns 1.02
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 4125 ns 4125 ns 1
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 4167 ns 4000 ns 1.04
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4375 ns 4250 ns 1.03
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 4041 ns 4000 ns 1.01
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 113193.5 ns 111007 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57458 ns 57208 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 38458 ns 46584 ns 0.83
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46208 ns 46875 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82584 ns 82834 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 38573 ns 37385 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2036958 ns 2030208 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2091833.5 ns 2090166 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2080833.5 ns 2097541.5 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2003542 ns 2024291 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 199791 ns 199762 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 144291.5 ns 151479.5 ns 0.95
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 142396 ns 143895.5 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 145709 ns 145166.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 144354 ns 147166 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 167203.5 ns 166256.5 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1119417 ns 1118083 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1142208 ns 1121604 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1106187.5 ns 1123333.5 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1118333 ns 1147792 ns 0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 538894 ns 530774 ns 1.02
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 3375 ns 3459 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4042 ns 3833 ns 1.05
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4500 ns 4167 ns 1.08
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 3062.5 ns 3625 ns 0.84
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 69914 ns 67815 ns 1.03
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8500 ns 8708 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9541 ns 9292 ns 1.03
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10791 ns 8708 ns 1.24
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8750 ns 8875 ns 0.99
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 508206.5 ns 492282.5 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 14270.5 ns 16020.5 ns 0.89
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 15937.5 ns 16166.5 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 17167 ns 16395.5 ns 1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 16416 ns 15395.5 ns 1.07
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 56662 ns 54331 ns 1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 213541.5 ns 225750 ns 0.95
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 215375 ns 214542 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 215500 ns 213708 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 221500 ns 214708 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 281033.5 ns 273073 ns 1.03
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 625 ns 542 ns 1.15
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 667 ns 667 ns 1
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 875 ns 750 ns 1.17
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 583 ns 667 ns 0.87
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 17768.5 ns 17433 ns 1.02
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1375 ns 1541 ns 0.89
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1584 ns 1542 ns 1.03
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1625 ns 1417 ns 1.15
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1584 ns 1625 ns 0.97
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 105523 ns 102453 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7125 ns 7209 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5250 ns 5917 ns 0.89
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6041 ns 5958 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9958 ns 10292 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 24475 ns 23456 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 232875 ns 229875 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 230687.5 ns 230687 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 228875 ns 230041 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 228291.5 ns 220833 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 172887 ns 170399.5 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 3917 ns 3875 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 3917 ns 3958 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 23927 ns 23777 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16584 ns 16625 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16875 ns 16708 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 17125 ns 16667 ns 1.03
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16625 ns 16459 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 163843.5 ns 162269 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 575875 ns 574500 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 602666 ns 568000 ns 1.06
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 574625 ns 572792 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 586875 ns 575667 ns 1.02
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 113393.5 ns 113429.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1420792 ns 1419458 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1449750 ns 1419209 ns 1.02
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1421833 ns 1414729.5 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 1420666 ns 1420875 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 214515 ns 211299.5 ns 1.02
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s) 1072291 ns 1072937.5 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s) 943646 ns 965417 ns 0.98
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s) 1355083 ns 1352709 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s) 1299500 ns 1268500 ns 1.02
lenet(28, 28, 1, 64)/forward/GPU/CUDA 279673 ns 273664 ns 1.02
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s) 5776625 ns 5908520.5 ns 0.98
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s) 4547375 ns 4453354 ns 1.02
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s) 4956667 ns 4968833.5 ns 1.00
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s) 5681291.5 ns 5709812.5 ns 1.00
lenet(28, 28, 1, 64)/zygote/GPU/CUDA 1100789 ns 1074376 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 500 ns 541 ns 0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 583 ns 500 ns 1.17
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 542 ns 542 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 541 ns 500 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 23781 ns 24117 ns 0.99
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2042 ns 2084 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2167 ns 2084 ns 1.04
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2208 ns 2209 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2125 ns 2084 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 174061 ns 175449 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 4000 ns 4041 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 4041 ns 3917 ns 1.03
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5041 ns 4833 ns 1.04
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 3520.5 ns 3458 ns 1.02
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 66653 ns 65395 ns 1.02
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 11042 ns 10709 ns 1.03
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11667 ns 11208 ns 1.04
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11917 ns 11834 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 11333 ns 11084 ns 1.02
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 459089 ns 450961.5 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5625 ns 7979.5 ns 0.70
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7583 ns 6250 ns 1.21
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7500 ns 7979.5 ns 0.94
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6167 ns 6292 ns 0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 53360 ns 52467 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 16542 ns 17291 ns 0.96
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 17209 ns 17375 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 18333 ns 18875 ns 0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 16791 ns 16667 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 310536.5 ns 305695 ns 1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 542 ns 542 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 625 ns 542 ns 1.15
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 584 ns 666 ns 0.88
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 583 ns 542 ns 1.08
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 33641 ns 32659 ns 1.03
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 8542 ns 8584 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9042 ns 9000 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9312.5 ns 9000 ns 1.03
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 8709 ns 8542 ns 1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 163903 ns 159178.5 ns 1.03
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 64708 ns 64459 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 64375 ns 64666 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 64584 ns 64417 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 64541 ns 64500 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 111608.5 ns 111598.5 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 279917 ns 289312.5 ns 0.97
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 284542 ns 277542 ns 1.03
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 283917 ns 289083 ns 0.98
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 282500 ns 289417 ns 0.98
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 187507 ns 185068 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s) 3287833.5 ns 3359958.5 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s) 2780208 ns 3026438 ns 0.92
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s) 3046979 ns 3022125 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s) 4045333 ns 3951146 ns 1.02
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA 571508 ns 587969 ns 0.97
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s) 7619500 ns 7494500 ns 1.02
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s) 7346729 ns 7453875 ns 0.99
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s) 7476208 ns 7451896 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s) 8208937.5 ns 8244416.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA 1337153 ns 1382663 ns 0.97
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s) 18827750 ns 18772542 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s) 19152375 ns 19139125 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s) 19142542 ns 19128542 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s) 15656917 ns 16197250 ns 0.97
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23640000 ns 23953437.5 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 43637146 ns 34373209 ns 1.27
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 37268333.5 ns 37031750 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 34800709 ns 35339917 ns 0.98
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1857649.5 ns 1848447 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 189644917 ns 188047583 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 178178021 ns 164639729 ns 1.08
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 153631625 ns 152806417 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 441917583 ns 448441291 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 13878768 ns 13907488 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 290268250 ns 289867458 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 356472291 ns 338595687.5 ns 1.05
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 297083542 ns 299072917 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 333841541 ns 413224792 ns 0.81
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 21645.5 ns 21917 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 23083.5 ns 22958 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 25417 ns 24791 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 22458 ns 22459 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 98111.5 ns 99382.5 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 103708 ns 104062.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 104645.5 ns 103833 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 104833 ns 103792 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 103709 ns 117021 ns 0.89
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 513446 ns 522084 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5895.5 ns 5875 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6292 ns 6500 ns 0.97
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7042 ns 6854.5 ns 1.03
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 5792 ns 6208 ns 0.93
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 69892 ns 70401.5 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14708 ns 14875 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 16333 ns 15520.5 ns 1.05
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 16375 ns 16000 ns 1.02
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14895.5 ns 15083 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 489627 ns 486902.5 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 2830166.5 ns 3019896 ns 0.94
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2097041.5 ns 2093333 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2268875 ns 2249333 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4810500 ns 4929333 ns 0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA 591815.5 ns 589363 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 23521125 ns 23557417 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 18356854 ns 18041937 ns 1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 16883187.5 ns 16958375 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 35797333.5 ns 36564937.5 ns 0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 3103730 ns 3109138 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 33291833 ns 33331458.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 28074625 ns 27714666.5 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 27392042 ns 27590000 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 41001146 ns 42335041.5 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 71583 ns 74208 ns 0.96
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 80375 ns 73500 ns 1.09
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 74479 ns 76208.5 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 72333.5 ns 73292 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 103944 ns 106173 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 297959 ns 257312.5 ns 1.16
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 216229 ns 207375 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 209917 ns 208750 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 272250 ns 224458.5 ns 1.21
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 558503.5 ns 569917.5 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11792 ns 11500 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 11833 ns 12125 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 13167 ns 12625 ns 1.04
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 11416 ns 12333 ns 0.93
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 74112 ns 73676.5 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 26250 ns 25958 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 27833 ns 26770.5 ns 1.04
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 27625 ns 27542 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26917 ns 26959 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 486928.5 ns 486341.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11791.5 ns 12834 ns 0.92
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 12875 ns 12291 ns 1.05
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 13791.5 ns 14916 ns 0.92
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 12437.5 ns 12458 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 53830 ns 54897 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 25417 ns 26917 ns 0.94
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 26625 ns 25583 ns 1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 26458 ns 26500 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26333 ns 26334 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 310408.5 ns 315364 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 180916 ns 181625 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 180479.5 ns 182687.5 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 184042 ns 180959 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 181875 ns 180334 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 58334 ns 58698 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 582875 ns 630333 ns 0.92
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 584292 ns 588125 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 584875 ns 585125 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 583771 ns 615042 ns 0.95
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 295362 ns 295850.5 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5958 ns 6166.5 ns 0.97
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6042 ns 6000 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7000 ns 6958 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 5875 ns 6458 ns 0.91
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 73116.5 ns 72786 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 13792 ns 14375 ns 0.96
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 15417 ns 14458 ns 1.07
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15417 ns 15500 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14458 ns 14459 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 473681.5 ns 474917.5 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 1172041 ns 1242458 ns 0.94
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 1199583 ns 1726750 ns 0.69
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 1286750 ns 1284250 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 1323125 ns 1304916.5 ns 1.01
batchedmm(512, Bsize=4)/forward/GPU/CUDA 301801 ns 301368.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 4107916 ns 4121291.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 4485521 ns 4347458 ns 1.03
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 4485834 ns 4658625 ns 0.96
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 4442521 ns 4651583.5 ns 0.96
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1036926.5 ns 1044741 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1792 ns 1875 ns 0.96
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1875 ns 1792 ns 1.05
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1834 ns 1917 ns 0.96
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1875 ns 1875 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 23428 ns 24235 ns 0.97
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4833 ns 4875 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4917 ns 4833 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 5000 ns 4958 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4917 ns 4875 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 189745 ns 195530.5 ns 0.97
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5395.5 ns 6104.5 ns 0.88
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6500 ns 5667 ns 1.15
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6583 ns 6875 ns 0.96
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5708 ns 5750 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 55733.5 ns 57282.5 ns 0.97
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10750 ns 10979.5 ns 0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 11917 ns 10917 ns 1.09
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 11916 ns 12042 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10750 ns 10792 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 340606.5 ns 344261.5 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 333 ns 375 ns 0.89
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 375 ns 292 ns 1.28
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 334 ns 333 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 375 ns 334 ns 1.12
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 22887 ns 23845 ns 0.96
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2792 ns 2708 ns 1.03
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 3083 ns 2709 ns 1.14
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 3084 ns 2750 ns 1.12
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2791 ns 2750 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 158761 ns 164126.5 ns 0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 10646 ns 11917 ns 0.89
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 11750 ns 11167 ns 1.05
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 12791 ns 12459 ns 1.03
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 11833 ns 11417 ns 1.04
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 57602 ns 58669 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 24416 ns 25000 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 25250 ns 24500 ns 1.03
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 24875 ns 24708 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 25167 ns 25083 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 298284.5 ns 304559.5 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4166 ns 4208 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4250 ns 4167 ns 1.02
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4250 ns 4167 ns 1.02
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4208 ns 4250 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 24817 ns 25840 ns 0.96
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16125 ns 16250 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16166 ns 16208 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16458 ns 16208 ns 1.02
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16125 ns 16084 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 198823 ns 205670.5 ns 0.97
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5750 ns 5875 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 5834 ns 5792 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5875 ns 5834 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5875 ns 5792 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 34246 ns 34552 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 20520.5 ns 20833 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 21209 ns 20791 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 22729 ns 21666 ns 1.05
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 21041 ns 20709 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 178397 ns 181576.5 ns 0.98
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 404667 ns 399208 ns 1.01
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 354625 ns 372584 ns 0.95
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 489104 ns 483833 ns 1.01
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 525646 ns 506583 ns 1.04
batchedmm(16, Bsize=512)/forward/GPU/CUDA 66601 ns 67542.5 ns 0.99
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 953375 ns 1007042 ns 0.95
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 892417 ns 884958.5 ns 1.01
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 1238812.5 ns 1232166.5 ns 1.01
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 1399167 ns 1433709 ns 0.98
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 191860 ns 193226.5 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 83500 ns 78250 ns 1.07
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 87604 ns 81292 ns 1.08
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 82958 ns 81084 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 131084 ns 82584 ns 1.59
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 193332.5 ns 194079.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1915875 ns 1922417 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1936354 ns 1920833 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1926333 ns 1926562 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1922000 ns 1930104 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 409092 ns 412262 ns 0.99
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 292 ns 333 ns 0.88
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 333 ns 292 ns 1.14
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 333 ns 0.88
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 333 ns 292 ns 1.14
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 22200 ns 22781 ns 0.97
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1833 ns 1833 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1875 ns 1792 ns 1.05
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1875 ns 1875 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1834 ns 1833 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 173011 ns 177219.5 ns 0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5542 ns 6958.5 ns 0.80
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6708 ns 6250 ns 1.07
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7709 ns 7354.5 ns 1.05
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6750 ns 6604.5 ns 1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 61904.5 ns 63079.5 ns 0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8833 ns 9250 ns 0.95
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9583 ns 8958 ns 1.07
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9125 ns 9250 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9167 ns 9500 ns 0.96
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 320204.5 ns 323232 ns 0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 121310646 ns 120001354.5 ns 1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 181760542 ns 173860959 ns 1.05
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 147955208 ns 147799416 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 107047750 ns 105257459 ns 1.02
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5473575 ns 5487976 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 616325250 ns 617199083.5 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 579539584 ns 555347958 ns 1.04
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 452436916.5 ns 452797563 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 761604583 ns 772493125 ns 0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 34930079 ns 34928705 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 649842708 ns 649523250 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 687832188 ns 666577687.5 ns 1.03
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 589947958 ns 586637666.5 ns 1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 745852459 ns 745838709 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58833 ns 59333 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 38917 ns 47459 ns 0.82
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47791 ns 48167 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83458 ns 83542 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 37730 ns 38875 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1929812 ns 1923646.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1979208.5 ns 1977041 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1979375 ns 1985354.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1901667 ns 1902208 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 175008.5 ns 178774.5 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 265750 ns 267625 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 272708 ns 266833.5 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 277250 ns 269833 ns 1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 264875 ns 268750 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 141169 ns 135794.5 ns 1.04
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 664708 ns 650000 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 673604.5 ns 693729 ns 0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 596521 ns 588833 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 584791 ns 671104.5 ns 0.87
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 740154.5 ns 736846.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2239000 ns 2216292 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2243541.5 ns 2247708 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2186042 ns 2179083 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2207334 ns 2228833.5 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 133565 ns 134998 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5494667 ns 5489729.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5564917 ns 5488416.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5526500 ns 5510083.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5510208 ns 5509500 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 772189 ns 772298.5 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 643917 ns 644667 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 648042 ns 636750 ns 1.02
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 643750 ns 643625 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 644167 ns 664292 ns 0.97
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 46665 ns 46667 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1820250 ns 1824500 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1693229.5 ns 1724500 ns 0.98
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1723895.5 ns 1725291 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 2104875 ns 2103083 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 221733 ns 222112 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58500 ns 58292 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 38750 ns 47416 ns 0.82
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46250 ns 47229.5 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 84416 ns 84250 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 28603 ns 28385 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2028417 ns 2028896 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2096541 ns 2089125 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2094542 ns 2095958 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2003250 ns 2000333 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 190253.5 ns 190029 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 13359542 ns 13380666.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 12463437.5 ns 12446458.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 12529500 ns 12502375.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 14818083.5 ns 15323021 ns 0.97
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 513933 ns 514892.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 47308250 ns 47308500 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 41855187.5 ns 41876750 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 41092584 ns 40911521 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 58335625 ns 59068458 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 3235135 ns 3251312.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 73575437.5 ns 74382250 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 91455709 ns 67968458 ns 1.35
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 90846333 ns 90502167 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 76471792 ns 99974645.5 ns 0.76
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58833 ns 58709 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 38834 ns 47416 ns 0.82
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47125 ns 47542 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 84250 ns 84750 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 48573 ns 48017 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1920417 ns 1919167 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1968896 ns 1967646 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1976187.5 ns 1985146.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1889896 ns 1905917 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 198559.5 ns 198479.5 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 292 ns 292 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 417 ns 333 ns 1.25
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 416 ns 0.90
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 292 ns 333 ns 0.88
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 33508 ns 33267 ns 1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6041 ns 6166 ns 0.98
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6500 ns 6166 ns 1.05
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6500 ns 6583 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6375 ns 6125 ns 1.04
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 183482.5 ns 181033.5 ns 1.01
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 292 ns 250 ns 1.17
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 333 ns 250 ns 1.33
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 32079 ns 32220 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 2584 ns 2583 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 3000 ns 2584 ns 1.16
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 2958 ns 2833 ns 1.04
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 2625 ns 2584 ns 1.02
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 170409 ns 169790 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 287317291.5 ns 285598395.5 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 346633417 ns 340428166.5 ns 1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 314215396 ns 314514604.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 269592292 ns 271854666 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 7121953.5 ns 7112648 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 999266459 ns 998529958 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 960016625 ns 938308750 ns 1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 852780958.5 ns 856850979 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1161564542 ns 1172841333 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 34079762.5 ns 33924275.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1308969708.5 ns 1309270416.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 1689204000 ns 1342306792 ns 1.26
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1642484833 ns 1639996875 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 1298514771 ns 1671556167 ns 0.78
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1461645.5 ns 1461166.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1422250 ns 1458417 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1414000.5 ns 1416916.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1410541 ns 1463625 ns 0.96
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 128615.5 ns 128327 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5018812.5 ns 5018562.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5052458 ns 5017583.5 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5042958 ns 5035291 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5025792 ns 5028709 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 515506.5 ns 596055.5 ns 0.86
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s) 175005812.5 ns 175022104 ns 1.00
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s) 179670270.5 ns 129880000 ns 1.38
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s) 129962333.5 ns 128343604 ns 1.01
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s) 155707333.5 ns 159345417 ns 0.98
vgg16(32, 32, 3, 32)/forward/GPU/CUDA 4857330 ns 4880680 ns 1.00
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s) 669593458 ns 661717500 ns 1.01
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s) 553990208 ns 491987125 ns 1.13
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s) 519026625 ns 484849458 ns 1.07
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s) 676320542 ns 694008417 ns 0.97
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA 16003946 ns 15696658 ns 1.02
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 8934291 ns 8921021 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 8826020.5 ns 8820666.5 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 7895083 ns 7856604 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 10163542 ns 10334062.5 ns 0.98
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1609035 ns 1590128 ns 1.01
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 36061791 ns 36048792 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 37793833 ns 36944812.5 ns 1.02
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 33277167 ns 33324416 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 39115875 ns 40001167 ns 0.98
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6457436 ns 6454559 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 47333 ns 47542 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 47459 ns 47333 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 47750 ns 47583 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 47541 ns 47625 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 18918 ns 18400.5 ns 1.03
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 52750 ns 50292 ns 1.05
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 50250 ns 50042 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 52958 ns 52833 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 50458 ns 50500 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 201180.5 ns 213323 ns 0.94
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6041 ns 7125 ns 0.85
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 7125 ns 6334 ns 1.12
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 8354.5 ns 7709 ns 1.08
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6708 ns 6833 ns 0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 95394.5 ns 101491.5 ns 0.94
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9709 ns 9709 ns 1
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10209 ns 9584 ns 1.07
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10333 ns 10292 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10291.5 ns 10208 ns 1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 558425.5 ns 591948.5 ns 0.94
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5041 ns 6792 ns 0.74
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6312.5 ns 5666.5 ns 1.11
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 6792 ns 6979.5 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5687.5 ns 6729 ns 0.85
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 129167.5 ns 157875.5 ns 0.82
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12895.5 ns 13000 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13459 ns 12917 ns 1.04
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13458 ns 13250 ns 1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 13125 ns 13167 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 520805 ns 601632 ns 0.87
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 1000 ns 1083 ns 0.92
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 1125 ns 1000 ns 1.13
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 1125 ns 1042 ns 1.08
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 1083 ns 1042 ns 1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 33824 ns 33030 ns 1.02
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7625 ns 7959 ns 0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8209 ns 7834 ns 1.05
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8167 ns 7958 ns 1.03
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8333 ns 7917 ns 1.05
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 212223 ns 230449 ns 0.92
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 23333 ns 23375 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 23250 ns 23208 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 23708 ns 23417 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 23417 ns 23291.5 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 18851 ns 19113 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 52229.5 ns 52895.5 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 52833 ns 52125 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 53083 ns 52584 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 52250 ns 52625 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 290323.5 ns 342803.5 ns 0.85
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1444563 ns 1397500 ns 1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1416667 ns 1398958.5 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1398000 ns 1397959 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1397854.5 ns 1398875 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 196725 ns 197104 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4896875 ns 5023770.5 ns 0.97
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5033250 ns 5008292 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4732209 ns 5019833 ns 0.94
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4656125 ns 5023375 ns 0.93
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 628203 ns 702077 ns 0.89
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3050917 ns 3038750 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2085917 ns 2100458.5 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2299229 ns 2297437.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4546416 ns 4584063 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 583838 ns 584858 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 24394208 ns 24409833.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 19011416 ns 18883709 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 18859500 ns 18947166 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 36642625 ns 37250541.5 ns 0.98
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 3197291 ns 3224554 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 34077084 ns 34090937.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 28833000 ns 28380792 ns 1.02
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 27989458.5 ns 28106292 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 41738687 ns 42177791.5 ns 0.99
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 144879458 ns 143784584 ns 1.01
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 141694833 ns 141596854 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 124678750 ns 124395791.5 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 172183417 ns 175754958 ns 0.98
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22781734 ns 22549895 ns 1.01
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 1324126083.5 ns 984048979.5 ns 1.35
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 862326062 ns 959988750 ns 0.90
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 822435542 ns 915312208 ns 0.90
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 674446500 ns 686199250 ns 0.98
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 117954250 ns 118476719 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 72708 ns 73916.5 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 73458.5 ns 73959 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 78666 ns 77458 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 80771 ns 76083 ns 1.06
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 247050 ns 297349 ns 0.83
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 238083 ns 191541.5 ns 1.24
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 259458.5 ns 190083.5 ns 1.36
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 192500 ns 193375 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 286542 ns 205187.5 ns 1.40
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1278131.5 ns 1506117 ns 0.85
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 35505542 ns 35382375 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 35779583 ns 35433270.5 ns 1.01
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 32132583.5 ns 32211792 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 41038625 ns 41320000 ns 0.99
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5842363 ns 5840191 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 148318166 ns 146474167 ns 1.01
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 156700916.5 ns 152518104 ns 1.03
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 134987916 ns 136637000 ns 0.99
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 287669583 ns 228362188 ns 1.26
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 34854720.5 ns 34868018 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 121572750 ns 121348250 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 181182167 ns 174167875 ns 1.04
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 147944417 ns 147844437.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 105962708.5 ns 109402458.5 ns 0.97
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5473146 ns 5468413 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 473565583 ns 470104666 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 483250896 ns 467279541 ns 1.03
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 442160333 ns 440692042 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 742736375 ns 756630062.5 ns 0.98
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 32265706.5 ns 32246487 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 709724354 ns 708913479.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 670668250 ns 654076083.5 ns 1.03
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 576699666.5 ns 576331875 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 846352792 ns 868264500 ns 0.97
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s) 1306395.5 ns 1341333 ns 0.97
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s) 761604 ns 969333 ns 0.79
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s) 909459 ns 905958 ns 1.00
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s) 2049500 ns 2085854 ns 0.98
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA 564019 ns 569576 ns 0.99
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s) 2968812.5 ns 2971041 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s) 2494708 ns 2591478.5 ns 0.96
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s) 2620270.5 ns 2624375 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s) 3705625.5 ns 3763333 ns 0.98
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA 1753312 ns 1911630 ns 0.92
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s) 6649125 ns 6646062.5 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s) 6469521 ns 6511333.5 ns 0.99
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s) 6522166.5 ns 6212750 ns 1.05
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s) 4446500 ns 4512000 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7292 ns 7417 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5334 ns 5958 ns 0.90
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6125 ns 6209 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10042 ns 10334 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 25527 ns 25303 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 212458 ns 212916 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 220833 ns 220187.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221896 ns 220125 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 218229.5 ns 206750 ns 1.06
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 256834.5 ns 309128.5 ns 0.83
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s) 302398208 ns 301567020.5 ns 1.00
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s) 280301750 ns 220519459 ns 1.27
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s) 195492125 ns 195586458 ns 1.00
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s) 312069834 ns 308649729.5 ns 1.01
vgg16(32, 32, 3, 64)/forward/GPU/CUDA 7871517 ns 7785127 ns 1.01
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s) 1078719979.5 ns 1082593062 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s) 989013042 ns 897538125 ns 1.10
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s) 814818166 ns 875339084 ns 0.93
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s) 1156526521 ns 1186516209 ns 0.97
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA 26511423 ns 26500341 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5083 ns 5979.5 ns 0.85
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5750 ns 5709 ns 1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6333 ns 6333 ns 1
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5292 ns 5958 ns 0.89
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 148964 ns 202987.5 ns 0.73
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7292 ns 7750 ns 0.94
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7583 ns 6937.5 ns 1.09
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7395.5 ns 7166 ns 1.03
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7583.5 ns 7208 ns 1.05
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 625721.5 ns 715249.5 ns 0.87
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 542 ns 625 ns 0.87
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 625 ns 541 ns 1.16
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 750 ns 792 ns 0.95
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 583 ns 541 ns 1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 24037 ns 24024 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 8792 ns 9417 ns 0.93
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9416 ns 8875 ns 1.06
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 9625 ns 9375 ns 1.03
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9042 ns 8917 ns 1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 217991.5 ns 236297 ns 0.92
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 351083.5 ns 352166.5 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 351833 ns 352000 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 352313 ns 352625 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 352104.5 ns 356354.5 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 21184.5 ns 21208 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 778000 ns 821042 ns 0.95
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 782396 ns 774875 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 808562.5 ns 774479.5 ns 1.04
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 816583 ns 784125 ns 1.04
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 268217.5 ns 305684 ns 0.88
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 336938 ns 329917 ns 1.02
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 314604 ns 340958 ns 0.92
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 454458 ns 452250 ns 1.00
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 332020.5 ns 310020.5 ns 1.07
batchedmm(16, Bsize=32)/forward/GPU/CUDA 18180 ns 18040 ns 1.01
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 681937.5 ns 694333 ns 0.98
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 738625 ns 741979.5 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 1029500 ns 1031791.5 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 696625 ns 699708 ns 1.00
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 249957.5 ns 288808 ns 0.87
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 354229 ns 346250 ns 1.02
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 327229.5 ns 346812.5 ns 0.94
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 418250 ns 414166 ns 1.01
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 369104.5 ns 354500.5 ns 1.04
batchedmm(16, Bsize=128)/forward/GPU/CUDA 22561 ns 22617 ns 1.00
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 745292 ns 756959 ns 0.98
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 750916 ns 745375 ns 1.01
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 1075584 ns 1075625 ns 1.00
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 825833 ns 831312.5 ns 0.99
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 217990.5 ns 224885 ns 0.97
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 3292 ns 3667 ns 0.90
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 3625 ns 3542 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 3875 ns 3708 ns 1.05
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 3583 ns 3687.5 ns 0.97
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 17937 ns 17766 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 4334 ns 4167 ns 1.04
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 4458 ns 4166 ns 1.07
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 4417 ns 4250 ns 1.04
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 4208 ns 4292 ns 0.98
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 264329 ns 276086.5 ns 0.96
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 3667 ns 3792 ns 0.97
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 3875 ns 3625 ns 1.07
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4625 ns 4208 ns 1.10
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 3750 ns 3833.5 ns 0.98
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 193740 ns 212882.5 ns 0.91
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8208 ns 8250 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8709 ns 8083 ns 1.08
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8750 ns 8750 ns 1
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8583.5 ns 8375 ns 1.02
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 1184156 ns 1204651 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 203500 ns 204125 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 208750 ns 210208 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 211167 ns 211541 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 201083 ns 200916 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 35565 ns 35493 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 645875 ns 644375 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 623750 ns 622542 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 635250 ns 621416 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 592646 ns 633021 ns 0.94
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 327199.5 ns 345266 ns 0.95
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 959708.5 ns 960666 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 938042 ns 934042 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 954875 ns 962459 ns 0.99
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 1293708 ns 1306000 ns 0.99
batchedmm(128, Bsize=128)/forward/GPU/CUDA 206955 ns 205400 ns 1.01
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 4483167 ns 4490416.5 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 4618833 ns 4462042 ns 1.04
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 4317645.5 ns 4294020.5 ns 1.01
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 6243708.5 ns 6374063 ns 0.98
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 961553 ns 948468 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3750 ns 3250 ns 1.15
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 3375 ns 3375 ns 1
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 4375 ns 4250 ns 1.03
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3708 ns 3520.5 ns 1.05
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 193389 ns 233579.5 ns 0.83
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7208 ns 7458 ns 0.97
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7917 ns 6959 ns 1.14
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7667 ns 7375 ns 1.04
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7334 ns 6834 ns 1.07
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 1009480 ns 1006195 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1631000 ns 1636208 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1166042 ns 1184104 ns 0.98
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1365625 ns 1372937.5 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2367458 ns 2438792 ns 0.97
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA 215455 ns 214646 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12301250 ns 12368625 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9628833 ns 9576084 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9292104.5 ns 9277625 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18003312.5 ns 18160625 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1946489 ns 1950393 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17321917 ns 17396666.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14455625 ns 14353333 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14321416.5 ns 14318812 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21161000 ns 21205104.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 116959 ns 89646 ns 1.30
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 90000 ns 87125 ns 1.03
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 92500 ns 91416.5 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 89667 ns 89833.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 126257.5 ns 126102.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1952542 ns 2028271 ns 0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2033584 ns 2016292 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2036458 ns 1722416.5 ns 1.18
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2026500 ns 2034291.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1082649.5 ns 1030027.5 ns 1.05
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 342854.5 ns 337333 ns 1.02
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 324042 ns 348770.5 ns 0.93
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 395834 ns 398395.5 ns 0.99
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 311645.5 ns 291854 ns 1.07
batchedmm(2, Bsize=4)/forward/GPU/CUDA 16225 ns 16026 ns 1.01
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 698292 ns 701687.5 ns 1.00
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 727291 ns 738854 ns 0.98
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 1022499.5 ns 1026479 ns 1.00
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 655000 ns 659000 ns 0.99
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 195381.5 ns 192241.5 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7167 ns 7000 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5292 ns 5875 ns 0.90
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6042 ns 6000 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10084 ns 10166 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 34236 ns 34083 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 212375 ns 212666.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 221500 ns 222291.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 233708 ns 219708 ns 1.06
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 217250 ns 214959 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 351634.5 ns 310410 ns 1.13
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3667 ns 3750 ns 0.98
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3750 ns 3708 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3708 ns 3667 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3708 ns 3708 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 22712 ns 22765 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14417 ns 14417 ns 1
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14209 ns 14333 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14416 ns 14375 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14375 ns 14250 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 483403.5 ns 477947.5 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 136416 ns 92104.5 ns 1.48
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 93292 ns 92791.5 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 97979.5 ns 93458.5 ns 1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 138291 ns 94479 ns 1.46
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 125690 ns 125492 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1920333 ns 1915542 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1941542 ns 1931312.5 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1928708 ns 1652875 ns 1.17
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1924833 ns 1932812.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1091617.5 ns 963079 ns 1.13
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s) 873125 ns 868062.5 ns 1.01
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s) 800083 ns 820062.5 ns 0.98
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s) 1213708 ns 1224562.5 ns 0.99
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s) 955958 ns 939937.5 ns 1.02
lenet(28, 28, 1, 32)/forward/GPU/CUDA 275553.5 ns 272869 ns 1.01
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s) 2789084 ns 2818583 ns 0.99
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s) 2533416.5 ns 2448750 ns 1.03
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s) 3351166.5 ns 3349041.5 ns 1.00
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s) 3416521 ns 3429667 ns 1.00
lenet(28, 28, 1, 32)/zygote/GPU/CUDA 1671794 ns 1623362 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 14250 ns 15416 ns 0.92
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 16229 ns 16479 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 18500 ns 18042 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 16666 ns 15416 ns 1.08
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 176658 ns 142624 ns 1.24
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 256791 ns 261396 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 216875 ns 216208 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 216666.5 ns 216500 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 216917 ns 259938 ns 0.83
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 714190.5 ns 645742 ns 1.11
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 218709 ns 221958 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 221500 ns 221521 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 222167 ns 222250 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 219000 ns 219959 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 284510.5 ns 270517 ns 1.05
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 494666 ns 557249.5 ns 0.89
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 498749.5 ns 495479.5 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 497250 ns 498021 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 560875 ns 511625 ns 1.10
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1421507 ns 1384352 ns 1.03
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 328417 ns 329292 ns 1.00
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 311917 ns 332479 ns 0.94
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 389416 ns 373062 ns 1.04
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 322229 ns 302292 ns 1.07
batchedmm(16, Bsize=4)/forward/GPU/CUDA 16925.5 ns 16837 ns 1.01
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 710750 ns 712062.5 ns 1.00
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 728271 ns 736750 ns 0.99
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 1020583 ns 1027458 ns 0.99
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 666375 ns 669125 ns 1.00
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 195423 ns 196687 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 15854 ns 18083 ns 0.88
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 18187.5 ns 19417 ns 0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 19791.5 ns 19812.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17125 ns 18125 ns 0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 176945 ns 146545.5 ns 1.21
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 212208 ns 219125 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 217125 ns 216937.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 224729 ns 215041.5 ns 1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 221750 ns 212667 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1030393 ns 944620 ns 1.09
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 4250 ns 4375 ns 0.97
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 4625 ns 3833 ns 1.21
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 4958.5 ns 5000 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 3833.5 ns 4375 ns 0.88
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 242789 ns 215167 ns 1.13
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10020.5 ns 10583 ns 0.95
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10750 ns 9875 ns 1.09
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10834 ns 10917 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10354 ns 10125 ns 1.02
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 1096605.5 ns 1064041.5 ns 1.03
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3292 ns 3208 ns 1.03
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 3375 ns 3083 ns 1.09
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 3854.5 ns 4083 ns 0.94
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3042 ns 3625 ns 0.84
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 249519 ns 238602 ns 1.05
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7291 ns 7125 ns 1.02
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7500 ns 7292 ns 1.03
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7666 ns 7500 ns 1.02
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7417 ns 7625 ns 0.97
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 1104412 ns 1072124 ns 1.03
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23491541 ns 23510042 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 43048000 ns 35239042 ns 1.22
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 37820875 ns 37521895.5 ns 1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 34890750 ns 35273916 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1856324 ns 1835321 ns 1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 184204375 ns 185664541 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 172049312.5 ns 160177375 ns 1.07
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 146204354.5 ns 146706312.5 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 414413041 ns 422527708.5 ns 0.98
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 16518831.5 ns 16512466 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 427089542 ns 425858208 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 257885125 ns 253069291.5 ns 1.02
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 232435333 ns 231211500 ns 1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 487128167 ns 494264646 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 181770.5 ns 182792 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 183750 ns 184542 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 185667 ns 185187.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 182458 ns 183709 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 230246 ns 212062 ns 1.09
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 589417 ns 623709 ns 0.95
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 588333.5 ns 585625 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 596874.5 ns 587750 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 597604 ns 635312.5 ns 0.94
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1117454.5 ns 1070840 ns 1.04
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 3826354 ns 3842250 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 3667353.5 ns 3651042 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 3512500 ns 3490583 ns 1.01
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 5363750 ns 5452395.5 ns 0.98
batchedmm(128, Bsize=512)/forward/GPU/CUDA 537508 ns 531221 ns 1.01
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 17313854 ns 17413417 ns 0.99
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 17708458.5 ns 17282104.5 ns 1.02
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 16567146 ns 16562771 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 22157916.5 ns 23195250 ns 0.96
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2616426 ns 2624623.5 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 541 ns 625 ns 0.87
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 625 ns 542 ns 1.15
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 708 ns 0.88
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 583 ns 542 ns 1.08
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 32262 ns 33132 ns 0.97
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9084 ns 9333 ns 0.97
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9500 ns 8687.5 ns 1.09
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9459 ns 9958 ns 0.95
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9291 ns 9104 ns 1.02
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 266752 ns 269468.5 ns 0.99
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s) 501606583 ns 502544125 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s) 504711792 ns 428545375 ns 1.18
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s) 433731812.5 ns 370375396 ns 1.17
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s) 675887813 ns 676271583.5 ns 1.00
vgg16(32, 32, 3, 128)/forward/GPU/CUDA 12472195 ns 12479257.5 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s) 2045113375 ns 2046192292 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s) 1661340625 ns 1629661125 ns 1.02
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s) 1497713333.5 ns 1491097083.5 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s) 2221091187.5 ns 2229658271 ns 1.00
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA 49072522.5 ns 49368782 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1634250 ns 1646083 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1163708 ns 1197833 ns 0.97
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1361458 ns 1360625 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2497000 ns 2466020.5 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 215136 ns 218218 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12684958 ns 12725062 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 10006667 ns 9942708 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9642583 ns 9678458.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18382166 ns 18472437.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 2044162.5 ns 2043078 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17670958.5 ns 17708209 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14802250 ns 14671208.5 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14555750 ns 14589541.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21305750 ns 21579542 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 26167 ns 26250 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 26417 ns 26208 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 26250 ns 26291 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 26209 ns 26208 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 23856 ns 24352 ns 0.98
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 66875 ns 67041 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 67209 ns 66792 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 67125 ns 67125 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 66750 ns 66584 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 405103 ns 402288.5 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 203416 ns 203166 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 209125 ns 209292 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 209584 ns 210166 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 200667 ns 199791 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 27000 ns 27685 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 602375 ns 609250 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 669125 ns 622062.5 ns 1.08
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 622834 ns 630875 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 631125 ns 631917 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 352778 ns 357035 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 648917 ns 538250 ns 1.21
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 642917 ns 641250 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 544167 ns 600459 ns 0.91
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 639083 ns 670041 ns 0.95
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 131841.5 ns 132946 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2236583 ns 2237333.5 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2283750 ns 2232437 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1492958 ns 2242500 ns 0.67
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2238375 ns 2254875 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1252532 ns 1187953 ns 1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 16375 ns 18937 ns 0.86
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 18208.5 ns 18812.5 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 19292 ns 19708 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 16333 ns 19000 ns 0.86
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 146257.5 ns 147397 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 225875 ns 218791.5 ns 1.03
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 254708.5 ns 221416 ns 1.15
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 220145.5 ns 229084 ns 0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 261833 ns 260604.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1134332.5 ns 1013651.5 ns 1.12
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 542 ns 584 ns 0.93
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 666 ns 542 ns 1.23
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 667 ns 709 ns 0.94
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 583 ns 583 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 23621 ns 23882 ns 0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9667 ns 10125 ns 0.95
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 10375 ns 9583 ns 1.08
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 10084 ns 9750 ns 1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9542 ns 9583 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 259757 ns 264213 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4958 ns 5708 ns 0.87
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5750 ns 5084 ns 1.13
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6167 ns 6875 ns 0.90
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5500 ns 6125 ns 0.90
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 236718.5 ns 236698 ns 1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6917 ns 7500 ns 0.92
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7208 ns 7167 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7625 ns 7417 ns 1.03
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7334 ns 7125 ns 1.03
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 801879.5 ns 814113 ns 0.98
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1917 ns 2333 ns 0.82
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 2167 ns 2250 ns 0.96
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 2542 ns 2250 ns 1.13
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 2209 ns 2250 ns 0.98
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 17960.5 ns 18404 ns 0.98
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 6666 ns 6625 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 6750 ns 6416 ns 1.05
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 6958 ns 6667 ns 1.04
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 6708 ns 6500 ns 1.03
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 333776.5 ns 335568.5 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 746500 ns 749208 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 749354 ns 746584 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 749479 ns 747708 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 752041.5 ns 749437.5 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 21034 ns 21512 ns 0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 775729 ns 810792 ns 0.96
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 787812.5 ns 772583 ns 1.02
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 773209 ns 791666 ns 0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 792937.5 ns 815292 ns 0.97
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 378579.5 ns 300193.5 ns 1.26
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7250 ns 7042 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5333 ns 6000 ns 0.89
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6000 ns 5959 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10208 ns 10417 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 33702 ns 33409 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 220541 ns 260625 ns 0.85
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 228500 ns 227833 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 229083.5 ns 229042 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 257687.5 ns 239834 ns 1.07
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 366571.5 ns 365578.5 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 10084 ns 10208 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 10375 ns 9917 ns 1.05
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 10750 ns 10792 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9791 ns 10709 ns 0.91
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 255521 ns 247512.5 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 24792 ns 24958 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 26604.5 ns 24250 ns 1.10
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25500 ns 23791 ns 1.07
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 24750 ns 24916 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 1129584 ns 1135547.5 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 105995125 ns 106115583 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 125793000 ns 118501687.5 ns 1.06
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 120388208 ns 120163958 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 119036125 ns 118736333 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 2663640 ns 2655926 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 393141687.5 ns 392620458 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 378610917 ns 366282792 ns 1.03
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 358649208 ns 355680542 ns 1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 482237667 ns 483640417 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 15147052.5 ns 15268892 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 758670979.5 ns 758389270.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 774479250 ns 585230833 ns 1.32
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 745435354 ns 746534979.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 768055062.5 ns 959125958 ns 0.80
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6354.5 ns 7396 ns 0.86
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7667 ns 6645.5 ns 1.15
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8209 ns 8917 ns 0.92
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7209 ns 7792 ns 0.93
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 244571.5 ns 237625.5 ns 1.03
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 13875 ns 14208 ns 0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14250 ns 13854.5 ns 1.03
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14208 ns 15062.5 ns 0.94
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 13834 ns 13959 ns 0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 1083702.5 ns 1099893 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5125 ns 6500 ns 0.79
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6604.5 ns 5583 ns 1.18
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 6750 ns 6645.5 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5792 ns 6666 ns 0.87
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 238521.5 ns 238125 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12125 ns 12750 ns 0.95
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13000 ns 12333 ns 1.05
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 12625 ns 12625 ns 1
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12542 ns 12459 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 799702 ns 799101.5 ns 1.00
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 343416 ns 342896 ns 1.00
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 320583 ns 344667 ns 0.93
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 398646 ns 398458 ns 1.00
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 311958 ns 295917 ns 1.05
batchedmm(2, Bsize=128)/forward/GPU/CUDA 17585 ns 17123 ns 1.03
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 701708.5 ns 709000 ns 0.99
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 729708 ns 732875 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 1022708 ns 1023500 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 663646 ns 661541.5 ns 1.00
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 202905.5 ns 201466.5 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 292 ns 416 ns 0.70
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 417 ns 292 ns 1.43
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 417 ns 458 ns 0.91
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 333 ns 333 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 24123 ns 23927 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6084 ns 6625 ns 0.92
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6750 ns 6208 ns 1.09
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6584 ns 6458 ns 1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6333 ns 6042 ns 1.05
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 244929.5 ns 243036.5 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5834 ns 5875 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6000 ns 5917 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 6083 ns 6000 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 5917 ns 5833 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 25271 ns 24827 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 21208 ns 21708 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 21708 ns 21208 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 22334 ns 21750 ns 1.03
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 21584 ns 21041 ns 1.03
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 269606 ns 266228 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 146291 ns 144209 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 145562.5 ns 144021 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 146520.5 ns 146750 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 187916.5 ns 147687.5 ns 1.27
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 167772.5 ns 168564.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 856895.5 ns 1309750 ns 0.65
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1318270.5 ns 1253895.5 ns 1.05
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1319917 ns 1328541 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1319542 ns 1340854.5 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1369184 ns 1366226 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 21917 ns 22458.5 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 22250 ns 24292 ns 0.92
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 23209 ns 24792 ns 0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 21333.5 ns 22770.5 ns 0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 358843.5 ns 292862.5 ns 1.23
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 117458 ns 130708 ns 0.90
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 175500 ns 117458 ns 1.49
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 118854.5 ns 119625 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 130625 ns 127520.5 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1501306 ns 1486893 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 333 ns 333 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 416 ns 292 ns 1.42
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 417 ns 0.90
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 292 ns 292 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 23632 ns 23357 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6083 ns 6583 ns 0.92
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6833 ns 6208 ns 1.10
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6625 ns 6541 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6292 ns 6250 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 262048.5 ns 259976.5 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4791.5 ns 4333.5 ns 1.11
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4583 ns 4375 ns 1.05
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 5250 ns 5333 ns 0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4250 ns 5041 ns 0.84
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 259862 ns 257469 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9917 ns 9666.5 ns 1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10208 ns 9979.5 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10375 ns 10459 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10292 ns 10375 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 1366687.5 ns 1365106 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1625 ns 1625 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1625 ns 1584 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1584 ns 1625 ns 0.97
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1625 ns 1625 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 23932 ns 23579.5 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 5708 ns 5666 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 5959 ns 5667 ns 1.05
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 6125 ns 5792 ns 1.06
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 5708 ns 5625 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 280444.5 ns 278192.5 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 6858875 ns 6816875 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 6356666.5 ns 6362583 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 6558354 ns 6488041 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 7638125 ns 7598354 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 215319 ns 215727 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 24020854 ns 24100104 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 21336812.5 ns 21301625 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 21069625 ns 21056624.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 29727208 ns 29843458 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 2103488 ns 2118629 ns 0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 37386500 ns 37406124.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 45855312 ns 34318854 ns 1.34
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 45723854 ns 45786625 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 37910104.5 ns 49609729 ns 0.76
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5458 ns 6270.5 ns 0.87
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6500 ns 5584 ns 1.16
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6583 ns 7125 ns 0.92
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5667 ns 6375 ns 0.89
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 238563.5 ns 236566.5 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8250 ns 8458 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8375 ns 8083 ns 1.04
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9375 ns 8291 ns 1.13
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9000 ns 8541 ns 1.05
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 1077587 ns 1064326 ns 1.01
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s) 1545521 ns 1546833 ns 1.00
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s) 1248854 ns 1262667 ns 0.99
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s) 1616959 ns 1614875 ns 1.00
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s) 2159562 ns 2100437.5 ns 1.03
lenet(28, 28, 1, 128)/forward/GPU/CUDA 283099 ns 273505 ns 1.04
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s) 7874500 ns 7902479 ns 1.00
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s) 6609250 ns 6457875 ns 1.02
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s) 7081396 ns 7153750 ns 0.99
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s) 10467646 ns 10520687 ns 0.99
lenet(28, 28, 1, 128)/zygote/GPU/CUDA 1880933.5 ns 1860306 ns 1.01
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 338958 ns 338041 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 329417 ns 344187.5 ns 0.96
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 416208 ns 403666 ns 1.03
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 346000 ns 325146 ns 1.06
batchedmm(128, Bsize=4)/forward/GPU/CUDA 42726 ns 46661 ns 0.92
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 729291.5 ns 739354.5 ns 0.99
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 779145.5 ns 791041.5 ns 0.98
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 1069708 ns 1068125 ns 1.00
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 747500 ns 777583 ns 0.96
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 300168 ns 308223 ns 0.97
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 397500 ns 397500 ns 1
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 210958 ns 287708 ns 0.73
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 287875 ns 287834 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 750791 ns 751187.5 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 44334 ns 43983 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 669417 ns 669375 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 471583 ns 531834 ns 0.89
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 532312.5 ns 530584 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 973125 ns 974333 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 191851 ns 189215 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 638229 ns 660395.5 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 595750 ns 644167 ns 0.92
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 634583.5 ns 613333 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 639209 ns 624292 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 132118 ns 132150 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2339333.5 ns 2236750 ns 1.05
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2471792 ns 2459583.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2452750 ns 2461041.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2462792 ns 2472250 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1204724 ns 1281622 ns 0.94
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 342834 ns 341645.5 ns 1.00
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 321667 ns 342792 ns 0.94
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 399520.5 ns 398021 ns 1.00
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 310625 ns 294000 ns 1.06
batchedmm(2, Bsize=32)/forward/GPU/CUDA 16619 ns 16403 ns 1.01
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 698708.5 ns 701083 ns 1.00
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 726959 ns 730208 ns 1.00
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 1020042 ns 1025708.5 ns 0.99
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 654208 ns 660646 ns 0.99
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 199213 ns 198717 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1458250 ns 1458166 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1492125 ns 1497709 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1498333 ns 1498458 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1440625 ns 1441209 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 41986 ns 41537 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5106291 ns 5119375 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5286750 ns 5293042 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5038208 ns 5308020.5 ns 0.95
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4995083.5 ns 5000771 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 200103.5 ns 198307.5 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3667 ns 3709 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3708 ns 3667 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3708 ns 3708 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3708 ns 3750 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 33402.5 ns 33127 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15000 ns 15166 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15375 ns 15125 ns 1.02
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15416 ns 15292 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15125 ns 14834 ns 1.02
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 381057 ns 376728 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 70750 ns 71208 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 71125 ns 71208 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 71250 ns 71333 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 71208 ns 70875 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 113931 ns 112994 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 316458 ns 317541.5 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 326395.5 ns 323479.5 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 324958.5 ns 318833 ns 1.02
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 317042 ns 322541 ns 0.98
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 196642 ns 193878 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 1000 ns 1041 ns 0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 1084 ns 1000 ns 1.08
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 1083 ns 1083 ns 1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 1000 ns 1000 ns 1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 24139 ns 23624 ns 1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7666 ns 8083 ns 0.95
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8375 ns 7875 ns 1.06
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8208 ns 8375 ns 0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 7916 ns 8041 ns 0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 264797.5 ns 262928.5 ns 1.01
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 468542 ns 463437.5 ns 1.01
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 458583 ns 467917 ns 0.98
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 550499.5 ns 555208 ns 0.99
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 557354 ns 535979.5 ns 1.04
batchedmm(128, Bsize=32)/forward/GPU/CUDA 129474.5 ns 131030 ns 0.99
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 1378667 ns 1392125 ns 0.99
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 1407354 ns 1366083.5 ns 1.03
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 1620083 ns 1631541 ns 0.99
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 1575771 ns 1632667 ns 0.97
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 274432 ns 274742 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 333 ns 375 ns 0.89
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 333 ns 1.13
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 416 ns 417 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 292 ns 334 ns 0.87
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 32500 ns 32052 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 5917 ns 6375 ns 0.93
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6708 ns 6417 ns 1.05
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6375 ns 6541 ns 0.97
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6167 ns 6209 ns 0.99
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 267871.5 ns 267424 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1784500 ns 1722042 ns 1.04
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1728562.5 ns 1722500 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1724000 ns 1723833.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1773500 ns 1723750 ns 1.03
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 169383 ns 169639.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4347916.5 ns 3951375.5 ns 1.10
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4395208 ns 4344708.5 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4356291 ns 4311167 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4355708 ns 4386625 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1176388 ns 1177448 ns 1.00
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 6583 ns 6833 ns 0.96
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 7375 ns 7083 ns 1.04
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 7167 ns 9000 ns 0.80
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 6834 ns 6958 ns 0.98
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 20833 ns 21025 ns 0.99
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 32833 ns 51500 ns 0.64
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 33125 ns 32500 ns 1.02
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 32708 ns 33416 ns 0.98
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 70188 ns 68687.5 ns 1.02
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 219443 ns 211473.5 ns 1.04
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 349250 ns 347416 ns 1.01
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 324917 ns 346562.5 ns 0.94
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 416250 ns 405812.5 ns 1.03
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 319959 ns 300895.5 ns 1.06
batchedmm(2, Bsize=512)/forward/GPU/CUDA 18352 ns 18759 ns 0.98
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 716166.5 ns 717750 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 733562.5 ns 738125 ns 0.99
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 1034625 ns 1033854 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 680062.5 ns 685145.5 ns 0.99
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 343358 ns 346768 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 75375 ns 75208 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 75375 ns 75208 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 75250 ns 75209 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 75417 ns 75250 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 46573.5 ns 47731 ns 0.98
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 323708 ns 325375 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 328375 ns 341167 ns 0.96
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 337667 ns 327917 ns 1.03
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 323916 ns 328875 ns 0.98
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 210239 ns 213096 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1485625 ns 1484542 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1517625 ns 1524541.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1525916 ns 1525708 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1464958 ns 1464917 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 52987 ns 52950 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5107375 ns 5117000 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5290249.5 ns 5293834 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5022896 ns 5304042 ns 0.95
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4990771.5 ns 5007708 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 206297 ns 208140.5 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 28125 ns 28291 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 28250 ns 28125 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 28208 ns 28458 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 28208 ns 28292 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 24564 ns 25236 ns 0.97
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 66125 ns 66417 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 66625 ns 66375 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 66416 ns 66292 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 66375 ns 66333 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 535111 ns 544174 ns 0.98
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s) 1491375 ns 1473145.5 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s) 929500 ns 1063750 ns 0.87
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s) 1106125 ns 1081749.5 ns 1.02
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s) 2118479.5 ns 2244854 ns 0.94
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA 569055 ns 581542 ns 0.98
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s) 2874354 ns 3075833.5 ns 0.93
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s) 2606312.5 ns 2738625 ns 0.95
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s) 2740292 ns 2748312.5 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s) 3820625.5 ns 3862521 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA 2075439 ns 2089266.5 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s) 8426958 ns 8827041 ns 0.95
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s) 8751479 ns 8764208 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s) 8782333 ns 8760437.5 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s) 6369124.5 ns 6449750 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 79250 ns 82520.5 ns 0.96
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 81250 ns 80750 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 82250 ns 82667 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 99334 ns 88708 ns 1.12
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 192927.5 ns 192989 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1987333.5 ns 2019083 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1989937.5 ns 2020500 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2008750 ns 1750146 ns 1.15
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2022667 ns 2040166 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 817057 ns 811335 ns 1.01

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.