Skip to content

Commit

Permalink
fix: pretty printing of MaxPool Layer (#891)
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal authored Sep 9, 2024
1 parent ec5841b commit 043bae1
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 5 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "Lux"
uuid = "b2108857-7c20-44ae-9111-449ecde12c47"
authors = ["Avik Pal <avikpal@mit.edu> and contributors"]
version = "1.0.0"
version = "1.0.1"

[deps]
ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
Expand Down
4 changes: 3 additions & 1 deletion src/layers/display.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ end
# Leaf test used when printing a model tree. By default this defers to Functors'
# own notion of a leaf ...
function show_leaflike(x)
    return Functors.isleaf(x)
end
# ... with one exception: a Lux layer is never considered leaf-like.
show_leaflike(::AbstractLuxLayer) = false

# Opt-in flag consulted by `big_show`: when a method returns `true` for an object,
# that object is rendered through `layer_show` even if not all of its printable
# children are leaf-like. The generic fallback opts out.
isa_printable_leaf(::Any) = false

"""
    underscorise(n::Integer)

Return the base-10 representation of `n` as a `String` with an underscore between
every group of three digits, counted from the least-significant digit
(e.g. `1234567` becomes `"1_234_567"`). Negative values keep a single leading `-`.
"""
function underscorise(n::Integer)
    # `digits` yields least-significant-first digits and, for a negative `n`, every
    # digit carries the sign. Work on magnitudes and re-attach one sign at the end so
    # negative input does not produce output such as "-1_-2-3-4".
    groups = String[]
    for chunk in Iterators.partition(abs.(digits(n)), 3)
        # Each chunk is least-significant-first: flip it before joining, and prepend
        # so that the most-significant group ends up first.
        pushfirst!(groups, join(reverse(chunk)))
    end
    grouped = join(groups, '_')
    return n < 0 ? "-" * grouped : grouped
end
Expand All @@ -27,7 +29,7 @@ function big_show(io::IO, obj, indent::Int=0, name=nothing)
return
end
children = printable_children(obj)
if all(show_leaflike, values(children))
if all(show_leaflike, values(children)) || isa_printable_leaf(obj)
layer_show(io, obj, indent, name)
else
println(io, " "^indent, isnothing(name) ? "" : "$name = ", display_name(obj), "(")
Expand Down
12 changes: 9 additions & 3 deletions src/layers/pooling.jl
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ for layer_op in (:Max, :Mean, :LP)
window; stride, pad, dilation, p))
end

function Base.show(io::IO, ::MIME"text/plain", m::$(layer_name))
function Base.show(io::IO, m::$(layer_name))
kernel_size = m.layer.mode.kernel_size
print(io, string($(Meta.quot(layer_name))), "($(kernel_size)")
pad = m.layer.mode.pad
Expand All @@ -213,6 +213,8 @@ for layer_op in (:Max, :Mean, :LP)
print(io, ")")
end

PrettyPrinting.isa_printable_leaf(::$(layer_name)) = true

# Global Pooling Layer
@doc $(global_pooling_docstring) @concrete struct $(global_layer_name) <:
AbstractLuxWrapperLayer{:layer}
Expand All @@ -223,14 +225,16 @@ for layer_op in (:Max, :Mean, :LP)
return $(global_layer_name)(PoolingLayer(static(:global), $(Meta.quot(op)); p))
end

function Base.show(io::IO, ::MIME"text/plain", g::$(global_layer_name))
function Base.show(io::IO, g::$(global_layer_name))
print(io, string($(Meta.quot(global_layer_name))), "(")
if $(Meta.quot(op)) == :lp
g.layer.op.p == 2 || print(io, ", p=", g.layer.op.p)
end
print(io, ")")
end

PrettyPrinting.isa_printable_leaf(::$(global_layer_name)) = true

# Adaptive Pooling Layer
@doc $(adaptive_pooling_docstring) @concrete struct $(adaptive_layer_name) <:
AbstractLuxWrapperLayer{:layer}
Expand All @@ -242,12 +246,14 @@ for layer_op in (:Max, :Mean, :LP)
static(:adaptive), $(Meta.quot(op)), out_size; p))
end

function Base.show(io::IO, ::MIME"text/plain", a::$(adaptive_layer_name))
function Base.show(io::IO, a::$(adaptive_layer_name))
print(io, string($(Meta.quot(adaptive_layer_name))), "(", a.layer.mode.out_size)
if $(Meta.quot(op)) == :lp
a.layer.op.p == 2 || print(io, ", p=", a.layer.op.p)
end
print(io, ")")
end

PrettyPrinting.isa_printable_leaf(::$(adaptive_layer_name)) = true
end
end

3 comments on commit 043bae1

@avik-pal
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/114849

Tip: Release Notes

Did you know you can add release notes too? Just add Markdown-formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR; if TagBot is installed, it will also be added to the
release that TagBot creates. For example:

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

git tag -a v1.0.1 -m "<description of version>" 043bae18ad12d1f7120a01a016f36249b7d87553
git push origin v1.0.1

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lux Benchmarks

Benchmark suite Current: 043bae1 Previous: ec5841b Ratio
Dense(512 => 512, identity)(512 x 128)/forward/CPU/2 thread(s) 412666 ns 412458.5 ns 1.00
Dense(512 => 512, identity)(512 x 128)/forward/CPU/4 thread(s) 322354.5 ns 322188 ns 1.00
Dense(512 => 512, identity)(512 x 128)/forward/CPU/8 thread(s) 319458 ns 244250 ns 1.31
Dense(512 => 512, identity)(512 x 128)/forward/CPU/1 thread(s) 740416.5 ns 739584 ns 1.00
Dense(512 => 512, identity)(512 x 128)/forward/GPU/CUDA 43751 ns 43656 ns 1.00
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/2 thread(s) 1313875 ns 1361709 ns 0.96
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/4 thread(s) 2420917 ns 2428521 ns 1.00
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/8 thread(s) 19423583 ns 16099583.5 ns 1.21
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/1 thread(s) 2274292 ns 2260562 ns 1.01
Dense(512 => 512, identity)(512 x 128)/zygote/GPU/CUDA 205395 ns 206975.5 ns 0.99
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/2 thread(s) 1390500 ns 1428812 ns 0.97
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/4 thread(s) 912917 ns 906708 ns 1.01
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/8 thread(s) 10875709 ns 1628500 ns 6.68
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/1 thread(s) 2207896 ns 2244917 ns 0.98
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1773146 ns 1660916.5 ns 1.07
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1085124.5 ns 1079750 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1432500 ns 1530375 ns 0.94
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2989333 ns 3002125 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA 207449.5 ns 207801.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12143291 ns 12150625 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 8836062.5 ns 8835458 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9291520.5 ns 9224604 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18553083.5 ns 18587667 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1505031.5 ns 1487468.5 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17307791 ns 17307333.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 13990459 ns 13941250 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14509125 ns 14519104.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21766396 ns 21817021.5 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 250314354.5 ns 249997583.5 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 148416125 ns 148163667 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 121652791 ns 116524583.5 ns 1.04
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 445949875 ns 454091250 ns 0.98
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5456985 ns 5461865 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 1227899292 ns 1221856542 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 932161917 ns 931447709 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 846826937.5 ns 834664604 ns 1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1625356625 ns 1654541041 ns 0.98
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 31290090 ns 31167157.5 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1150326667 ns 1137975750 ns 1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 997528083 ns 995311958 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 3958813520.5 ns 1319930125 ns 3.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 1725291604 ns 1748556646.5 ns 0.99
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s) 1123250 ns 1095125 ns 1.03
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s) 1617437.5 ns 1622583 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s) 10483333 ns 3546312.5 ns 2.96
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s) 782875 ns 789458 ns 0.99
lenet(28, 28, 1, 32)/forward/GPU/CUDA 261765 ns 263083 ns 0.99
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s) 2988166 ns 2978417 ns 1.00
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s) 4099750 ns 4117958 ns 1.00
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s) 18729584 ns 12025354 ns 1.56
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s) 3268292 ns 3159375 ns 1.03
lenet(28, 28, 1, 32)/zygote/GPU/CUDA 1093399.5 ns 1134663 ns 0.96
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 2322000 ns 2330792 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1376646 ns 1433104 ns 0.96
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1622229.5 ns 1541125 ns 1.05
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 4218083 ns 4209645.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 208585 ns 208448.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 19421125 ns 19415042 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 16162729.5 ns 16071271 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 16659708.5 ns 17172750 ns 0.97
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 25778791 ns 25792250 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1607423 ns 1593280 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 34077666.5 ns 34130083.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 30945250 ns 30775292 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 31583520.5 ns 31125042 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 36716875 ns 36818959 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 4526541.5 ns 4527292 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2770209 ns 2773208.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2921229 ns 2656208.5 ns 1.10
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 8366833 ns 8382771 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 423782 ns 428034 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 38926229.5 ns 38903708 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 32245667 ns 32088354 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 32910458 ns 32248250 ns 1.02
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 51760416.5 ns 51944791 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2630256 ns 2616927.5 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 89355187.5 ns 88376541 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 114003500 ns 113229459 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 1405927875 ns 228466396 ns 6.15
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 74231458.5 ns 74323208 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 268393792 ns 267494667 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 159475583 ns 159172958 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 132975208 ns 123508104 ns 1.08
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 484020063 ns 484768333 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 7020070 ns 6999230.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 1473032083.5 ns 1465729583.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 1177961375 ns 1176260541 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 1082903374.5 ns 1073046687.5 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1999116063 ns 2008112021 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 34775365.5 ns 34700607 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1720636250 ns 1675838166 ns 1.03
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 1555603750 ns 1494284562.5 ns 1.04
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 4526585645.5 ns 1751750208 ns 2.58
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 2207239541.5 ns 2233116333 ns 0.99
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s) 2110458 ns 1651542 ns 1.28
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s) 2946104 ns 2558521 ns 1.15
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s) 14693125.5 ns 6003209 ns 2.45
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s) 2312646 ns 2474729.5 ns 0.93
lenet(28, 28, 1, 128)/forward/GPU/CUDA 266343.5 ns 266892 ns 1.00
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s) 9579333 ns 8851208.5 ns 1.08
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s) 12048375 ns 11418334 ns 1.06
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s) 37492562.5 ns 23064666.5 ns 1.63
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s) 11316041.5 ns 11739000 ns 0.96
lenet(28, 28, 1, 128)/zygote/GPU/CUDA 1164743 ns 1168625 ns 1.00
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s) 381670125 ns 379338084 ns 1.01
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s) 285686541 ns 283711083 ns 1.01
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s) 238643375.5 ns 273375708.5 ns 0.87
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s) 452689979 ns 453126979.5 ns 1.00
vgg16(32, 32, 3, 32)/forward/GPU/CUDA 4852504 ns 4863813 ns 1.00
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s) 1156829333 ns 1152978750 ns 1.00
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s) 936606625 ns 927580833 ns 1.01
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s) 1034937042 ns 926883375 ns 1.12
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s) 1394580541 ns 1395415875 ns 1.00
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA 18385711 ns 17771785 ns 1.03
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s) 1051604 ns 1046833 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s) 2032750 ns 1903958.5 ns 1.07
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s) 6421479 ns 4710334 ns 1.36
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s) 1392792 ns 1288833.5 ns 1.08
lenet(28, 28, 1, 64)/forward/GPU/CUDA 269338 ns 272890.5 ns 0.99
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s) 6488271 ns 6488521 ns 1.00
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s) 12419333 ns 13795375 ns 0.90
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s) 20712583 ns 18166417 ns 1.14
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s) 6069104 ns 6069042 ns 1.00
lenet(28, 28, 1, 64)/zygote/GPU/CUDA 1230429.5 ns 1252551 ns 0.98
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 70482479 ns 70461917 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 43549791.5 ns 43567958 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 39533021 ns 39697500 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 132479958.5 ns 134157667 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1936490 ns 1944258.5 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 356251291.5 ns 355807438 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 270847875 ns 270773667 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 254784208 ns 252761792 ns 1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 534252895.5 ns 534329166.5 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 12272241.5 ns 12278757.5 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 394477666 ns 394634458 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 402690375 ns 389791583 ns 1.03
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 720082750 ns 673445792 ns 1.07
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 709744667 ns 709693958 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s) 1187004958 ns 1186364083 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s) 691832145.5 ns 691957562.5 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s) 632123250 ns 638110667 ns 0.99
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s) 1770715312 ns 1783555042 ns 0.99
vgg16(32, 32, 3, 128)/forward/GPU/CUDA 12547245 ns 12301928 ns 1.02
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s) 3637291458 ns 3711157812 ns 0.98
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s) 2824273000 ns 2879197208 ns 0.98
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s) 2730881708 ns 2773307208 ns 0.98
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s) 5049712083 ns 5035792375 ns 1.00
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA 49393707 ns 49664392.5 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3418083.5 ns 3407458 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2075979 ns 2075563 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2539271 ns 2527020.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 6026459 ns 6024750 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 346441.5 ns 343963.5 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 25952979.5 ns 25968187.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 19050375 ns 19062125 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 19201375 ns 19252125 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 39193125 ns 39301000 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2459244 ns 2461837 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 55322083 ns 55558542 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 81164292 ns 80387500 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 170963771 ns 175413270.5 ns 0.97
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 45568000 ns 45602750 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1782541.5 ns 1783854 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1100417 ns 1100146 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1551125 ns 1552500 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 3037000 ns 3029125 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 214556 ns 212167 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12536291 ns 12521479 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9216000 ns 9200041.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9685375 ns 9609625 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18982854 ns 18969562.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1538065 ns 1536334.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17658500 ns 17631000 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14322812.5 ns 14331187 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14698833 ns 14538062.5 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 22165937.5 ns 22175666.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 70506187.5 ns 70452395.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 43620250 ns 43579917 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 39487104.5 ns 39814895.5 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 132516562.5 ns 133530375 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1945320.5 ns 1873599.5 ns 1.04
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 361020458 ns 359456959 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 347398479.5 ns 345462791 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 304273333 ns 304957917 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 723138416 ns 730536500 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 13360775 ns 13387305 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 417284416.5 ns 418668750 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 421339792 ns 422783667 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 702984958 ns 694707583.5 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 714157000 ns 714852708 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s) 1690770.5 ns 1688208.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s) 1342250 ns 1348958.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s) 1267146 ns 1141500 ns 1.11
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s) 2453041 ns 2410041 ns 1.02
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA 590494.5 ns 583102.5 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s) 8943958 ns 8957396 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s) 12866833 ns 12832625 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s) 30425166 ns 31672062.5 ns 0.96
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s) 9850895.5 ns 9824709 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA 1477491 ns 1427623.5 ns 1.03
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s) 17137625.5 ns 17909083 ns 0.96
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s) 17335937 ns 17252334 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s) 29414500 ns 30244979.5 ns 0.97
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s) 14221083.5 ns 14301854 ns 0.99
Dense(512 => 512, relu)(512 x 128)/forward/CPU/2 thread(s) 670250 ns 671500 ns 1.00
Dense(512 => 512, relu)(512 x 128)/forward/CPU/4 thread(s) 587208 ns 582646 ns 1.01
Dense(512 => 512, relu)(512 x 128)/forward/CPU/8 thread(s) 1034167 ns 1059875 ns 0.98
Dense(512 => 512, relu)(512 x 128)/forward/CPU/1 thread(s) 723250.5 ns 738291.5 ns 0.98
Dense(512 => 512, relu)(512 x 128)/forward/GPU/CUDA 48445 ns 47313 ns 1.02
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/2 thread(s) 1557292 ns 1553208 ns 1.00
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/4 thread(s) 1015708 ns 1029250 ns 0.99
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/8 thread(s) 1383084 ns 1568708 ns 0.88
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/1 thread(s) 2231354 ns 2237833 ns 1.00
Dense(512 => 512, relu)(512 x 128)/zygote/GPU/CUDA 240891.5 ns 237738.5 ns 1.01
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/2 thread(s) 1521354.5 ns 1562083 ns 0.97
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/4 thread(s) 1075542 ns 1066750 ns 1.01
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/8 thread(s) 1487792 ns 1541083.5 ns 0.97
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/1 thread(s) 2254458 ns 2207833 ns 1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3396375 ns 3389167 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2057334 ns 2034875 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2498687.5 ns 2486708 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 5992229 ns 5993958 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA 287342 ns 283368 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 24078166.5 ns 24043104 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 17297542 ns 17216854.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 17230333 ns 17073750 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 37494917 ns 37471000 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2405577 ns 2400544 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 53543396 ns 53725750 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 83822833 ns 80060417 ns 1.05
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 166927250 ns 173749250 ns 0.96
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 44407750 ns 44536333.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 249509499.5 ns 250047041.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 147934792 ns 148068333.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 115819083.5 ns 116114541.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 447669812.5 ns 447302270.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5459471 ns 5442086 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 1102880459 ns 1103374375 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 857952270.5 ns 858772104.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 827187146 ns 827438312.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1749946917 ns 1767052584 ns 0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 28884820 ns 29017689.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1012294625 ns 1004373333 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 967548417 ns 930750709 ns 1.04
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1291437750 ns 1242654000 ns 1.04
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 1719471271 ns 1746910729 ns 0.98
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s) 1287333 ns 1306417 ns 0.99
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s) 983417 ns 925687 ns 1.06
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s) 901750 ns 706792 ns 1.28
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s) 2103875 ns 2044583 ns 1.03
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA 563889 ns 568045 ns 0.99
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s) 5980875 ns 5802000 ns 1.03
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s) 6572395.5 ns 6862750 ns 0.96
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s) 23913083.5 ns 25619396 ns 0.93
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s) 7082125 ns 6379750 ns 1.11
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA 1362470.5 ns 1369217 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s) 11293750 ns 10914875 ns 1.03
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s) 10483750 ns 9317208.5 ns 1.13
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s) 17869125 ns 17171708.5 ns 1.04
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s) 8484417 ns 8667334 ns 0.98
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/2 thread(s) 355500 ns 344250 ns 1.03
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/4 thread(s) 375708 ns 388208 ns 0.97
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/8 thread(s) 1905667 ns 2652375 ns 0.72
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/1 thread(s) 88916 ns 88729.5 ns 1.00
Dense(128 => 128, gelu)(128 x 128)/forward/GPU/CUDA 27382 ns 27556 ns 0.99
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/2 thread(s) 388541 ns 361250 ns 1.08
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/4 thread(s) 441375 ns 399709 ns 1.10
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/8 thread(s) 4554812.5 ns 4501000 ns 1.01
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/1 thread(s) 258625 ns 262354 ns 0.99
Dense(128 => 128, gelu)(128 x 128)/zygote/GPU/CUDA 219767 ns 223132 ns 0.98
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/2 thread(s) 418500 ns 391375 ns 1.07
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/4 thread(s) 472541 ns 431437.5 ns 1.10
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/8 thread(s) 4780416 ns 4729375 ns 1.01
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/1 thread(s) 282333 ns 282666.5 ns 1.00
Dense(128 => 128, relu)(128 x 128)/forward/CPU/2 thread(s) 302875 ns 290750 ns 1.04
Dense(128 => 128, relu)(128 x 128)/forward/CPU/4 thread(s) 310292 ns 327708 ns 0.95
Dense(128 => 128, relu)(128 x 128)/forward/CPU/8 thread(s) 746917 ns 675958 ns 1.10
Dense(128 => 128, relu)(128 x 128)/forward/CPU/1 thread(s) 52958 ns 54270.5 ns 0.98
Dense(128 => 128, relu)(128 x 128)/forward/GPU/CUDA 27770 ns 27926 ns 0.99
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/2 thread(s) 353062.5 ns 310292 ns 1.14
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/4 thread(s) 337625 ns 273625 ns 1.23
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/8 thread(s) 632166.5 ns 375709 ns 1.68
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/1 thread(s) 151958.5 ns 152354 ns 1.00
Dense(128 => 128, relu)(128 x 128)/zygote/GPU/CUDA 205653 ns 207917 ns 0.99
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/2 thread(s) 368187.5 ns 325125 ns 1.13
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/4 thread(s) 352042 ns 289084 ns 1.22
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/8 thread(s) 899958 ns 403542 ns 2.23
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/1 thread(s) 151416 ns 151625 ns 1.00
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s) 602224792 ns 603325417 ns 1.00
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s) 426937479 ns 421621416.5 ns 1.01
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s) 377069646 ns 380597249.5 ns 0.99
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s) 871474708 ns 874303208 ns 1.00
vgg16(32, 32, 3, 64)/forward/GPU/CUDA 7023361 ns 7027347 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s) 2004892916.5 ns 2005331375 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s) 1605784063 ns 1619588271 ns 0.99
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s) 1566567437.5 ns 1613872687.5 ns 0.97
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s) 2619326292 ns 2628480958 ns 1.00
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA 26073016 ns 26003745 ns 1.00
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/2 thread(s) 537145.5 ns 527792 ns 1.02
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/4 thread(s) 436187.5 ns 426833 ns 1.02
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/8 thread(s) 1750750.5 ns 2562271 ns 0.68
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/1 thread(s) 879542 ns 866208 ns 1.02
Dense(512 => 512, gelu)(512 x 128)/forward/GPU/CUDA 47197 ns 47205 ns 1.00
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/2 thread(s) 1906583.5 ns 1876021 ns 1.02
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/4 thread(s) 2789708 ns 2780812 ns 1.00
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/8 thread(s) 14534625 ns 16664416 ns 0.87
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/1 thread(s) 2719437.5 ns 2745542 ns 0.99
Dense(512 => 512, gelu)(512 x 128)/zygote/GPU/CUDA 247168 ns 250772.5 ns 0.99
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/2 thread(s) 1975604.5 ns 1973979.5 ns 1.00
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/4 thread(s) 5033938 ns 4994854 ns 1.01
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/8 thread(s) 14790687.5 ns 16607792 ns 0.89
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/1 thread(s) 2768750 ns 2721770.5 ns 1.02
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s) 1615208 ns 1608270.5 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s) 1258604 ns 1262750 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s) 1213228.5 ns 929208 ns 1.31
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s) 2324937.5 ns 2314292 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA 579864 ns 587834.5 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s) 5939542 ns 5921500 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s) 6479000 ns 6925166 ns 0.94
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s) 24227875 ns 25706812 ns 0.94
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s) 7320042 ns 7304292 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA 1342640 ns 1354974 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s) 13346958 ns 11973583 ns 1.11
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s) 11570125 ns 12125584 ns 0.95
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s) 21132000 ns 21506521 ns 0.98
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s) 10783208 ns 10667104.5 ns 1.01
Dense(16 => 16, relu)(16 x 128)/forward/CPU/2 thread(s) 2458.5 ns 2333 ns 1.05
Dense(16 => 16, relu)(16 x 128)/forward/CPU/4 thread(s) 2333 ns 2542 ns 0.92
Dense(16 => 16, relu)(16 x 128)/forward/CPU/8 thread(s) 3292 ns 3333 ns 0.99
Dense(16 => 16, relu)(16 x 128)/forward/CPU/1 thread(s) 2792 ns 2459 ns 1.14
Dense(16 => 16, relu)(16 x 128)/forward/GPU/CUDA 24276 ns 24615 ns 0.99
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/2 thread(s) 6958 ns 7250 ns 0.96
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/4 thread(s) 7250 ns 7125 ns 1.02
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/8 thread(s) 7292 ns 7291 ns 1.00
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/1 thread(s) 7041 ns 7334 ns 0.96
Dense(16 => 16, relu)(16 x 128)/zygote/GPU/CUDA 208234.5 ns 211753 ns 0.98
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/2 thread(s) 8187.5 ns 8208 ns 1.00
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/4 thread(s) 8375 ns 8250 ns 1.02
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/8 thread(s) 8542 ns 8292 ns 1.03
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/1 thread(s) 6042 ns 6125 ns 0.99
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/2 thread(s) 11562.5 ns 10625 ns 1.09
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/4 thread(s) 13583 ns 13750 ns 0.99
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/8 thread(s) 10500 ns 10520.5 ns 1.00
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/1 thread(s) 7375 ns 7500 ns 0.98
Dense(16 => 16, gelu)(16 x 128)/forward/GPU/CUDA 24269 ns 24896 ns 0.97
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/2 thread(s) 19750 ns 19895.5 ns 0.99
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/4 thread(s) 20042 ns 19875 ns 1.01
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/8 thread(s) 20291.5 ns 20125 ns 1.01
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/1 thread(s) 19750 ns 20375 ns 0.97
Dense(16 => 16, gelu)(16 x 128)/zygote/GPU/CUDA 227670.5 ns 232312.5 ns 0.98
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/2 thread(s) 23292 ns 23375 ns 1.00
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/4 thread(s) 23625 ns 23583 ns 1.00
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/8 thread(s) 24000 ns 23625 ns 1.02
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/1 thread(s) 21375 ns 21375 ns 1
Dense(128 => 128, identity)(128 x 128)/forward/CPU/2 thread(s) 28625 ns 28458 ns 1.01
Dense(128 => 128, identity)(128 x 128)/forward/CPU/4 thread(s) 28854.5 ns 28750 ns 1.00
Dense(128 => 128, identity)(128 x 128)/forward/CPU/8 thread(s) 28709 ns 28209 ns 1.02
Dense(128 => 128, identity)(128 x 128)/forward/CPU/1 thread(s) 46167 ns 46333 ns 1.00
Dense(128 => 128, identity)(128 x 128)/forward/GPU/CUDA 25604 ns 25917 ns 0.99
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/2 thread(s) 222104.5 ns 220041 ns 1.01
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/4 thread(s) 278083 ns 272792 ns 1.02
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/8 thread(s) 4126167 ns 4142125 ns 1.00
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/1 thread(s) 145792 ns 146229 ns 1.00
Dense(128 => 128, identity)(128 x 128)/zygote/GPU/CUDA 206165 ns 211737.5 ns 0.97
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/2 thread(s) 336792 ns 329896 ns 1.02
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/4 thread(s) 320666 ns 317541 ns 1.01
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/8 thread(s) 569042 ns 641000 ns 0.89
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/1 thread(s) 161000 ns 161708 ns 1.00
Dense(16 => 16, identity)(16 x 128)/forward/CPU/2 thread(s) 1625 ns 1542 ns 1.05
Dense(16 => 16, identity)(16 x 128)/forward/CPU/4 thread(s) 1750 ns 1792 ns 0.98
Dense(16 => 16, identity)(16 x 128)/forward/CPU/8 thread(s) 2250 ns 2334 ns 0.96
Dense(16 => 16, identity)(16 x 128)/forward/CPU/1 thread(s) 1917 ns 2042 ns 0.94
Dense(16 => 16, identity)(16 x 128)/forward/GPU/CUDA 22470 ns 23108 ns 0.97
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/2 thread(s) 5292 ns 5229.5 ns 1.01
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/4 thread(s) 5291 ns 5208 ns 1.02
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/8 thread(s) 5250 ns 5333 ns 0.98
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/1 thread(s) 5167 ns 5209 ns 0.99
Dense(16 => 16, identity)(16 x 128)/zygote/GPU/CUDA 241214 ns 243209 ns 0.99
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/2 thread(s) 11250 ns 11291 ns 1.00
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/4 thread(s) 11583 ns 11250 ns 1.03
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/8 thread(s) 11459 ns 11541 ns 0.99
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/1 thread(s) 6833 ns 6833 ns 1
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 79831958 ns 79916708 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 49067917 ns 48976292 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 44974625 ns 43178166 ns 1.04
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 151479959 ns 151466292 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 2716485.5 ns 2667705.5 ns 1.02
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 497629333 ns 660651833 ns 0.75
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 413644333 ns 411637166 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 399276520.5 ns 396465125 ns 1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 684016541 ns 687244250 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 14627968 ns 14722784 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 713376625 ns 711004292 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 669209042 ns 670184709 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 995614375 ns 1007186709 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 998601250 ns 997936792 ns 1.00

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.