Skip to content

Commit

Permalink
test: eltype matching tests run outside of error mode
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal committed Sep 12, 2024
1 parent 7cf6e11 commit 0b51676
Showing 1 changed file with 16 additions and 15 deletions.
31 changes: 16 additions & 15 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,22 @@ if "all" in LUX_TEST_GROUP || "core_layers" in LUX_TEST_GROUP
end
end

# Eltype Matching Tests
#
# For every supported value of the `eltype_mismatch_handling` preference, write the
# preference for `Lux` and then run `eltype_matching.jl` (located next to this file) in a
# separate Julia process. One test result is recorded per option: a pass when the child
# process exits successfully, a failure otherwise.
# NOTE(review): a child process is presumably required because the preference is only
# picked up when Lux is loaded — confirm against the Preferences.jl documentation.
if ("all" in LUX_TEST_GROUP || "eltype_match" in LUX_TEST_GROUP)
    @testset "eltype_mismatch_handling: $option" for option in (
        "none", "warn", "convert", "error")
        set_preferences!(Lux, "eltype_mismatch_handling" => option; force=true)
        try
            # `run` throws when the child exits with a non-zero status, which is what
            # routes a failing script into the `catch` branch below.
            run(`$(Base.julia_cmd()) --color=yes --project=$(dirname(Pkg.project().path))
                 --startup-file=no --code-coverage=user $(@__DIR__)/eltype_matching.jl`)
            @test true
        catch err
            # Report why the child process failed instead of silently recording a
            # bare test failure.
            @error "eltype_matching.jl failed" option exception=(err, catch_backtrace())
            @test false
        end
    end
    # Put the preference back to "none" so the last option tried ("error") does not leak
    # into whatever runs after this block.
    set_preferences!(Lux, "eltype_mismatch_handling" => "none"; force=true)
end

# Switch both `luxcore` and `luxlib` to "error" for everything that follows. The eltype
# matching tests above are deliberately placed before this call so that they run outside
# of error mode.
Lux.set_dispatch_doctor_preferences!(; luxcore="error", luxlib="error")

@testset "Load Tests" begin
Expand Down Expand Up @@ -141,21 +157,6 @@ if ("all" in LUX_TEST_GROUP || "distributed" in LUX_TEST_GROUP)
end
end

# Eltype Matching Tests
#
# For each value of the `eltype_mismatch_handling` preference, write the preference for
# `Lux` and run `eltype_matching.jl` (located next to this file) in a separate Julia
# process. A pass is recorded when the child exits successfully, a failure otherwise.
# NOTE(review): "mismath" in the testset name below looks like a typo for "mismatch".
# NOTE(review): nothing resets the preference after the loop, so it is left at the last
# option tried ("error").
if ("all" in LUX_TEST_GROUP || "eltype_match" in LUX_TEST_GROUP)
@testset "eltype_mismath_handling: $option" for option in (
"none", "warn", "convert", "error")
set_preferences!(Lux, "eltype_mismatch_handling" => option; force=true)
try
# `run` throws when the child process exits with a non-zero status.
run(`$(Base.julia_cmd()) --color=yes --project=$(dirname(Pkg.project().path))
--startup-file=no --code-coverage=user $(@__DIR__)/eltype_matching.jl`)
@test true
catch
# The exception itself is discarded; only a failing test is recorded.
@test false
end
end
end

# Set preferences tests
if ("all" in LUX_TEST_GROUP || "others" in LUX_TEST_GROUP)
@testset "DispatchDoctor Preferences" begin
Expand Down

3 comments on commit 0b51676

@avik-pal
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/115081

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

git tag -a v1.0.2 -m "<description of version>" 0b51676247f6df9f1553c06941889b90b689fc2f
git push origin v1.0.2

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lux Benchmarks

Benchmark suite Current: 0b51676 Previous: f5adbfe Ratio
Dense(512 => 512, identity)(512 x 128)/forward/CPU/2 thread(s) 411125 ns 411750 ns 1.00
Dense(512 => 512, identity)(512 x 128)/forward/CPU/4 thread(s) 322750 ns 322250 ns 1.00
Dense(512 => 512, identity)(512 x 128)/forward/CPU/8 thread(s) 244083 ns 244354.5 ns 1.00
Dense(512 => 512, identity)(512 x 128)/forward/CPU/1 thread(s) 740229 ns 739959 ns 1.00
Dense(512 => 512, identity)(512 x 128)/forward/GPU/CUDA 43576 ns 44622 ns 0.98
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/2 thread(s) 1361688 ns 1314687.5 ns 1.04
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/4 thread(s) 2448167 ns 2415854 ns 1.01
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/8 thread(s) 16505500 ns 16411375 ns 1.01
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/1 thread(s) 2198042 ns 2250459 ns 0.98
Dense(512 => 512, identity)(512 x 128)/zygote/GPU/CUDA 207361 ns 210429 ns 0.99
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/2 thread(s) 1419479 ns 1387333 ns 1.02
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/4 thread(s) 931729 ns 913146 ns 1.02
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/8 thread(s) 1582917 ns 1549208 ns 1.02
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/1 thread(s) 2213229 ns 2241750 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1768708 ns 1764229.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1072541.5 ns 1093291 ns 0.98
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1542417 ns 1517187.5 ns 1.02
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 3010167 ns 2995125 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA 208923 ns 211213.5 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12164458 ns 12135312.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 8831167 ns 8821250.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9231125 ns 9211042 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18575542 ns 18575959 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1506706 ns 1486214 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17297875 ns 17305854 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 13966709 ns 13958125 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14490229 ns 14521020.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21825958 ns 21821271.5 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 250077771 ns 250357604 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 148351292 ns 148471959 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 116742208 ns 116711333.5 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 446235042 ns 447366750 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5474148 ns 5485324 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 1226735000 ns 1224804208 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 933099541 ns 931517375 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 833488083 ns 829351334 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1628798917 ns 1631699042 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 31247743 ns 31517422 ns 0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1139513458 ns 1033852125 ns 1.10
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 1004012958 ns 985852708.5 ns 1.02
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1343460771 ns 1297620895.5 ns 1.04
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 1729098333 ns 1729286083.5 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s) 1084187.5 ns 1104458 ns 0.98
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s) 1632875 ns 1636000 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s) 3807833 ns 3608917 ns 1.06
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s) 781500 ns 779208.5 ns 1.00
lenet(28, 28, 1, 32)/forward/GPU/CUDA 269181 ns 263937.5 ns 1.02
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s) 2973917 ns 3005562.5 ns 0.99
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s) 4123458 ns 4102687.5 ns 1.01
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s) 11391021 ns 9959375 ns 1.14
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s) 3140229.5 ns 3171291.5 ns 0.99
lenet(28, 28, 1, 32)/zygote/GPU/CUDA 1147789 ns 1093100.5 ns 1.05
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 2327458.5 ns 2309084 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1427875 ns 1395125 ns 1.02
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1552208 ns 1532166.5 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 4203041 ns 4206854.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 209123 ns 207803.5 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 19423562 ns 19411834 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 16279416 ns 16074729.5 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 17361812 ns 17204167 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 25815125 ns 25846729 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1606839 ns 1588884 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 34524104 ns 34056708 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 31057875 ns 30790312.5 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 31105416 ns 31003625 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 36883875 ns 37038625 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 4526208.5 ns 4527562.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2777083.5 ns 2780667 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2685312.5 ns 2672396 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 8381562.5 ns 8380583 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 373639 ns 420119 ns 0.89
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 38887521 ns 39090938 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 32509584 ns 32065354 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 32333229 ns 32270791 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 51833125 ns 51859750 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2633953 ns 2623535 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 88607687.5 ns 89004313 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 113743125 ns 114465416 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 227726583 ns 219243333 ns 1.04
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 74951083 ns 74793562.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 267716166 ns 268192958 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 159256375 ns 159139541 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 123708895.5 ns 123304667 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 485091625 ns 484886208 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 7022924 ns 7013600 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 1478680979 ns 1472254854 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 1179547083 ns 1174209000 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 1066054563 ns 1058770187.5 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 2001889209 ns 2000167187.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 34822377.5 ns 34540951 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1724298291 ns 1715889292 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 1565497271 ns 1527816438 ns 1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1925114250 ns 1882392833 ns 1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 2239111625 ns 2226899333 ns 1.01
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s) 2028500 ns 2068395.5 ns 0.98
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s) 2967646 ns 2993084 ns 0.99
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s) 8104667 ns 8374792 ns 0.97
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s) 2308041.5 ns 2453875.5 ns 0.94
lenet(28, 28, 1, 128)/forward/GPU/CUDA 272667 ns 266526.5 ns 1.02
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s) 9619395.5 ns 9618542 ns 1.00
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s) 12015166 ns 12066625 ns 1.00
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s) 26324292 ns 23824500 ns 1.10
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s) 11677541 ns 11760125.5 ns 0.99
lenet(28, 28, 1, 128)/zygote/GPU/CUDA 1188628.5 ns 1164719.5 ns 1.02
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s) 383215354.5 ns 382306709 ns 1.00
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s) 284366604.5 ns 285915229.5 ns 0.99
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s) 261725395.5 ns 259469458 ns 1.01
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s) 453056042 ns 452429396 ns 1.00
vgg16(32, 32, 3, 32)/forward/GPU/CUDA 5009701 ns 4851990 ns 1.03
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s) 1160384584 ns 1152636625 ns 1.01
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s) 912166042 ns 942909208 ns 0.97
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s) 984922208 ns 988346750 ns 1.00
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s) 1396092167 ns 1394608042 ns 1.00
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA 18111984 ns 17883204 ns 1.01
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s) 1053833 ns 1047084 ns 1.01
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s) 1605958 ns 2051062.5 ns 0.78
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s) 5411083 ns 5536708 ns 0.98
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s) 1296875 ns 1365833.5 ns 0.95
lenet(28, 28, 1, 64)/forward/GPU/CUDA 265721 ns 273727 ns 0.97
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s) 6510958 ns 6487104 ns 1.00
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s) 13082584 ns 12416624.5 ns 1.05
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s) 21760833.5 ns 18396062.5 ns 1.18
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s) 5984375 ns 6074542 ns 0.99
lenet(28, 28, 1, 64)/zygote/GPU/CUDA 1208949 ns 1242828 ns 0.97
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 70494333 ns 70480271 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 43641125 ns 43555583 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 39690584 ns 39728521 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 133468354 ns 132459104 ns 1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1945255.5 ns 1879688 ns 1.03
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 356723479.5 ns 356722500 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 271306709 ns 270518833 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 254269771 ns 253991500 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 536238459 ns 534459625 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 12301288 ns 12289288 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 395599834 ns 395296292 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 377440167 ns 405206479 ns 0.93
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 697289229.5 ns 702801292 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 708495833 ns 709895792 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s) 1188885083 ns 1186905458 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s) 692916625 ns 688634396 ns 1.01
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s) 642915416.5 ns 641177604 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s) 1776695937.5 ns 1774744187 ns 1.00
vgg16(32, 32, 3, 128)/forward/GPU/CUDA 12306515 ns 12312145 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s) 3668882667 ns 3681320875 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s) 2834396125 ns 2815834792 ns 1.01
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s) 2699395792 ns 2699549167 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s) 5050853166 ns 5054825084 ns 1.00
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA 49852240.5 ns 49638979 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3422958 ns 3415021 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2075583 ns 2058416.5 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2513666 ns 2523458 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 6018396 ns 6016791 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 317455.5 ns 345305 ns 0.92
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 26048666 ns 26262750 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 19094062.5 ns 18935500 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 19316000 ns 19377771 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 39190562.5 ns 39256000 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2466381 ns 2462287 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 55369583 ns 55393875 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 82210395.5 ns 81461166 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 173994812.5 ns 173473458 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 45354333 ns 45537167 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1779187.5 ns 1775625 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1097834 ns 1108166 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1568791 ns 1574125 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 3021312 ns 3027041 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 210623 ns 213889 ns 0.98
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12543916 ns 12554020.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9277708.5 ns 9212687 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9594229.5 ns 9634625 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18987604.5 ns 18974791 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1527868.5 ns 1535990.5 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17650708 ns 17660667 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14335458 ns 14318666.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14544250 ns 14527083 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 22174250 ns 22176166 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 70431125 ns 70469833.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 43537125 ns 43612917 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 39620583 ns 39834375 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 132531916.5 ns 132581875 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1888879 ns 1939391 ns 0.97
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 360439083.5 ns 362447520.5 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 347132666.5 ns 345850729 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 304637542 ns 304601834 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 722631792 ns 723285166 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 13304668 ns 13373827 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 419234750 ns 418608542 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 421465729 ns 424576167 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 724319500 ns 708932270.5 ns 1.02
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 714217917 ns 719137166 ns 0.99
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s) 1705416 ns 1646791.5 ns 1.04
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s) 1350333.5 ns 1350958 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s) 1170667 ns 1155333 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s) 2385333.5 ns 2429416.5 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA 580442.5 ns 590640 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s) 8948271 ns 8673937.5 ns 1.03
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s) 12980437.5 ns 12961417 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s) 32353312.5 ns 32282958 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s) 9804417 ns 9836041 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA 1427987.5 ns 1466324 ns 0.97
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s) 17962354 ns 17283334 ns 1.04
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s) 17440000 ns 17102416.5 ns 1.02
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s) 29738291 ns 29614750 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s) 14431937.5 ns 14366916 ns 1.00
Dense(512 => 512, relu)(512 x 128)/forward/CPU/2 thread(s) 669833.5 ns 668167 ns 1.00
Dense(512 => 512, relu)(512 x 128)/forward/CPU/4 thread(s) 529250 ns 576791 ns 0.92
Dense(512 => 512, relu)(512 x 128)/forward/CPU/8 thread(s) 1065708.5 ns 1066666.5 ns 1.00
Dense(512 => 512, relu)(512 x 128)/forward/CPU/1 thread(s) 725395.5 ns 725292 ns 1.00
Dense(512 => 512, relu)(512 x 128)/forward/GPU/CUDA 47647 ns 48292 ns 0.99
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/2 thread(s) 1549104 ns 1517874.5 ns 1.02
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/4 thread(s) 1038917 ns 1004646 ns 1.03
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/8 thread(s) 1517584 ns 1520604 ns 1.00
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/1 thread(s) 2269896 ns 2250708.5 ns 1.01
Dense(512 => 512, relu)(512 x 128)/zygote/GPU/CUDA 233022 ns 239031.5 ns 0.97
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/2 thread(s) 1582916 ns 1572667 ns 1.01
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/4 thread(s) 1087854.5 ns 1074146 ns 1.01
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/8 thread(s) 1464166 ns 1411458 ns 1.04
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/1 thread(s) 2190854 ns 2225791.5 ns 0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3413625 ns 3403354 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2047083 ns 2053083 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2507333.5 ns 2486145.5 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 6011813 ns 5997125 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA 284231.5 ns 289032.5 ns 0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 24149000 ns 24071812.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 17330312.5 ns 17199083 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 17059271 ns 17076604 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 37480499.5 ns 37510583.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2394265 ns 2401628 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 53573937.5 ns 53560041.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 83649500 ns 81012667 ns 1.03
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 172928458 ns 171727292 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 44425187.5 ns 44535666.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 249999250 ns 250063500 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 148223583 ns 148044042 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 116384896 ns 116121687.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 447335937.5 ns 446980916.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5449146 ns 5449734 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 1105347792 ns 1103639292 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 857822708.5 ns 857470145.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 830398396 ns 823519999.5 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1762030583 ns 1754095625 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 28862807 ns 29250301 ns 0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1020245354 ns 1017950791.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 966178875 ns 975600583 ns 0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1293466208 ns 1309755625 ns 0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 1724193375.5 ns 1718285542 ns 1.00
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s) 1306896.5 ns 1300041 ns 1.01
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s) 984292 ns 946500 ns 1.04
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s) 778437.5 ns 781542 ns 1.00
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s) 1958750 ns 1942625 ns 1.01
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA 566426 ns 559913 ns 1.01
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s) 6042375 ns 5979812 ns 1.01
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s) 6715125 ns 6230958.5 ns 1.08
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s) 26872708 ns 25879708.5 ns 1.04
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s) 6973417 ns 7087208 ns 0.98
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA 1365853 ns 1361665 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s) 11215770.5 ns 11063062 ns 1.01
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s) 10033208 ns 10077750 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s) 17672208 ns 17523854 ns 1.01
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s) 8568500 ns 8754104 ns 0.98
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/2 thread(s) 399500 ns 358709 ns 1.11
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/4 thread(s) 399291.5 ns 439875 ns 0.91
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/8 thread(s) 3544167 ns 3375791.5 ns 1.05
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/1 thread(s) 88459 ns 88834 ns 1.00
Dense(128 => 128, gelu)(128 x 128)/forward/GPU/CUDA 27618 ns 27879 ns 0.99
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/2 thread(s) 397459 ns 388437.5 ns 1.02
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/4 thread(s) 445041.5 ns 426292 ns 1.04
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/8 thread(s) 4819375 ns 4297000 ns 1.12
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/1 thread(s) 259833 ns 258500 ns 1.01
Dense(128 => 128, gelu)(128 x 128)/zygote/GPU/CUDA 219889.5 ns 218884.5 ns 1.00
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/2 thread(s) 428313 ns 419000 ns 1.02
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/4 thread(s) 475541 ns 456354 ns 1.04
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/8 thread(s) 4960437.5 ns 4722375 ns 1.05
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/1 thread(s) 271333 ns 271062.5 ns 1.00
Dense(128 => 128, relu)(128 x 128)/forward/CPU/2 thread(s) 343709 ns 306375 ns 1.12
Dense(128 => 128, relu)(128 x 128)/forward/CPU/4 thread(s) 333937.5 ns 375708.5 ns 0.89
Dense(128 => 128, relu)(128 x 128)/forward/CPU/8 thread(s) 769833 ns 776917 ns 0.99
Dense(128 => 128, relu)(128 x 128)/forward/CPU/1 thread(s) 53125 ns 53833 ns 0.99
Dense(128 => 128, relu)(128 x 128)/forward/GPU/CUDA 28016 ns 27939 ns 1.00
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/2 thread(s) 362209 ns 351666 ns 1.03
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/4 thread(s) 342792 ns 314041 ns 1.09
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/8 thread(s) 897833 ns 420792 ns 2.13
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/1 thread(s) 152583 ns 151583 ns 1.01
Dense(128 => 128, relu)(128 x 128)/zygote/GPU/CUDA 205326.5 ns 204717.5 ns 1.00
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/2 thread(s) 378500 ns 366416 ns 1.03
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/4 thread(s) 358042 ns 329334 ns 1.09
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/8 thread(s) 728708 ns 423542 ns 1.72
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/1 thread(s) 150833.5 ns 150833 ns 1.00
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s) 603479208 ns 603125958 ns 1.00
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s) 429058104 ns 424709854 ns 1.01
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s) 385950542 ns 379453834 ns 1.02
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s) 872372584 ns 872147584 ns 1.00
vgg16(32, 32, 3, 64)/forward/GPU/CUDA 7023071 ns 7026308.5 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s) 2010730958 ns 2006276000.5 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s) 1608264687.5 ns 1611544791.5 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s) 1653085833 ns 1550847520.5 ns 1.07
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s) 2638084625 ns 2621300375 ns 1.01
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA 25932761 ns 25894358 ns 1.00
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/2 thread(s) 535250 ns 524666 ns 1.02
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/4 thread(s) 433291.5 ns 431646 ns 1.00
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/8 thread(s) 3023791.5 ns 2828083 ns 1.07
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/1 thread(s) 880791 ns 865708.5 ns 1.02
Dense(512 => 512, gelu)(512 x 128)/forward/GPU/CUDA 46986 ns 47753 ns 0.98
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/2 thread(s) 1881604 ns 1892208 ns 0.99
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/4 thread(s) 2798729 ns 2773459 ns 1.01
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/8 thread(s) 16356750 ns 16216042 ns 1.01
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/1 thread(s) 2759229 ns 2764145.5 ns 1.00
Dense(512 => 512, gelu)(512 x 128)/zygote/GPU/CUDA 246659.5 ns 250438 ns 0.98
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/2 thread(s) 1962958.5 ns 1946958.5 ns 1.01
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/4 thread(s) 5070604 ns 5023625 ns 1.01
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/8 thread(s) 16396875 ns 16786084 ns 0.98
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/1 thread(s) 2785625.5 ns 2779000 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s) 1614125 ns 1564584 ns 1.03
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s) 1235583 ns 1208666.5 ns 1.02
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s) 1027208 ns 946958 ns 1.08
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s) 2300875 ns 2330542 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA 587018.5 ns 588876.5 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s) 5921542 ns 5931146 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s) 5089688 ns 4679292 ns 1.09
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s) 26372271 ns 25938874.5 ns 1.02
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s) 7288250 ns 7312458.5 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA 1379747.5 ns 1358666 ns 1.02
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s) 13324958 ns 13317979 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s) 12237645.5 ns 11993375 ns 1.02
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s) 21281499.5 ns 20776000 ns 1.02
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s) 10668750 ns 10716854.5 ns 1.00
Dense(16 => 16, relu)(16 x 128)/forward/CPU/2 thread(s) 4417 ns 2625 ns 1.68
Dense(16 => 16, relu)(16 x 128)/forward/CPU/4 thread(s) 2583.5 ns 2292 ns 1.13
Dense(16 => 16, relu)(16 x 128)/forward/CPU/8 thread(s) 2750 ns 3542 ns 0.78
Dense(16 => 16, relu)(16 x 128)/forward/CPU/1 thread(s) 2500 ns 2333.5 ns 1.07
Dense(16 => 16, relu)(16 x 128)/forward/GPU/CUDA 24754 ns 24837.5 ns 1.00
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/2 thread(s) 7459 ns 7083 ns 1.05
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/4 thread(s) 7250 ns 7084 ns 1.02
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/8 thread(s) 7333 ns 7167 ns 1.02
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/1 thread(s) 7083 ns 7125 ns 0.99
Dense(16 => 16, relu)(16 x 128)/zygote/GPU/CUDA 213008 ns 210657.5 ns 1.01
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/2 thread(s) 8375 ns 8208 ns 1.02
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/4 thread(s) 8583 ns 8167 ns 1.05
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/8 thread(s) 8459 ns 8292 ns 1.02
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/1 thread(s) 5834 ns 6000 ns 0.97
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/2 thread(s) 10625 ns 10312.5 ns 1.03
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/4 thread(s) 13708 ns 13625 ns 1.01
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/8 thread(s) 12042 ns 10667 ns 1.13
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/1 thread(s) 7500 ns 6917 ns 1.08
Dense(16 => 16, gelu)(16 x 128)/forward/GPU/CUDA 25091.5 ns 25243 ns 0.99
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/2 thread(s) 20250 ns 19959 ns 1.01
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/4 thread(s) 19959 ns 19792 ns 1.01
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/8 thread(s) 20083 ns 20084 ns 1.00
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/1 thread(s) 19875 ns 19959 ns 1.00
Dense(16 => 16, gelu)(16 x 128)/zygote/GPU/CUDA 231793 ns 230204.5 ns 1.01
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/2 thread(s) 23625 ns 23583 ns 1.00
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/4 thread(s) 23667 ns 23458 ns 1.01
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/8 thread(s) 23666 ns 23583 ns 1.00
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/1 thread(s) 21084 ns 21375 ns 0.99
Dense(128 => 128, identity)(128 x 128)/forward/CPU/2 thread(s) 28708 ns 28875 ns 0.99
Dense(128 => 128, identity)(128 x 128)/forward/CPU/4 thread(s) 29292 ns 28542 ns 1.03
Dense(128 => 128, identity)(128 x 128)/forward/CPU/8 thread(s) 28375 ns 28542 ns 0.99
Dense(128 => 128, identity)(128 x 128)/forward/CPU/1 thread(s) 46584 ns 46084 ns 1.01
Dense(128 => 128, identity)(128 x 128)/forward/GPU/CUDA 26247 ns 26158 ns 1.00
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/2 thread(s) 222250 ns 223771 ns 0.99
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/4 thread(s) 279729.5 ns 274229.5 ns 1.02
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/8 thread(s) 4335396.5 ns 4189916 ns 1.03
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/1 thread(s) 145208 ns 144958 ns 1.00
Dense(128 => 128, identity)(128 x 128)/zygote/GPU/CUDA 203061 ns 206708.5 ns 0.98
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/2 thread(s) 333124.5 ns 331333 ns 1.01
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/4 thread(s) 322500 ns 311771 ns 1.03
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/8 thread(s) 861333 ns 855937.5 ns 1.01
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/1 thread(s) 160750 ns 160334 ns 1.00
Dense(16 => 16, identity)(16 x 128)/forward/CPU/2 thread(s) 1875 ns 1875 ns 1
Dense(16 => 16, identity)(16 x 128)/forward/CPU/4 thread(s) 1958 ns 2000 ns 0.98
Dense(16 => 16, identity)(16 x 128)/forward/CPU/8 thread(s) 2416 ns 2750 ns 0.88
Dense(16 => 16, identity)(16 x 128)/forward/CPU/1 thread(s) 1792 ns 3833.5 ns 0.47
Dense(16 => 16, identity)(16 x 128)/forward/GPU/CUDA 23061 ns 23305 ns 0.99
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/2 thread(s) 5458 ns 5459 ns 1.00
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/4 thread(s) 5500 ns 5292 ns 1.04
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/8 thread(s) 5375 ns 5291 ns 1.02
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/1 thread(s) 5375 ns 5209 ns 1.03
Dense(16 => 16, identity)(16 x 128)/zygote/GPU/CUDA 243257 ns 255218.5 ns 0.95
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/2 thread(s) 11333.5 ns 11500 ns 0.99
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/4 thread(s) 11208 ns 11375 ns 0.99
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/8 thread(s) 11667 ns 11417 ns 1.02
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/1 thread(s) 6833 ns 6791 ns 1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 79834791 ns 79822041 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 49125291 ns 49051354.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 43259375 ns 43286875 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 151428917 ns 151651459 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 2726005 ns 2720855 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 498680292 ns 667046042 ns 0.75
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 414152083 ns 413223250 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 396991709 ns 397303625 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 689086500 ns 681225125 ns 1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 14585553 ns 14587521 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 712438146 ns 715487875 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 683887166 ns 677171917 ns 1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 1013847083 ns 1012616958 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 999589459 ns 1001064708 ns 1.00

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.