From 3f04a79ada7ca974176a0c7c3c3306f394eae9a9 Mon Sep 17 00:00:00 2001 From: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com> Date: Sun, 7 Jan 2024 14:40:15 +0100 Subject: [PATCH 1/6] Use cfg to seperate benchmark results based on features --- candle-core/benches/bench_utils.rs | 56 ++++++++++++++++++++++++++++++ candle-core/benches/matmul.rs | 16 ++++----- 2 files changed, 64 insertions(+), 8 deletions(-) create mode 100644 candle-core/benches/bench_utils.rs diff --git a/candle-core/benches/bench_utils.rs b/candle-core/benches/bench_utils.rs new file mode 100644 index 0000000000..75800761f4 --- /dev/null +++ b/candle-core/benches/bench_utils.rs @@ -0,0 +1,56 @@ +use candle_core::{Device, Result}; + +pub(crate) trait BenchDevice { + fn sync(&self) -> Result<()>; +} + +impl BenchDevice for Device { + fn sync(&self) -> Result<()> { + match self { + Device::Cpu => Ok(()), + Device::Cuda(device) => { + #[cfg(feature = "cuda")] + return Ok(device.synchronize()?); + #[cfg(not(feature = "cuda"))] + panic!("Cuda device without cuda feature enabled: {:?}", device) + } + Device::Metal(device) => { + #[cfg(feature = "metal")] + return Ok(device.wait_until_completed()?); + #[cfg(not(feature = "metal"))] + panic!("Metal device without metal feature enabled: {:?}", device) + } + } + } +} + +#[allow(dead_code)] +pub(crate) fn device() -> Result { + return if cfg!(feature = "metal") { + Device::new_metal(0) + } else if cfg!(feature = "cuda") { + Device::new_cuda(0) + } else { + Ok(Device::Cpu) + }; +} + +#[allow(dead_code)] +pub(crate) fn bench_name>(name: S) -> String { + format!("{}_{}", device_variant(), name.into()) +} + +#[allow(dead_code)] +const fn device_variant() -> &'static str { + return if cfg!(feature = "metal") { + "metal" + } else if cfg!(feature = "cuda") { + "cuda" + } else if cfg!(feature = "accelerate") { + "accelerate" + } else if cfg!(feature = "mkl") { + "mkl" + } else { + "cpu" + }; +} diff --git a/candle-core/benches/matmul.rs b/candle-core/benches/matmul.rs index 83679771b2..4f7dfa6c9f 100644 --- a/candle-core/benches/matmul.rs +++ b/candle-core/benches/matmul.rs @@ -1,4 +1,8 @@ -use candle_core::{DType, Device, Tensor}; +mod bench_utils; + +use crate::bench_utils::bench_name; +use bench_utils::{device, BenchDevice}; +use candle_core::{DType, Tensor}; use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; use std::time::Instant; @@ -12,14 +16,14 @@ fn criterion_benchmark(c: &mut Criterion) { let n = 2048; let k = 2048; - let device = Device::new_metal(0).unwrap(); + let device = device().unwrap(); let dtype = DType::F32; let lhs = Tensor::zeros((b, m, k), dtype, &device).unwrap(); let rhs = Tensor::zeros((b, n, k), dtype, &device).unwrap(); let flops = b * m * n * k; - let mut group = c.benchmark_group("matmul_metal"); + let mut group = c.benchmark_group(bench_name("matmul")); group.throughput(Throughput::Bytes(flops as u64)); group.bench_function("iter", move |b| { b.iter_custom(|iters| { @@ -27,11 +31,7 @@ fn criterion_benchmark(c: &mut Criterion) { for _i in 0..iters { run(black_box(&lhs), black_box(&rhs)); } - if let Device::Metal(device) = &device { - device.wait_until_completed().unwrap(); - } else { - panic!("Expected metal device"); - } + device.sync().unwrap(); start.elapsed() }) }); From ad075a5f7edb0114b820b3e99a19b17d0d25ec3b Mon Sep 17 00:00:00 2001 From: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com> Date: Mon, 8 Jan 2024 06:48:33 +0100 Subject: [PATCH 2/6] Remove allow pragma --- candle-core/benches/matmul.rs | 5 ++--- candle-core/benches/{bench_utils.rs => utils.rs} | 3 --- 2 files changed, 2 insertions(+), 6 deletions(-) rename candle-core/benches/{bench_utils.rs => utils.rs} (96%) diff --git a/candle-core/benches/matmul.rs b/candle-core/benches/matmul.rs index 4f7dfa6c9f..a5dba9ccdb 100644 --- a/candle-core/benches/matmul.rs +++ b/candle-core/benches/matmul.rs @@ -1,10 +1,9 @@ -mod bench_utils; +mod utils; -use crate::bench_utils::bench_name; -use bench_utils::{device, BenchDevice}; use candle_core::{DType, Tensor}; use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; use std::time::Instant; +use utils::{bench_name, device, BenchDevice}; fn run(a: &Tensor, b: &Tensor) { a.matmul(&b.t().unwrap()).unwrap(); diff --git a/candle-core/benches/bench_utils.rs b/candle-core/benches/utils.rs similarity index 96% rename from candle-core/benches/bench_utils.rs rename to candle-core/benches/utils.rs index 75800761f4..a93afc6e76 100644 --- a/candle-core/benches/bench_utils.rs +++ b/candle-core/benches/utils.rs @@ -24,7 +24,6 @@ impl BenchDevice for Device { } } -#[allow(dead_code)] pub(crate) fn device() -> Result { return if cfg!(feature = "metal") { Device::new_metal(0) @@ -35,12 +34,10 @@ pub(crate) fn device() -> Result { }; } -#[allow(dead_code)] pub(crate) fn bench_name>(name: S) -> String { format!("{}_{}", device_variant(), name.into()) } -#[allow(dead_code)] const fn device_variant() -> &'static str { return if cfg!(feature = "metal") { "metal" From fb05af4c42a6343856f167893b19974ed6b50276 Mon Sep 17 00:00:00 2001 From: Laurent Date: Mon, 8 Jan 2024 07:19:59 +0100 Subject: [PATCH 3/6] Avoid some unnecessary returns. --- candle-core/benches/utils.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/candle-core/benches/utils.rs b/candle-core/benches/utils.rs index a93afc6e76..3e8b3c5776 100644 --- a/candle-core/benches/utils.rs +++ b/candle-core/benches/utils.rs @@ -25,13 +25,13 @@ impl BenchDevice for Device { } pub(crate) fn device() -> Result { - return if cfg!(feature = "metal") { + if cfg!(feature = "metal") { Device::new_metal(0) } else if cfg!(feature = "cuda") { Device::new_cuda(0) } else { Ok(Device::Cpu) - }; + } } pub(crate) fn bench_name>(name: S) -> String { @@ -39,7 +39,7 @@ pub(crate) fn bench_name>(name: S) -> String { } const fn device_variant() -> &'static str { - return if cfg!(feature = "metal") { + if cfg!(feature = "metal") { "metal" } else if cfg!(feature = "cuda") { "cuda" @@ -49,5 +49,5 @@ const fn device_variant() -> &'static str { "mkl" } else { "cpu" - }; + } } From 88945f2c227d6a76e8f133d8b72c701b180c4f4c Mon Sep 17 00:00:00 2001 From: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com> Date: Tue, 9 Jan 2024 18:31:28 +0100 Subject: [PATCH 4/6] Improve benchmarks layout --- candle-core/Cargo.toml | 2 +- candle-core/benches/bench_main.rs | 4 ++++ candle-core/benches/{ => benchmarks}/matmul.rs | 7 ++----- candle-core/benches/{utils.rs => benchmarks/mod.rs} | 2 ++ 4 files changed, 9 insertions(+), 6 deletions(-) create mode 100644 candle-core/benches/bench_main.rs rename candle-core/benches/{ => benchmarks}/matmul.rs (85%) rename candle-core/benches/{utils.rs => benchmarks/mod.rs} (98%) diff --git a/candle-core/Cargo.toml b/candle-core/Cargo.toml index 91655f5782..93b718a3e3 100644 --- a/candle-core/Cargo.toml +++ b/candle-core/Cargo.toml @@ -46,6 +46,6 @@ accelerate = ["dep:libc", "dep:accelerate-src"] metal = ["dep:metal", "dep:candle-metal-kernels"] [[bench]] -name = "matmul" +name = "bench_main" harness = false diff --git a/candle-core/benches/bench_main.rs b/candle-core/benches/bench_main.rs new file mode 100644 index 0000000000..4425f2fb32 --- /dev/null +++ b/candle-core/benches/bench_main.rs @@ -0,0 +1,4 @@ +mod benchmarks; + +use criterion::criterion_main; +criterion_main!(benchmarks::matmul::benches); diff --git a/candle-core/benches/matmul.rs b/candle-core/benches/benchmarks/matmul.rs similarity index 85% rename from candle-core/benches/matmul.rs rename to candle-core/benches/benchmarks/matmul.rs index a5dba9ccdb..fb173f0475 100644 --- a/candle-core/benches/matmul.rs +++ b/candle-core/benches/benchmarks/matmul.rs @@ -1,9 +1,7 @@ -mod utils; - +use crate::benchmarks::{bench_name, device, BenchDevice}; use candle_core::{DType, Tensor}; -use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; +use criterion::{black_box, criterion_group, Criterion, Throughput}; use std::time::Instant; -use utils::{bench_name, device, BenchDevice}; fn run(a: &Tensor, b: &Tensor) { a.matmul(&b.t().unwrap()).unwrap(); @@ -38,4 +36,3 @@ fn criterion_benchmark(c: &mut Criterion) { } criterion_group!(benches, criterion_benchmark); -criterion_main!(benches); diff --git a/candle-core/benches/utils.rs b/candle-core/benches/benchmarks/mod.rs similarity index 98% rename from candle-core/benches/utils.rs rename to candle-core/benches/benchmarks/mod.rs index 3e8b3c5776..1344770dce 100644 --- a/candle-core/benches/utils.rs +++ b/candle-core/benches/benchmarks/mod.rs @@ -1,3 +1,5 @@ +pub(crate) mod matmul; + use candle_core::{Device, Result}; pub(crate) trait BenchDevice { From dc3725b9594e874246cd68a914d15412d345cbb4 Mon Sep 17 00:00:00 2001 From: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com> Date: Wed, 10 Jan 2024 16:16:18 +0100 Subject: [PATCH 5/6] Derive bench_name from actual device --- candle-core/benches/benchmarks/matmul.rs | 4 +-- candle-core/benches/benchmarks/mod.rs | 37 ++++++++++++------------ 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/candle-core/benches/benchmarks/matmul.rs b/candle-core/benches/benchmarks/matmul.rs index fb173f0475..d0a276e724 100644 --- a/candle-core/benches/benchmarks/matmul.rs +++ b/candle-core/benches/benchmarks/matmul.rs @@ -1,4 +1,4 @@ -use crate::benchmarks::{bench_name, device, BenchDevice}; +use crate::benchmarks::{device, BenchDevice}; use candle_core::{DType, Tensor}; use criterion::{black_box, criterion_group, Criterion, Throughput}; use std::time::Instant; @@ -20,7 +20,7 @@ fn criterion_benchmark(c: &mut Criterion) { let flops = b * m * n * k; - let mut group = c.benchmark_group(bench_name("matmul")); + let mut group = c.benchmark_group(device.bench_name("matmul")); group.throughput(Throughput::Bytes(flops as u64)); group.bench_function("iter", move |b| { b.iter_custom(|iters| { diff --git a/candle-core/benches/benchmarks/mod.rs b/candle-core/benches/benchmarks/mod.rs index 1344770dce..4f7be553e2 100644 --- a/candle-core/benches/benchmarks/mod.rs +++ b/candle-core/benches/benchmarks/mod.rs @@ -4,6 +4,8 @@ use candle_core::{Device, Result}; pub(crate) trait BenchDevice { fn sync(&self) -> Result<()>; + + fn bench_name>(&self, name: S) -> String; } impl BenchDevice for Device { @@ -24,6 +26,23 @@ impl BenchDevice for Device { } } } + + fn bench_name>(&self, name: S) -> String { + match self { + Device::Cpu => { + let cpu_type = if cfg!(feature = "accelerate") { + "accelerate" + } else if cfg!(feature = "mkl") { + "mkl" + } else { + "cpu" + }; + format!("{}_{}", cpu_type, name.into()) + } + Device::Cuda(_) => format!("cuda_{}", name.into()), + Device::Metal(_) => format!("metal_{}", name.into()), + } + } } pub(crate) fn device() -> Result { @@ -35,21 +54,3 @@ pub(crate) fn device() -> Result { Ok(Device::Cpu) } } - -pub(crate) fn bench_name>(name: S) -> String { - format!("{}_{}", device_variant(), name.into()) -} - -const fn device_variant() -> &'static str { - if cfg!(feature = "metal") { - "metal" - } else if cfg!(feature = "cuda") { - "cuda" - } else if cfg!(feature = "accelerate") { - "accelerate" - } else if cfg!(feature = "mkl") { - "mkl" - } else { - "cpu" - } -} From e9e31f1996433d9b1041e1a8d0bfd956f7713cc9 Mon Sep 17 00:00:00 2001 From: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com> Date: Wed, 10 Jan 2024 20:28:13 +0100 Subject: [PATCH 6/6] Run CPU benchmarks even when GPU feature is enabled --- candle-core/benches/benchmarks/matmul.rs | 18 ++++++++++++------ candle-core/benches/benchmarks/mod.rs | 21 ++++++++++++++------- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/candle-core/benches/benchmarks/matmul.rs b/candle-core/benches/benchmarks/matmul.rs index d0a276e724..9d67e642cd 100644 --- a/candle-core/benches/benchmarks/matmul.rs +++ b/candle-core/benches/benchmarks/matmul.rs @@ -1,5 +1,5 @@ -use crate::benchmarks::{device, BenchDevice}; -use candle_core::{DType, Tensor}; +use crate::benchmarks::{BenchDevice, BenchDeviceHandler}; +use candle_core::{DType, Device, Tensor}; use criterion::{black_box, criterion_group, Criterion, Throughput}; use std::time::Instant; @@ -7,16 +7,15 @@ fn run(a: &Tensor, b: &Tensor) { a.matmul(&b.t().unwrap()).unwrap(); } -fn criterion_benchmark(c: &mut Criterion) { +fn run_bench(c: &mut Criterion, device: &Device) { let b = 1; let m = 1; let n = 2048; let k = 2048; - let device = device().unwrap(); let dtype = DType::F32; - let lhs = Tensor::zeros((b, m, k), dtype, &device).unwrap(); - let rhs = Tensor::zeros((b, n, k), dtype, &device).unwrap(); + let lhs = Tensor::zeros((b, m, k), dtype, device).unwrap(); + let rhs = Tensor::zeros((b, n, k), dtype, device).unwrap(); let flops = b * m * n * k; @@ -35,4 +34,11 @@ fn criterion_benchmark(c: &mut Criterion) { group.finish(); } +fn criterion_benchmark(c: &mut Criterion) { + let handler = BenchDeviceHandler::new().unwrap(); + for device in handler.devices { + run_bench(c, &device); + } +} + criterion_group!(benches, criterion_benchmark); diff --git a/candle-core/benches/benchmarks/mod.rs b/candle-core/benches/benchmarks/mod.rs index 4f7be553e2..295bbabda1 100644 --- a/candle-core/benches/benchmarks/mod.rs +++ b/candle-core/benches/benchmarks/mod.rs @@ -45,12 +45,19 @@ impl BenchDevice for Device { } } -pub(crate) fn device() -> Result { - if cfg!(feature = "metal") { - Device::new_metal(0) - } else if cfg!(feature = "cuda") { - Device::new_cuda(0) - } else { - Ok(Device::Cpu) +struct BenchDeviceHandler { + devices: Vec, +} + +impl BenchDeviceHandler { + pub fn new() -> Result { + let mut devices = Vec::new(); + if cfg!(feature = "metal") { + devices.push(Device::new_metal(0)?); + } else if cfg!(feature = "cuda") { + devices.push(Device::new_cuda(0)?); + } + devices.push(Device::Cpu); + Ok(Self { devices }) } }