perf: EXC-1942: Add BitVec create/grow benchmarks (#4030)

Creating 768GiB worth of pages (192M bits) takes 2.9ms:

```
test bitvec_from_elem_false/bits:256GiB/ops:1 ... bench: 144149 ns/iter (+/- 1961)
test bitvec_from_elem_false/bits:512GiB/ops:1 ... bench: 1336148 ns/iter (+/- 50658)
test bitvec_from_elem_false/bits:768GiB/ops:1 ... bench: 2917326 ns/iter (+/- 22253)
```

There is a significant performance improvement for 1TiB worth of pages (256M bits or 32MiB):

```
test bitvec_from_elem_false/bits:1008GiB/ops:1 ... bench: 4069684 ns/iter (+/- 17428)
test bitvec_from_elem_false/bits:1TiB/ops:1 ... bench: 11312 ns/iter (+/- 53)
test bitvec_from_elem_false/bits:2TiB/ops:1 ... bench: 12974 ns/iter (+/- 209)
```

The standard vector behaves similarly, so this is most likely due to optimizations in the underlying memory allocator:

```
test vec_from_elem_0/bits:768GiB/ops:1 ... bench: 2709736 ns/iter (+/- 27170)
test vec_from_elem_0/bits:1008GiB/ops:1 ... bench: 3843070 ns/iter (+/- 111132)
test vec_from_elem_0/bits:1TiB/ops:1 ... bench: 11379 ns/iter (+/- 564)
test vec_from_elem_0/bits:2TiB/ops:1 ... bench: 12687 ns/iter (+/- 110)
```

Growing 768GiB worth of pages (192M bits) takes 2.6ms:

```
test bitvec_grow_false/bits:256GiB/ops:1 ... bench: 186167 ns/iter (+/- 2079)
test bitvec_grow_false/bits:512GiB/ops:1 ... bench: 1169816 ns/iter (+/- 20458)
test bitvec_grow_false/bits:768GiB/ops:1 ... bench: 2617347 ns/iter (+/- 22551)
```

There is a significant performance degradation for 1TiB worth of pages (256M bits or 32MiB). The root cause is likely the underlying memory allocator behavior and the fact that `BitVec::grow` initializes all the data.

```
test bitvec_grow_false/bits:1008GiB/ops:1 ... bench: 3568375 ns/iter (+/- 54736)
test bitvec_grow_false/bits:1TiB/ops:1 ... bench: 23986994 ns/iter (+/- 463857)
test bitvec_grow_false/bits:2TiB/ops:1 ... bench: 51976114 ns/iter (+/- 1330421)
```

The benchmark also confirms that allocating non-initialized memory (`bitvec_with_capacity` or `bitvec_reserve`) takes constant time:

```
test bitvec_with_capacity/bits:1TiB/ops:1 ... bench: 11705 ns/iter (+/- 246)
test bitvec_with_capacity/bits:2TiB/ops:1 ... bench: 12682 ns/iter (+/- 48)
test bitvec_reserve/bits:1TiB/ops:1 ... bench: 11379 ns/iter (+/- 84)
test bitvec_reserve/bits:2TiB/ops:1 ... bench: 13149 ns/iter (+/- 112)
```

The benchmark shows that the canister scheduling overhead should be a function of state size. Also, for large memory allocations, `BitVec` itself should be optimized.
dfinity · Feb 21, 2025 · 98fa250 · 98fa250
1 parent 2bbbfd1
commit 98fa250
Show file tree

Hide file tree

Showing 3 changed files with 124 additions and 0 deletions.
diff --git a/rs/memory_tracker/BUILD.bazel b/rs/memory_tracker/BUILD.bazel
@@ -57,3 +57,15 @@ rust_bench(
         "@crate_index//:rayon",
     ] + DEPENDENCIES,
 )
+
+# Criterion benchmark for `BitVec` creation and growth (benches/bit_vec.rs).
+# The benchmark source only uses `criterion` and `bit_vec`, so no extra
+# crates beyond the shared DEPENDENCIES list are needed.
+rust_bench(
+    name = "bit_vec",
+    testonly = True,
+    srcs = ["benches/bit_vec.rs"],
+    deps = [
+        # Keep sorted.
+        "@crate_index//:criterion",
+    ] + DEPENDENCIES,
+)
diff --git a/rs/memory_tracker/Cargo.toml b/rs/memory_tracker/Cargo.toml
@@ -37,3 +37,7 @@ name = "traps"
 [[bench]]
 harness = false
 name = "memory_ops"
+
+# benches/bit_vec.rs generates its own entry point via `criterion_main!`,
+# so the default libtest bench harness is disabled.
+[[bench]]
+harness = false
+name = "bit_vec"
diff --git a/rs/memory_tracker/benches/bit_vec.rs b/rs/memory_tracker/benches/bit_vec.rs
@@ -0,0 +1,108 @@
+use std::hint::black_box;
+
+use criterion::{criterion_group, criterion_main, Criterion, Throughput};
+
+/// Size of a single memory page in bytes.
+const PAGE_SIZE: usize = 4 * 1024;
+/// Bytes in one mebibyte.
+const MIB: usize = 1 << 20;
+/// Bytes in one gibibyte.
+const GIB: usize = 1024 * MIB;
+/// Bytes in one tebibyte.
+const TIB: usize = 1024 * GIB;
+/// Number of times each routine repeats its operation per measured iteration.
+const OPS: u64 = 1;
+
+/// Number of bits needed to cover `bytes` of memory at one bit per page.
+const fn page_bits(bytes: usize) -> usize {
+    bytes / PAGE_SIZE
+}
+
+/// Benchmark inputs as `(label, number of bits)` pairs.
+///
+/// The label names the amount of memory covered; the bit count is that amount
+/// divided by `PAGE_SIZE`, i.e. one bit per page.
+const NUM_BITS: &[(&str, usize)] = &[
+    ("64MiB", page_bits(64 * MIB)),
+    ("512MiB", page_bits(512 * MIB)),
+    ("1GiB", page_bits(GIB)),
+    ("2GiB", page_bits(2 * GIB)),
+    ("4GiB", page_bits(4 * GIB)),
+    ("8GiB", page_bits(8 * GIB)),
+    ("64GiB", page_bits(64 * GIB)),
+    ("256GiB", page_bits(256 * GIB)),
+    ("512GiB", page_bits(512 * GIB)),
+    ("768GiB", page_bits(768 * GIB)),
+    ("1008GiB", page_bits(1008 * GIB)),
+    ("1TiB", page_bits(TIB)),
+    ("2TiB", page_bits(2 * TIB)),
+];
+
+/// Builds a `BitVec` of `num_bits` bits, all cleared, `OPS` times.
+///
+/// Each vector is passed through `black_box` and dropped before the next
+/// round starts, so the drop is part of what gets timed.
+fn bitvec_from_elem_false(num_bits: usize) {
+    (0..OPS).for_each(|_| {
+        let bits = bit_vec::BitVec::from_elem(black_box(num_bits), false);
+        black_box(bits);
+    });
+}
+
+/// Builds a `BitVec` of `num_bits` bits, all set, `OPS` times.
+///
+/// Each vector is passed through `black_box` and dropped before the next
+/// round starts, so the drop is part of what gets timed.
+fn bitvec_from_elem_true(num_bits: usize) {
+    (0..OPS).for_each(|_| {
+        let bits = bit_vec::BitVec::from_elem(black_box(num_bits), true);
+        black_box(bits);
+    });
+}
+
+/// Creates an empty `BitVec` with capacity for `num_bits` bits and pushes a
+/// single `true` bit into it, `OPS` times.
+///
+/// NOTE(review): the push presumably exists so the reserved allocation is
+/// actually used rather than optimized away — confirm.
+fn bitvec_with_capacity(num_bits: usize) {
+    (0..OPS).for_each(|_| {
+        let capacity = black_box(num_bits);
+        let mut bits = black_box(bit_vec::BitVec::with_capacity(capacity));
+        bits.push(true);
+        black_box(bits);
+    });
+}
+
+/// Baseline for the `BitVec` benchmarks: allocates a zero-filled `Vec<u32>`
+/// large enough to hold `num_bits` bits, `OPS` times.
+///
+/// Each vector is passed through `black_box` and dropped at the end of its
+/// loop iteration.
+fn vec_from_elem_0(num_bits: usize) {
+    // Round up so that a partially filled trailing block is still allocated.
+    let num_blocks = num_bits.div_ceil(u32::BITS as usize);
+    for _ in 0..OPS {
+        // `vec!` is the documented way to build a filled vector;
+        // `std::vec::from_elem` is a hidden implementation detail of it.
+        let _vec = black_box(vec![0_u32; black_box(num_blocks)]);
+    }
+}
+
+/// Registers the creation benchmarks, one Criterion group per routine.
+fn from_elem_bench(c: &mut Criterion) {
+    let routines: [(&str, fn(usize)); 4] = [
+        ("bitvec_from_elem_false", bitvec_from_elem_false),
+        ("bitvec_from_elem_true", bitvec_from_elem_true),
+        ("bitvec_with_capacity", bitvec_with_capacity),
+        ("vec_from_elem_0", vec_from_elem_0),
+    ];
+    for (group_name, routine) in routines {
+        bench(c, group_name, routine);
+    }
+}
+
+/// Grows a freshly created, empty `BitVec` by `num_bits` cleared bits,
+/// `OPS` times.
+///
+/// Each vector is passed through `black_box` and dropped before the next
+/// round starts.
+fn bitvec_grow_false(num_bits: usize) {
+    (0..OPS).for_each(|_| {
+        let mut bits = bit_vec::BitVec::new();
+        bits.grow(black_box(num_bits), false);
+        black_box(bits);
+    });
+}
+
+/// Grows a freshly created, empty `BitVec` by `num_bits` set bits,
+/// `OPS` times.
+///
+/// Each vector is passed through `black_box` and dropped before the next
+/// round starts.
+fn bitvec_grow_true(num_bits: usize) {
+    (0..OPS).for_each(|_| {
+        let mut bits = bit_vec::BitVec::new();
+        bits.grow(black_box(num_bits), true);
+        black_box(bits);
+    });
+}
+
+/// Reserves room for `num_bits` additional bits in a freshly created, empty
+/// `BitVec` and pushes a single `true` bit into it, `OPS` times.
+///
+/// NOTE(review): the push presumably exists so the reserved allocation is
+/// actually used rather than optimized away — confirm.
+fn bitvec_reserve(num_bits: usize) {
+    (0..OPS).for_each(|_| {
+        let mut bits = bit_vec::BitVec::new();
+        bits.reserve(black_box(num_bits));
+        bits.push(true);
+        black_box(bits);
+    });
+}
+
+/// Registers the grow/reserve benchmarks, one Criterion group per routine.
+fn resize_bench(c: &mut Criterion) {
+    let routines: [(&str, fn(usize)); 3] = [
+        ("bitvec_grow_false", bitvec_grow_false),
+        ("bitvec_grow_true", bitvec_grow_true),
+        ("bitvec_reserve", bitvec_reserve),
+    ];
+    for (group_name, routine) in routines {
+        bench(c, group_name, routine);
+    }
+}
+
+/// Runs `routine` once per entry of `NUM_BITS` inside a Criterion group
+/// named `group_name`.
+///
+/// Each benchmark is labelled `bits:<label>/ops:<OPS>` and receives the
+/// entry's bit count as its only argument.
+fn bench(c: &mut Criterion, group_name: &str, routine: fn(usize)) {
+    let mut group = c.benchmark_group(group_name);
+    // Every benchmark in the group performs the same number of operations, so
+    // the throughput is configured once for the whole group instead of being
+    // re-set on every loop iteration.
+    group.throughput(Throughput::Elements(OPS));
+
+    for &(id, num_bits) in NUM_BITS {
+        group.bench_function(format!("bits:{id}/ops:{OPS}"), |b| {
+            b.iter(|| routine(num_bits))
+        });
+    }
+
+    group.finish();
+}
+
+criterion_group!(benches, from_elem_bench, resize_bench);
+criterion_main!(benches);