From 1bce336187c1a73ad07582f887ab207a70fd6a4e Mon Sep 17 00:00:00 2001 From: Lukas Lalinsky Date: Mon, 2 Dec 2024 15:56:52 +0100 Subject: [PATCH 1/5] Turn docs into std.AutoHashMapUnmanaged --- src/FileSegment.zig | 7 +++---- src/MemorySegment.zig | 9 ++++----- src/filefmt.zig | 7 ++++++- src/segment_merger.zig | 13 +++++-------- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/FileSegment.zig b/src/FileSegment.zig index 4e0063f..05de5da 100644 --- a/src/FileSegment.zig +++ b/src/FileSegment.zig @@ -21,7 +21,7 @@ allocator: std.mem.Allocator, dir: std.fs.Dir, info: SegmentInfo = .{}, attributes: std.AutoHashMapUnmanaged(u64, u64) = .{}, -docs: std.AutoHashMap(u32, bool), +docs: std.AutoHashMapUnmanaged(u32, bool) = .{}, index: std.ArrayList(u32), block_size: usize = 0, blocks: []const u8, @@ -35,7 +35,6 @@ pub fn init(allocator: std.mem.Allocator, options: Options) Self { return Self{ .allocator = allocator, .dir = options.dir, - .docs = std.AutoHashMap(u32, bool).init(allocator), .index = std.ArrayList(u32).init(allocator), .blocks = undefined, }; @@ -43,7 +42,7 @@ pub fn init(allocator: std.mem.Allocator, options: Options) Self { pub fn deinit(self: *Self, delete_file: KeepOrDelete) void { self.attributes.deinit(self.allocator); - self.docs.deinit(); + self.docs.deinit(self.allocator); self.index.deinit(); if (self.raw_data) |data| { @@ -138,7 +137,7 @@ test "build" { source.info = .{ .version = 1 }; source.frozen = true; - try source.docs.put(1, true); + try source.docs.put(source.allocator, 1, true); try source.items.append(.{ .id = 1, .hash = 1 }); try source.items.append(.{ .id = 1, .hash = 2 }); diff --git a/src/MemorySegment.zig b/src/MemorySegment.zig index 8350679..69876f3 100644 --- a/src/MemorySegment.zig +++ b/src/MemorySegment.zig @@ -20,7 +20,7 @@ pub const Options = struct {}; allocator: std.mem.Allocator, info: SegmentInfo = .{}, attributes: std.AutoHashMapUnmanaged(u64, u64) = .{}, -docs: std.AutoHashMap(u32, bool), +docs: std.AutoHashMapUnmanaged(u32, bool) = .{}, items: std.ArrayList(Item), frozen: bool = false, @@ -28,7 +28,6 @@ pub fn init(allocator: std.mem.Allocator, opts: Options) Self { _ = opts; return .{ .allocator = allocator, - .docs = std.AutoHashMap(u32, bool).init(allocator), .items = std.ArrayList(Item).init(allocator), }; } @@ -37,7 +36,7 @@ pub fn deinit(self: *Self, delete_file: KeepOrDelete) void { _ = delete_file; self.attributes.deinit(self.allocator); - self.docs.deinit(); + self.docs.deinit(self.allocator); self.items.deinit(); } @@ -76,7 +75,7 @@ pub fn build(self: *Self, changes: []const Change) !void { } try self.attributes.ensureTotalCapacity(self.allocator, num_attributes); - try self.docs.ensureTotalCapacity(num_docs); + try self.docs.ensureTotalCapacity(self.allocator, num_docs); try self.items.ensureTotalCapacity(num_items); var i = changes.len; @@ -124,7 +123,7 @@ pub fn merge(self: *Self, merger: *SegmentMerger(Self)) !void { self.attributes.deinit(self.allocator); self.attributes = merger.segment.attributes.move(); - self.docs.deinit(); + self.docs.deinit(self.allocator); self.docs = merger.segment.docs.move(); self.items.clearRetainingCapacity(); diff --git a/src/filefmt.zig b/src/filefmt.zig index 1be1840..d5eebda 100644 --- a/src/filefmt.zig +++ b/src/filefmt.zig @@ -428,7 +428,12 @@ pub fn readSegmentFile(dir: fs.Dir, info: SegmentInfo, segment: *FileSegment) !v } if (header.has_docs) { - try unpacker.readMapInto(&segment.docs); + // FIXME nicer api in msgpack.zig + var docs = std.AutoHashMap(u32, bool).init(segment.allocator); + defer docs.deinit(); + try unpacker.readMapInto(&docs); + segment.docs.deinit(segment.allocator); + segment.docs = docs.unmanaged.move(); } const block_size = header.block_size; diff --git a/src/segment_merger.zig b/src/segment_merger.zig index 61cf8d8..97e4ebd 100644 --- a/src/segment_merger.zig +++ b/src/segment_merger.zig @@ -8,7 +8,7 @@ const SharedPtr = @import("utils/shared_ptr.zig").SharedPtr; pub const MergedSegmentInfo = struct { info: SegmentInfo = .{}, attributes: std.AutoHashMapUnmanaged(u64, u64) = .{}, - docs: std.AutoHashMap(u32, bool), + docs: std.AutoHashMapUnmanaged(u32, bool) = .{}, }; pub fn SegmentMerger(comptime Segment: type) type { @@ -38,7 +38,7 @@ pub fn SegmentMerger(comptime Segment: type) type { allocator: std.mem.Allocator, sources: std.ArrayList(Source), collection: *SegmentList(Segment), - segment: MergedSegmentInfo, + segment: MergedSegmentInfo = .{}, estimated_size: usize = 0, current_item: ?Item = null, @@ -48,9 +48,6 @@ pub fn SegmentMerger(comptime Segment: type) type { .allocator = allocator, .sources = std.ArrayList(Source).init(allocator), .collection = collection, - .segment = .{ - .docs = std.AutoHashMap(u32, bool).init(allocator), - }, }; } @@ -60,7 +57,7 @@ pub fn SegmentMerger(comptime Segment: type) type { source.skip_docs.deinit(); } self.sources.deinit(); - self.segment.docs.deinit(); + self.segment.docs.deinit(self.allocator); self.* = undefined; } @@ -100,7 +97,7 @@ pub fn SegmentMerger(comptime Segment: type) type { } } - try self.segment.docs.ensureTotalCapacity(total_docs); + try self.segment.docs.ensureTotalCapacity(self.allocator, total_docs); for (sources) |*source| { const segment = source.reader.segment; var docs_added: usize = 0; @@ -111,7 +108,7 @@ pub fn SegmentMerger(comptime Segment: type) type { const doc_id = entry.key_ptr.*; const doc_status = entry.value_ptr.*; if (!self.collection.hasNewerVersion(doc_id, segment.info.version)) { - try self.segment.docs.put(doc_id, doc_status); + try self.segment.docs.put(self.allocator, doc_id, doc_status); docs_added += 1; } else { try source.skip_docs.put(doc_id, {}); From 46d24b5606e5d971260302dee905c9e5b4cf7dd5 Mon Sep 17 00:00:00 2001 From: Lukas Lalinsky Date: Mon, 2 Dec 2024 15:58:29 +0100 Subject: [PATCH 2/5] More conversions to unmanaged --- src/segment_merger.zig | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/segment_merger.zig b/src/segment_merger.zig index 97e4ebd..6fa176c 100644 --- a/src/segment_merger.zig +++ b/src/segment_merger.zig @@ -17,7 +17,7 @@ pub fn SegmentMerger(comptime Segment: type) type { const Source = struct { reader: Segment.Reader, - skip_docs: std.AutoHashMap(u32, void), + skip_docs: std.AutoHashMapUnmanaged(u32, void) = .{}, pub fn read(self: *Source) !?Item { while (true) { @@ -54,7 +54,7 @@ pub fn SegmentMerger(comptime Segment: type) type { pub fn deinit(self: *Self) void { for (self.sources.items) |*source| { source.reader.close(); - source.skip_docs.deinit(); + source.skip_docs.deinit(self.allocator); } self.sources.deinit(); self.segment.docs.deinit(self.allocator); @@ -64,7 +64,6 @@ pub fn SegmentMerger(comptime Segment: type) type { pub fn addSource(self: *Self, source: *Segment) !void { try self.sources.append(.{ .reader = source.reader(), - .skip_docs = std.AutoHashMap(u32, void).init(self.allocator), }); } @@ -111,7 +110,7 @@ pub fn SegmentMerger(comptime Segment: type) type { try self.segment.docs.put(self.allocator, doc_id, doc_status); docs_added += 1; } else { - try source.skip_docs.put(doc_id, {}); + try source.skip_docs.put(self.allocator, doc_id, {}); } } if (docs_found > 0) { From d125e22586d20eac0a42d9b9433cd8f2d8d001b9 Mon Sep 17 00:00:00 2001 From: Lukas Lalinsky Date: Mon, 2 Dec 2024 16:01:35 +0100 Subject: [PATCH 3/5] Fix memory leak in unmanaged --- src/segment_merger.zig | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/segment_merger.zig b/src/segment_merger.zig index 6fa176c..7cc5f84 100644 --- a/src/segment_merger.zig +++ b/src/segment_merger.zig @@ -9,6 +9,11 @@ pub const MergedSegmentInfo = struct { info: SegmentInfo = .{}, attributes: std.AutoHashMapUnmanaged(u64, u64) = .{}, docs: std.AutoHashMapUnmanaged(u32, bool) = .{}, + + pub fn deinit(self: *MergedSegmentInfo, allocator: std.mem.Allocator) void { + self.attributes.deinit(allocator); + self.docs.deinit(allocator); + } }; pub fn SegmentMerger(comptime Segment: type) type { @@ -36,8 +41,8 @@ pub fn SegmentMerger(comptime Segment: type) type { }; allocator: std.mem.Allocator, - sources: std.ArrayList(Source), collection: *SegmentList(Segment), + sources: std.ArrayListUnmanaged(Source) = .{}, segment: MergedSegmentInfo = .{}, estimated_size: usize = 0, @@ -46,7 +51,6 @@ pub fn SegmentMerger(comptime Segment: type) type { pub fn init(allocator: std.mem.Allocator, collection: *SegmentList(Segment)) Self { return .{ .allocator = allocator, - .sources = std.ArrayList(Source).init(allocator), .collection = collection, }; } @@ -56,13 +60,13 @@ pub fn SegmentMerger(comptime Segment: type) type { source.reader.close(); source.skip_docs.deinit(self.allocator); } - self.sources.deinit(); - self.segment.docs.deinit(self.allocator); + self.sources.deinit(self.allocator); + self.segment.deinit(self.allocator); self.* = undefined; } pub fn addSource(self: *Self, source: *Segment) !void { - try self.sources.append(.{ + try self.sources.append(self.allocator, .{ .reader = source.reader(), }); } From 06d12bf77ba6c04649e1d5056eed6c336600ba0b Mon Sep 17 00:00:00 2001 From: Lukas Lalinsky Date: Mon, 2 Dec 2024 16:06:13 +0100 Subject: [PATCH 4/5] Less allocations in SegmentMerger --- src/segment_list.zig | 4 ++-- src/segment_merger.zig | 23 ++++++++++++++--------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/segment_list.zig b/src/segment_list.zig index 39aab3d..ea08729 100644 --- a/src/segment_list.zig +++ b/src/segment_list.zig @@ -209,11 +209,11 @@ pub fn SegmentListManager(Segment: type) type { var target = try List.createSegment(allocator, self.options); defer List.destroySegment(allocator, &target); - var merger = SegmentMerger(Segment).init(allocator, segments.value); + var merger = try SegmentMerger(Segment).init(allocator, segments.value, candidate.end - candidate.start); defer merger.deinit(); for (segments.value.nodes.items[candidate.start..candidate.end]) |segment| { - try merger.addSource(segment.value); + merger.addSource(segment.value); } try merger.prepare(); diff --git a/src/segment_merger.zig b/src/segment_merger.zig index 7cc5f84..13ac404 100644 --- a/src/segment_merger.zig +++ b/src/segment_merger.zig @@ -24,6 +24,11 @@ pub fn SegmentMerger(comptime Segment: type) type { reader: Segment.Reader, skip_docs: std.AutoHashMapUnmanaged(u32, void) = .{}, + pub fn deinit(self: *Source, allocator: std.mem.Allocator) void { + self.reader.close(); + self.skip_docs.deinit(allocator); + } + pub fn read(self: *Source) !?Item { while (true) { const item = try self.reader.read() orelse return null; @@ -48,25 +53,25 @@ pub fn SegmentMerger(comptime Segment: type) type { current_item: ?Item = null, - pub fn init(allocator: std.mem.Allocator, collection: *SegmentList(Segment)) Self { + pub fn init(allocator: std.mem.Allocator, collection: *SegmentList(Segment), num_sources: usize) !Self { return .{ .allocator = allocator, .collection = collection, + .sources = try std.ArrayListUnmanaged(Source).initCapacity(allocator, num_sources), }; } pub fn deinit(self: *Self) void { for (self.sources.items) |*source| { - source.reader.close(); - source.skip_docs.deinit(self.allocator); + source.deinit(self.allocator); } self.sources.deinit(self.allocator); self.segment.deinit(self.allocator); self.* = undefined; } - pub fn addSource(self: *Self, source: *Segment) !void { - try self.sources.append(self.allocator, .{ + pub fn addSource(self: *Self, source: *Segment) void { + self.sources.appendAssumeCapacity(.{ .reader = source.reader(), }); } @@ -158,7 +163,7 @@ test "merge segments" { var collection = try SegmentList(MemorySegment).init(std.testing.allocator, 3); defer collection.deinit(std.testing.allocator, .delete); - var merger = SegmentMerger(MemorySegment).init(std.testing.allocator, &collection); + var merger = try SegmentMerger(MemorySegment).init(std.testing.allocator, &collection, 3); defer merger.deinit(); var node1 = try SegmentList(MemorySegment).createSegment(std.testing.allocator, .{}); @@ -174,9 +179,9 @@ test "merge segments" { node2.value.info = .{ .version = 12, .merges = 0 }; node3.value.info = .{ .version = 13, .merges = 0 }; - try merger.addSource(node1.value); - try merger.addSource(node2.value); - try merger.addSource(node3.value); + merger.addSource(node1.value); + merger.addSource(node2.value); + merger.addSource(node3.value); try merger.prepare(); From 049b4e0c469947415ff95a36633ea599e9a6871f Mon Sep 17 00:00:00 2001 From: Lukas Lalinsky Date: Mon, 2 Dec 2024 16:12:47 +0100 Subject: [PATCH 5/5] More unmanaged conversions --- src/FileSegment.zig | 9 ++++----- src/MemorySegment.zig | 11 +++++------ src/filefmt.zig | 17 +++++++++-------- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/src/FileSegment.zig b/src/FileSegment.zig index 05de5da..cc50319 100644 --- a/src/FileSegment.zig +++ b/src/FileSegment.zig @@ -22,7 +22,7 @@ dir: std.fs.Dir, info: SegmentInfo = .{}, attributes: std.AutoHashMapUnmanaged(u64, u64) = .{}, docs: std.AutoHashMapUnmanaged(u32, bool) = .{}, -index: std.ArrayList(u32), +index: std.ArrayListUnmanaged(u32) = .{}, block_size: usize = 0, blocks: []const u8, merged: u32 = 0, @@ -35,7 +35,6 @@ pub fn init(allocator: std.mem.Allocator, options: Options) Self { return Self{ .allocator = allocator, .dir = options.dir, - .index = std.ArrayList(u32).init(allocator), .blocks = undefined, }; } @@ -43,7 +42,7 @@ pub fn init(allocator: std.mem.Allocator, options: Options) Self { pub fn deinit(self: *Self, delete_file: KeepOrDelete) void { self.attributes.deinit(self.allocator); self.docs.deinit(self.allocator); - self.index.deinit(); + self.index.deinit(self.allocator); if (self.raw_data) |data| { std.posix.munmap(data); @@ -138,8 +137,8 @@ test "build" { source.info = .{ .version = 1 }; source.frozen = true; try source.docs.put(source.allocator, 1, true); - try source.items.append(.{ .id = 1, .hash = 1 }); - try source.items.append(.{ .id = 1, .hash = 2 }); + try source.items.append(source.allocator, .{ .id = 1, .hash = 1 }); + try source.items.append(source.allocator, .{ .id = 1, .hash = 2 }); var source_reader = source.reader(); defer source_reader.close(); diff --git a/src/MemorySegment.zig b/src/MemorySegment.zig index 69876f3..b26335f 100644 --- a/src/MemorySegment.zig +++ b/src/MemorySegment.zig @@ -21,14 +21,13 @@ allocator: std.mem.Allocator, info: SegmentInfo = .{}, attributes: std.AutoHashMapUnmanaged(u64, u64) = .{}, docs: std.AutoHashMapUnmanaged(u32, bool) = .{}, -items: std.ArrayList(Item), +items: std.ArrayListUnmanaged(Item) = .{}, frozen: bool = false, pub fn init(allocator: std.mem.Allocator, opts: Options) Self { _ = opts; return .{ .allocator = allocator, - .items = std.ArrayList(Item).init(allocator), }; } @@ -37,7 +36,7 @@ pub fn deinit(self: *Self, delete_file: KeepOrDelete) void { self.attributes.deinit(self.allocator); self.docs.deinit(self.allocator); - self.items.deinit(); + self.items.deinit(self.allocator); } pub fn search(self: Self, sorted_hashes: []const u32, results: *SearchResults) !void { @@ -76,7 +75,7 @@ pub fn build(self: *Self, changes: []const Change) !void { try self.attributes.ensureTotalCapacity(self.allocator, num_attributes); try self.docs.ensureTotalCapacity(self.allocator, num_docs); - try self.items.ensureTotalCapacity(num_items); + try self.items.ensureTotalCapacity(self.allocator, num_items); var i = changes.len; while (i > 0) { @@ -127,10 +126,10 @@ pub fn merge(self: *Self, merger: *SegmentMerger(Self)) !void { self.docs = merger.segment.docs.move(); self.items.clearRetainingCapacity(); - try self.items.ensureTotalCapacity(merger.estimated_size); + try self.items.ensureTotalCapacity(self.allocator, merger.estimated_size); while (true) { const item = try merger.read() orelse break; - try self.items.append(item); + try self.items.append(self.allocator, item); merger.advance(); } } diff --git a/src/filefmt.zig b/src/filefmt.zig index d5eebda..03b9649 100644 --- a/src/filefmt.zig +++ b/src/filefmt.zig @@ -200,11 +200,12 @@ test "writeBlock/readBlock/readFirstItemFromBlock" { var segment = MemorySegment.init(std.testing.allocator, .{}); defer segment.deinit(.delete); - try segment.items.append(.{ .hash = 1, .id = 1 }); - try segment.items.append(.{ .hash = 2, .id = 1 }); - try segment.items.append(.{ .hash = 3, .id = 1 }); - try segment.items.append(.{ .hash = 3, .id = 2 }); - try segment.items.append(.{ .hash = 4, .id = 1 }); + try segment.items.ensureTotalCapacity(std.testing.allocator, 5); + segment.items.appendAssumeCapacity(.{ .hash = 1, .id = 1 }); + segment.items.appendAssumeCapacity(.{ .hash = 2, .id = 1 }); + segment.items.appendAssumeCapacity(.{ .hash = 3, .id = 1 }); + segment.items.appendAssumeCapacity(.{ .hash = 3, .id = 2 }); + segment.items.appendAssumeCapacity(.{ .hash = 4, .id = 1 }); const block_size = 1024; var block_data: [block_size]u8 = undefined; @@ -442,8 +443,8 @@ pub fn readSegmentFile(dir: fs.Dir, info: SegmentInfo, segment: *FileSegment) !v const blocks_data_start = fixed_buffer_stream.pos; - const estimated_block_count = (raw_data.len - fixed_buffer_stream.pos) / block_size; - try segment.index.ensureTotalCapacity(estimated_block_count); + const max_possible_block_count = (raw_data.len - fixed_buffer_stream.pos) / block_size; + try segment.index.ensureTotalCapacity(segment.allocator, max_possible_block_count); var num_items: u32 = 0; var num_blocks: u32 = 0; @@ -457,7 +458,7 @@ pub fn readSegmentFile(dir: fs.Dir, info: SegmentInfo, segment: *FileSegment) !v if (block_header.num_items == 0) { break; } - try segment.index.append(block_header.first_item.hash); + segment.index.appendAssumeCapacity(block_header.first_item.hash); num_items += block_header.num_items; num_blocks += 1; crc.update(block_data[0..]);