Skip to content

Commit

Permalink
fix(580): close trimmer without lock to avoid deadlock (#632)
Browse files Browse the repository at this point in the history
Fix: #580

we must close trimmer without lock, because when it running doTrim, will
acquire lock and cause an deadlock.

```
panic: test timed out after 10m0s
        running tests:
                TestWalTrimUpToCommitOffset (7m11s)
                TestWalTrimUpToCommitOffset/test-69 (7m8s)

goroutine 21768 [running]:
testing.(*M).startAlarm.func1()
        /Users/bytedance/.gvm/gos/go1.23.5/src/testing/testing.go:2373 +0x718
created by time.goFunc
        /Users/bytedance/.gvm/gos/go1.23.5/src/time/sleep.go:215 +0x44

goroutine 1 [chan receive, 7 minutes]:
testing.(*T).Run(0x140001bb860, {0x1056e4c3a, 0x1b}, 0x1059d80e0)
        /Users/bytedance/.gvm/gos/go1.23.5/src/testing/testing.go:1751 +0x7e4
testing.runTests.func1(0x140001bb6c0)
        /Users/bytedance/.gvm/gos/go1.23.5/src/testing/testing.go:2168 +0x98
testing.tRunner(0x140001bb6c0, 0x140004c5588)
        /Users/bytedance/.gvm/gos/go1.23.5/src/testing/testing.go:1690 +0x1d0
testing.runTests(0x1400000c468, {0x105f44f80, 0x25, 0x25}, {0xc1e86345a6417500, 0x8bb315f586, 0x105f52160})
        /Users/bytedance/.gvm/gos/go1.23.5/src/testing/testing.go:2166 +0x674
testing.(*M).Run(0x140000ca1e0)
        /Users/bytedance/.gvm/gos/go1.23.5/src/testing/testing.go:2034 +0xe44
main.main()
        _testmain.go:123 +0x98

goroutine 21403 [chan receive, 7 minutes]:
testing.(*T).Run(0x140006521a0, {0x140000172a5, 0x7}, 0x1059d8190)
        /Users/bytedance/.gvm/gos/go1.23.5/src/testing/testing.go:1751 +0x7e4
github.com/streamnative/oxia/server/wal.TestWalTrimUpToCommitOffset(0x140001bb860)
        /Users/bytedance/workspace/github.com/streamnative/oxia/server/wal/wal_trimmer_test.go:98 +0xe4
testing.tRunner(0x140001bb860, 0x1059d80e0)
        /Users/bytedance/.gvm/gos/go1.23.5/src/testing/testing.go:1690 +0x1d0
created by testing.(*T).Run in goroutine 1
        /Users/bytedance/.gvm/gos/go1.23.5/src/testing/testing.go:1743 +0x7c4

goroutine 21765 [sync.RWMutex.RLock, 7 minutes]:
sync.runtime_SemacquireRWMutexR(0x1400035cd38?, 0x84?, 0x1400035cda8?)
        /Users/bytedance/.gvm/gos/go1.23.5/src/runtime/sema.go:100 +0x28
sync.(*RWMutex).RLock(...)
        /Users/bytedance/.gvm/gos/go1.23.5/src/sync/rwmutex.go:72
github.com/streamnative/oxia/server/wal.(*wal).readAtIndex(0x14000660160, 0x57)           // doTrim accquire RLock
        /Users/bytedance/workspace/github.com/streamnative/oxia/server/wal/wal_impl.go:164 +0xc8
github.com/streamnative/oxia/server/wal.(*forwardReader).ReadNext(0x1400061aa80)
        /Users/bytedance/workspace/github.com/streamnative/oxia/server/wal/wal_reader.go:94 +0x290
github.com/streamnative/oxia/server/wal.(*trimmer).readAtOffset(0x1400060d740, 0x57)
        /Users/bytedance/workspace/github.com/streamnative/oxia/server/wal/wal_trimmer.go:205 +0x23c
github.com/streamnative/oxia/server/wal.(*trimmer).doTrim(0x1400060d740)
        /Users/bytedance/workspace/github.com/streamnative/oxia/server/wal/wal_trimmer.go:133 +0x420
github.com/streamnative/oxia/server/wal.(*trimmer).run(0x1400060d740)
        /Users/bytedance/workspace/github.com/streamnative/oxia/server/wal/wal_trimmer.go:105 +0xe8
github.com/streamnative/oxia/common.DoWithLabels.func1({0x1059e6918, 0x140003e7350})
        /Users/bytedance/workspace/github.com/streamnative/oxia/common/pprof.go:46 +0x30
runtime/pprof.Do({0x1059e6918, 0x140003e7350}, {{0x140001245c0, 0x2, 0x2}}, 0x14000604f00)
        /Users/bytedance/.gvm/gos/go1.23.5/src/runtime/pprof/runtime.go:51 +0xf0
github.com/streamnative/oxia/common.DoWithLabels({0x1059e6950, 0x140006306e0}, 0x140001366c0, 0x140002ecc20)
        /Users/bytedance/workspace/github.com/streamnative/oxia/common/pprof.go:42 +0x3c4
created by github.com/streamnative/oxia/server/wal.newTrimmer in goroutine 21764
        /Users/bytedance/workspace/github.com/streamnative/oxia/server/wal/wal_trimmer.go:63 +0x958

goroutine 21764 [chan receive, 7 minutes]:
github.com/streamnative/oxia/server/wal.(*trimmer).Close(0x1400060d740)   // Close will wait doTrim returned, and it holds Lock
        /Users/bytedance/workspace/github.com/streamnative/oxia/server/wal/wal_trimmer.go:96 +0xcc
github.com/streamnative/oxia/server/wal.(*wal).closeWithoutLock(0x14000660160)
        /Users/bytedance/workspace/github.com/streamnative/oxia/server/wal/wal_impl.go:240 +0xc0
github.com/streamnative/oxia/server/wal.(*wal).Close(0x14000660160)
        /Users/bytedance/workspace/github.com/streamnative/oxia/server/wal/wal_impl.go:228 +0xb0
github.com/streamnative/oxia/server/wal.TestWalTrimUpToCommitOffset.func1(0x140006521a0)
        /Users/bytedance/workspace/github.com/streamnative/oxia/server/wal/wal_trimmer_test.go:164 +0xaf0
testing.tRunner(0x140006521a0, 0x1059d8190)
        /Users/bytedance/.gvm/gos/go1.23.5/src/testing/testing.go:1690 +0x1d0
created by testing.(*T).Run in goroutine 21403
        /Users/bytedance/.gvm/gos/go1.23.5/src/testing/testing.go:1743 +0x7c4
FAIL    github.com/streamnative/oxia/server/wal 601.285s
```
  • Loading branch information
lsytj0413 authored Mar 1, 2025
1 parent 8c1d357 commit d434094
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 6 deletions.
11 changes: 10 additions & 1 deletion server/wal/wal_impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,10 @@ func (t *wal) trim(firstOffset int64) error {
}

func (t *wal) Close() error {
if err := t.trimmer.Close(); err != nil {
return err
}

t.Lock()
defer t.Unlock()

Expand All @@ -237,7 +241,6 @@ func (t *wal) closeWithoutLock() error {
t.activeEntries.Unregister()

return multierr.Combine(
t.trimmer.Close(),
t.currentSegment.Close(),
t.readOnlySegments.Close(),
)
Expand Down Expand Up @@ -455,6 +458,12 @@ func (t *wal) Clear() error {
}

func (t *wal) Delete() error {
// NOTE: we must close the trimmer before closing the wal(without the lock), otherwise
// when trimmer is doTrim, it accquire the lock and it will block forever
if err := t.Close(); err != nil {
return err
}

t.Lock()
defer t.Unlock()

Expand Down
12 changes: 7 additions & 5 deletions server/wal/wal_trimmer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,16 +133,16 @@ func TestWalTrimUpToCommitOffset(t *testing.T) {
commitOffsetProvider.commitOffset.Store(2)

assert.Eventually(t, func() bool {
offset := w.FirstOffset()
slog.Info(
"checking...",
slog.Int64("first-offset", w.FirstOffset()),
slog.Int64("first-offset", offset),
slog.String("TestName", t.Name()),
)
return w.FirstOffset() == 2
return offset == 2
}, 10*time.Second, 10*time.Millisecond)

clock.Set(89)

time.Sleep(100 * time.Microsecond)

// No trimming should happen yet, because of commit offset
Expand All @@ -152,14 +152,16 @@ func TestWalTrimUpToCommitOffset(t *testing.T) {
commitOffsetProvider.commitOffset.Store(100)

assert.Eventually(t, func() bool {
offset := w.FirstOffset()
slog.Info(
"checking...",
slog.Int64("first-offset", w.FirstOffset()),
slog.Int64("first-offset", offset),
slog.String("TestName", t.Name()),
)
return w.FirstOffset() == 87
return offset == 87
}, 10*time.Second, 10*time.Millisecond)

slog.Info("Starting to close wal")
assert.NoError(t, w.Close())
})
}
Expand Down

0 comments on commit d434094

Please sign in to comment.