Skip to content

Commit 329250d

Browse files
authored
[POSIX] Drop idempotent writes (#620)
1 parent 22d6326 commit 329250d

File tree

2 files changed

+19
-29
lines changed

2 files changed

+19
-29
lines changed

storage/posix/README.md

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ The final step in the dance above is atomic according to the POSIX spec, so in p
3232
of actions we can avoid corrupt or partially written files being part of the tree.
3333

3434
1. Leaves are submitted by the binary built using Tessera via a call the storage's `Add` func.
35-
1. The storage library batches these entries up, and, after a configurable period of time has elapsed
35+
1. The storage library batches these entries up in memory, and, after a configurable period of time has elapsed
3636
or the batch reaches a configurable size threshold, the batch is sequenced and appended to the tree:
3737
1. An advisory lock is taken on `.state/treeState.lock` file.
3838
This helps prevent multiple frontends from stepping on each other, but isn't necesary for safety.
@@ -48,6 +48,12 @@ will be updated:
4848

4949
## Filesystems
5050

51-
This implementation has been somewhat tested on both a local `ext4` filesystem and on a distributed
52-
[CephFS](https://docs.ceph.com/en/reef/cephfs/) instance on GCP, in both cases with multiple
51+
This implementation has been somewhat tested on local `ext4` and `ZFS` filesystems, and on a distributed
52+
[CephFS](https://docs.ceph.com/en/reef/cephfs/) instance on GCP, in all cases with multiple
5353
personality binaries attempting to add new entries concurrently.
54+
55+
Other POSIX compliant filesystems such as `XFS` _should_ work, but filesystems which do not offer strong
56+
POSIX compliance (e.g. `s3fs` or `NFS`) are unlikely to result in long term happiness.
57+
58+
If in doubt, tools like https://github.com/saidsay-so/pjdfstest may help in determining whether a given
59+
filesystem is suitable.

storage/posix/files.go

Lines changed: 10 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -396,7 +396,7 @@ func (lrs *logResourceStorage) storeTile(ctx context.Context, level, index, logS
396396
func (lrs *logResourceStorage) writeTile(_ context.Context, level, index uint64, partial uint8, t []byte) error {
397397
tPath := layout.TilePath(level, index, partial)
398398

399-
if err := lrs.s.createIdempotent(tPath, t); err != nil {
399+
if err := lrs.s.createOverwrite(tPath, t); err != nil {
400400
return err
401401
}
402402

@@ -428,7 +428,7 @@ func (lrs *logResourceStorage) writeTile(_ context.Context, level, index uint64,
428428
// writeBundle takes care of writing out the serialised entry bundle file.
429429
func (lrs *logResourceStorage) writeBundle(_ context.Context, index uint64, partial uint8, bundle []byte) error {
430430
bf := lrs.entriesPath(index, partial)
431-
if err := lrs.s.createIdempotent(bf, bundle); err != nil {
431+
if err := lrs.s.createOverwrite(bf, bundle); err != nil {
432432
if !errors.Is(err, os.ErrExist) {
433433
return err
434434
}
@@ -525,7 +525,7 @@ func (s *Storage) writeTreeState(size uint64, root []byte) error {
525525
return fmt.Errorf("error in Marshal: %v", err)
526526
}
527527

528-
if err := overwrite(filepath.Join(s.path, stateDir, "treeState"), raw); err != nil {
528+
if err := s.createOverwrite(filepath.Join(stateDir, "treeState"), raw); err != nil {
529529
return fmt.Errorf("failed to create private tree state file: %w", err)
530530
}
531531
return nil
@@ -581,8 +581,8 @@ func (a *appender) publishCheckpoint(ctx context.Context, minStaleness time.Dura
581581
return fmt.Errorf("newCP: %v", err)
582582
}
583583

584-
if err := overwrite(filepath.Join(a.s.path, layout.CheckpointPath), cpRaw); err != nil {
585-
return fmt.Errorf("overwrite(%s): %v", layout.CheckpointPath, err)
584+
if err := a.s.createOverwrite(layout.CheckpointPath, cpRaw); err != nil {
585+
return fmt.Errorf("createOverwrite(%s): %v", layout.CheckpointPath, err)
586586
}
587587

588588
klog.V(2).Infof("Published latest checkpoint: %d, %x", size, root)
@@ -598,33 +598,17 @@ func (s *Storage) createExclusive(p string, d []byte) error {
598598
return createEx(filepath.Join(s.path, p), d)
599599
}
600600

601+
// createOverwrite atomically creates or overwrites a file at the given path with the provided data.
602+
func (s *Storage) createOverwrite(p string, d []byte) error {
603+
return overwrite(filepath.Join(s.path, p), d)
604+
}
605+
601606
func (s *Storage) readAll(p string) ([]byte, error) {
602607
p = filepath.Join(s.path, p)
603608
return os.ReadFile(p)
604609

605610
}
606611

607-
// createIdempotent atomically writes the provided data to a file at the provided path, relative to the root of the log.
608-
// An error will be returned if a file already exists in the named location AND that file's
609-
// contents is not identical to the provided data.
610-
func (s *Storage) createIdempotent(p string, d []byte) error {
611-
if err := s.createExclusive(p, d); err != nil {
612-
if errors.Is(err, os.ErrExist) {
613-
r, err := s.readAll(p)
614-
if err != nil {
615-
return fmt.Errorf("file %q already exists, but unable to read it: %v", p, err)
616-
}
617-
if !bytes.Equal(d, r) {
618-
return fmt.Errorf("file %q already exists but has different contents", p)
619-
}
620-
// Idempotent write.
621-
return nil
622-
}
623-
return err
624-
}
625-
return nil
626-
}
627-
628612
// stat returns os.Stat info for the speficied file relative to the log root.
629613
func (s *Storage) stat(p string) (os.FileInfo, error) {
630614
p = filepath.Join(s.path, p)

0 commit comments

Comments
 (0)