Skip to content

Commit 3a5d299

Browse files
committed
Merge branch 'main' into feat/commit-tracking-log-store-checker
2 parents 1bdf161 + c0dc6a0 commit 3a5d299

25 files changed

+202
-55
lines changed

.github/CODEOWNERS

+12-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,13 @@
1-
* @hashicorp/consul-core-reviewers @hashicorp/nomad-eng
1+
# Each line is a file pattern followed by one or more owners.
2+
# More on CODEOWNERS files: https://help.github.com/en/github/creating-cloning-and-archiving-repositories/about-code-owners
23

3-
/.release/ @hashicorp/release-engineering
4-
/.github/workflows/ci.yml @hashicorp/release-engineering
4+
# Default owner
5+
* @hashicorp/team-ip-compliance @hashicorp/consul-core-reviewers @hashicorp/nomad-eng @hashicorp/raft-force
6+
7+
# Add override rules below. Each line is a file/folder pattern followed by one or more owners.
8+
# Being an owner means those groups or individuals will be added as reviewers to PRs affecting
9+
# those areas of the code.
10+
# Examples:
11+
# /docs/ @docs-team
12+
# *.js @js-team
13+
# *.go @go-team

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,6 @@ _testmain.go
2121

2222
*.exe
2323
*.test
24+
25+
# Goland IDE
26+
.idea

.golangci-lint.yml

+12
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,25 @@ linters-settings:
99
check-shadowing: true
1010
golint:
1111
min-confidence: 0
12+
depguard:
13+
rules:
14+
main:
15+
list-mode: lax
16+
allow:
17+
- "github.com/hashicorp/go-metrics/compat"
18+
deny:
19+
- pkg: "github.com/hashicorp/go-metrics"
20+
desc: not allowed, use github.com/hashicorp/go-metrics/compat instead
21+
- pkg: "github.com/armon/go-metrics"
22+
desc: not allowed, use github.com/hashicorp/go-metrics/compat instead
1223

1324
linters:
1425
disable-all: true
1526
enable:
1627
- gofmt
1728
#- golint
1829
- govet
30+
- depguard
1931
#- varcheck
2032
#- typecheck
2133
#- gosimple

CHANGELOG.md

+5-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
# UNRELEASED
22

3+
IMPROVEMENETS
4+
5+
* Added a flag to skip legacy duplicate telemetry. [GH-630](https://github.com/hashicorp/raft/pull/630)
6+
37
# 1.7.0 (June 5th, 2024)
48

59
CHANGES
@@ -37,7 +41,7 @@ CHANGES
3741

3842
go-msgpack v2.1.1 is by default binary compatible with v0.5.5 ("non-builtin" encoding of `time.Time`), but can decode messages produced by v1.1.5 as well ("builtin" encoding of `time.Time`).
3943

40-
However, if users of this libary overrode the version of go-msgpack (especially to v1), this **could break** compatibility if raft nodes are running a mix of versions.
44+
However, if users of this library overrode the version of go-msgpack (especially to v1), this **could break** compatibility if raft nodes are running a mix of versions.
4145

4246
This compatibility can be configured at runtime in Raft using `NetworkTransportConfig.MsgpackUseNewTimeFormat` -- the default is `false`, which maintains compatibility with `go-msgpack` v0.5.5, but if set to `true`, will be compatible with `go-msgpack` v1.1.5.
4347

README.md

+26
Original file line numberDiff line numberDiff line change
@@ -110,3 +110,29 @@ greatly sacrificing performance.
110110
In terms of performance, Raft is comparable to Paxos. Assuming stable leadership,
111111
committing a log entry requires a single round trip to half of the cluster.
112112
Thus performance is bound by disk I/O and network latency.
113+
114+
115+
## Metrics Emission and Compatibility
116+
117+
This library can emit metrics using either `github.com/armon/go-metrics` or `github.com/hashicorp/go-metrics`. Choosing between the libraries is controlled via build tags.
118+
119+
**Build Tags**
120+
* `armonmetrics` - Using this tag will cause metrics to be routed to `armon/go-metrics`
121+
* `hashicorpmetrics` - Using this tag will cause all metrics to be routed to `hashicorp/go-metrics`
122+
123+
If no build tag is specified, the default behavior is to use `armon/go-metrics`.
124+
125+
**Deprecating `armon/go-metrics`**
126+
127+
Emitting metrics to `armon/go-metrics` is officially deprecated. Usage of `armon/go-metrics` will remain the default until mid-2025 with opt-in support continuing to the end of 2025.
128+
129+
**Migration**
130+
To migrate an application currently using the older `armon/go-metrics` to instead use `hashicorp/go-metrics` the following should be done.
131+
132+
1. Upgrade libraries using `armon/go-metrics` to consume `hashicorp/go-metrics/compat` instead. This should involve only changing import statements. All repositories in the `hashicorp` namespace
133+
2. Update an applications library dependencies to those that have the compatibility layer configured.
134+
3. Update the application to use `hashicorp/go-metrics` for configuring metrics export instead of `armon/go-metrics`
135+
* Replace all application imports of `github.com/armon/go-metrics` with `github.com/hashicorp/go-metrics`
136+
* Instrument your build system to build with the `hashicorpmetrics` tag.
137+
138+
Eventually once the default behavior changes to use `hashicorp/go-metrics` by default (mid-2025), you can drop the `hashicorpmetrics` build tag.

api.go

+21-7
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ import (
1212
"sync/atomic"
1313
"time"
1414

15-
metrics "github.com/armon/go-metrics"
1615
hclog "github.com/hashicorp/go-hclog"
16+
metrics "github.com/hashicorp/go-metrics/compat"
1717
)
1818

1919
const (
@@ -59,7 +59,7 @@ var (
5959
ErrEnqueueTimeout = errors.New("timed out enqueuing operation")
6060

6161
// ErrNothingNewToSnapshot is returned when trying to create a snapshot
62-
// but there's nothing new commited to the FSM since we started.
62+
// but there's nothing new committed to the FSM since we started.
6363
ErrNothingNewToSnapshot = errors.New("nothing new to snapshot")
6464

6565
// ErrUnsupportedProtocol is returned when an operation is attempted
@@ -222,6 +222,11 @@ type Raft struct {
222222
// prevote feature is disabled if set to true.
223223
preVoteDisabled bool
224224

225+
// noLegacyTelemetry allows to skip the legacy metrics to avoid duplicates.
226+
// legacy metrics are those that have `_peer_name` as metric suffix instead as labels.
227+
// e.g: raft_replication_heartbeat_peer0
228+
noLegacyTelemetry bool
229+
225230
// RestoreCommittedLogs is used to enable restore committed logs mode
226231
// restore committed logs mode is disabled if set to false.
227232
RestoreCommittedLogs bool
@@ -240,7 +245,8 @@ type Raft struct {
240245
// listing just itself as a Voter, then invoke AddVoter() on it to add other
241246
// servers to the cluster.
242247
func BootstrapCluster(conf *Config, logs LogStore, stable StableStore,
243-
snaps SnapshotStore, trans Transport, configuration Configuration) error {
248+
snaps SnapshotStore, trans Transport, configuration Configuration,
249+
) error {
244250
// Validate the Raft server config.
245251
if err := ValidateConfig(conf); err != nil {
246252
return err
@@ -313,7 +319,8 @@ func BootstrapCluster(conf *Config, logs LogStore, stable StableStore,
313319
// the sole voter, and then join up other new clean-state peer servers using
314320
// the usual APIs in order to bring the cluster back into a known state.
315321
func RecoverCluster(conf *Config, fsm FSM, logs LogStore, stable StableStore,
316-
snaps SnapshotStore, trans Transport, configuration Configuration) error {
322+
snaps SnapshotStore, trans Transport, configuration Configuration,
323+
) error {
317324
// Validate the Raft server config.
318325
if err := ValidateConfig(conf); err != nil {
319326
return err
@@ -444,7 +451,8 @@ func RecoverCluster(conf *Config, fsm FSM, logs LogStore, stable StableStore,
444451
// without starting a Raft instance or connecting to the cluster. This function
445452
// has identical behavior to Raft.GetConfiguration.
446453
func GetConfiguration(conf *Config, fsm FSM, logs LogStore, stable StableStore,
447-
snaps SnapshotStore, trans Transport) (Configuration, error) {
454+
snaps SnapshotStore, trans Transport,
455+
) (Configuration, error) {
448456
conf.skipStartup = true
449457
r, err := NewRaft(conf, fsm, logs, stable, snaps, trans)
450458
if err != nil {
@@ -574,6 +582,7 @@ func NewRaft(conf *Config, fsm FSM, logs LogStore, stable StableStore, snaps Sna
574582
followerNotifyCh: make(chan struct{}, 1),
575583
mainThreadSaturation: newSaturationMetric([]string{"raft", "thread", "main", "saturation"}, 1*time.Second),
576584
preVoteDisabled: conf.PreVoteDisabled || !transportSupportPreVote,
585+
noLegacyTelemetry: conf.NoLegacyTelemetry,
577586
RestoreCommittedLogs: conf.RestoreCommittedLogs,
578587
}
579588
if !transportSupportPreVote && !conf.PreVoteDisabled {
@@ -1147,12 +1156,12 @@ func (r *Raft) State() RaftState {
11471156
// lose it.
11481157
//
11491158
// Receivers can expect to receive a notification only if leadership
1150-
// transition has occured.
1159+
// transition has occurred.
11511160
//
11521161
// If receivers aren't ready for the signal, signals may drop and only the
11531162
// latest leadership transition. For example, if a receiver receives subsequent
11541163
// `true` values, they may deduce that leadership was lost and regained while
1155-
// the the receiver was processing first leadership transition.
1164+
// the receiver was processing first leadership transition.
11561165
func (r *Raft) LeaderCh() <-chan bool {
11571166
return r.leaderCh
11581167
}
@@ -1256,6 +1265,11 @@ func (r *Raft) Stats() map[string]string {
12561265
return s
12571266
}
12581267

1268+
// CurrentTerm returns the current term.
1269+
func (r *Raft) CurrentTerm() uint64 {
1270+
return r.getCurrentTerm()
1271+
}
1272+
12591273
// LastIndex returns the last index in stable storage,
12601274
// either from the last log or from the last snapshot.
12611275
func (r *Raft) LastIndex() uint64 {

bench/bench.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ func StoreLogs(b *testing.B, store raft.LogStore) {
9999
func DeleteRange(b *testing.B, store raft.LogStore) {
100100
// Create some fake data. In this case, we create 3 new log entries for each
101101
// test case, and separate them by index in multiples of 10. This allows
102-
// some room so that we can test deleting ranges with "extra" logs to
102+
// some room so that we can test deleting ranges with "extra" logs
103103
// to ensure we stop going to the database once our max index is hit.
104104
var logs []*raft.Log
105105
for n := 0; n < b.N; n++ {

config.go

+5
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,11 @@ type Config struct {
235235
// PreVoteDisabled deactivate the pre-vote feature when set to true
236236
PreVoteDisabled bool
237237

238+
// NoLegacyTelemetry allows to skip the legacy metrics to avoid duplicates.
239+
// legacy metrics are those that have `_peer_name` as metric suffix instead as labels.
240+
// e.g: raft_replication_heartbeat_peer0
241+
NoLegacyTelemetry bool
242+
238243
// RestoreCommittedLogs controls if the Raft server should use the restore committed logs
239244
// mechanism. Restore committed logs requires a LogStore implementation that
240245
// support commit tracking. When such a store is used and this config

configuration.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ func hasVote(configuration Configuration, id ServerID) bool {
180180
return false
181181
}
182182

183-
// inConfiguration returns true if the server identified by 'id' is in in the
183+
// inConfiguration returns true if the server identified by 'id' is in the
184184
// provided Configuration.
185185
func inConfiguration(configuration Configuration, id ServerID) bool {
186186
for _, server := range configuration.Servers {

docs/apply.md

+5-5
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Raft Apply
22

33
Apply is the primary operation provided by raft. A client calls `raft.Apply` to apply
4-
a command to the FSM. A command will first be commited, i.e., durably stored on a
4+
a command to the FSM. A command will first be committed, i.e., durably stored on a
55
quorum of raft nodes. Then, the committed command is applied to fsm.
66

77
This sequence diagram shows the steps involved in a `raft.Apply` operation. Each box
@@ -63,7 +63,7 @@ leader's lastIndex). Another parameter to AppendEntries is the LeaderCommitIndex
6363
is some examples:
6464

6565
```
66-
AppenEntries(Log: 1..5, LeaderCommitIndex: 0) // Replicating log entries 1..5,
66+
AppendEntries(Log: 1..5, LeaderCommitIndex: 0) // Replicating log entries 1..5,
6767
// the leader hasn't committed any log entry;
6868
AppendEntries(Log: 6..8, LeaderCommitIndex: 4) // Replicating log entries 6..8,
6969
// log 0..4 are committed after the leader receives
@@ -92,7 +92,7 @@ Therefore, it's possible that a very small window of time exists when all follow
9292
committed the log to disk, the write has been realized in the FSM of the leader but the
9393
followers have not yet applied the log to their FSM.
9494

95-
7. The peer applies the commited entries to the FSM.
95+
7. The peer applies the committed entries to the FSM.
9696

9797
8. If all went well, the follower responds success (`resp.Success = true`) to the
9898
`appendEntries` RPC call.
@@ -108,9 +108,9 @@ grouping the entries that can be applied to the fsm.
108108

109109
11. `processLogs` applies all the committed entries that haven't been applied by batching the log entries and forwarding them through the `fsmMutateCh` channel to fsm.
110110

111-
12. The actual place applying the commited log entries is in the main loop of `runFSM()`.
111+
12. The actual place applying the committed log entries is in the main loop of `runFSM()`.
112112

113113
13. After the log entries that contains the client req are applied to the fsm, the fsm
114-
module will set the reponses to the client request (`req.future.respond(nil)`). From the
114+
module will set the responses to the client request (`req.future.respond(nil)`). From the
115115
client's point of view, the future returned by `raft.Apply` should now be unblocked and
116116
calls to `Error()` or `Response()` should return the data at this point.

fsm.go

+7-1
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ import (
88
"io"
99
"time"
1010

11-
"github.com/armon/go-metrics"
1211
hclog "github.com/hashicorp/go-hclog"
12+
"github.com/hashicorp/go-metrics/compat"
1313
)
1414

1515
// FSM is implemented by clients to make use of the replicated log.
@@ -34,6 +34,12 @@ type FSM interface {
3434
// Apply and Snapshot are always called from the same thread, but Apply will
3535
// be called concurrently with FSMSnapshot.Persist. This means the FSM should
3636
// be implemented to allow for concurrent updates while a snapshot is happening.
37+
//
38+
// Clients of this library should make no assumptions about whether a returned
39+
// Snapshot() will actually be stored by Raft. In fact it's quite possible that
40+
// any Snapshot returned by this call will be discarded, and that
41+
// FSMSnapshot.Persist will never be called. Raft will always call
42+
// FSMSnapshot.Release however.
3743
Snapshot() (FSMSnapshot, error)
3844

3945
// Restore is used to restore an FSM from a snapshot. It is not called

fuzzy/cluster.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ func (c *cluster) Stop(t *testing.T, maxWait time.Duration) {
184184
}
185185

186186
// WaitTilUptoDate blocks until all nodes in the cluster have gotten their
187-
// commitedIndex upto the Index from the last successful call to Apply
187+
// committedIndex upto the Index from the last successful call to Apply
188188
func (c *cluster) WaitTilUptoDate(t *testing.T, maxWait time.Duration) {
189189
idx := c.lastApplySuccess.Index()
190190
start := time.Now()

fuzzy/verifier.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ func (v *appendEntriesVerifier) PreAppendEntries(src, target string, req *raft.A
5858
if ldr != src {
5959
v.Lock()
6060
defer v.Unlock()
61-
v.errors = append(v.errors, fmt.Sprintf("Node %v sent an appendEnties request for term %d that said the leader was some other node %v", src, term, ldr))
61+
v.errors = append(v.errors, fmt.Sprintf("Node %v sent an appendEntries request for term %d that said the leader was some other node %v", src, term, ldr))
6262
}
6363
v.RLock()
6464
tl, exists := v.leaderForTerm[term]

go.mod

+2-1
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,14 @@ go 1.20
55
retract v1.1.3 // Deleted original tag; module checksum may not be accurate.
66

77
require (
8-
github.com/armon/go-metrics v0.4.1
98
github.com/hashicorp/go-hclog v1.6.2
9+
github.com/hashicorp/go-metrics v0.5.4
1010
github.com/hashicorp/go-msgpack/v2 v2.1.2
1111
github.com/stretchr/testify v1.8.4
1212
)
1313

1414
require (
15+
github.com/armon/go-metrics v0.4.1 // indirect
1516
github.com/davecgh/go-spew v1.1.1 // indirect
1617
github.com/fatih/color v1.13.0 // indirect
1718
github.com/hashicorp/go-immutable-radix v1.0.0 // indirect

0 commit comments

Comments
 (0)