diff --git a/server/etcdserver/memory_storage_test.go b/server/etcdserver/memory_storage_test.go new file mode 100644 index 00000000000..19e61efc688 --- /dev/null +++ b/server/etcdserver/memory_storage_test.go @@ -0,0 +1,67 @@ +// Copyright 2024 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package etcdserver + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "go.etcd.io/raft/v3" + "go.etcd.io/raft/v3/raftpb" +) + +// TestMemoryStorageCompactInclusive tests that compacting is inclusive, +// meaning the first index after compaction is larger by one than compacted index. +func TestMemoryStorageCompactInclusive(t *testing.T) { + // entries: [ {Index: 0} ] + raftStorage := raft.NewMemoryStorage() + + firstIndex, err := raftStorage.FirstIndex() + assert.NoError(t, err) + assert.Equal(t, uint64(1), firstIndex) + + // after appending, entries should be: + // [ {Index: 0}, {Index: 1}, {Index: 2}, {Index: 3}, {Index: 4}, {Index: 5} ] + appliedIndex := uint64(1) + for ; appliedIndex <= 5; appliedIndex++ { + e := raftpb.Entry{ + Type: raftpb.EntryNormal, + Term: 1, + Index: appliedIndex, + } + err := raftStorage.Append([]raftpb.Entry{e}) + assert.NoError(t, err) + } + + firstIndex, err = raftStorage.FirstIndex() + assert.NoError(t, err) + assert.Equal(t, uint64(1), firstIndex) + + lastIndex, err := raftStorage.LastIndex() + assert.NoError(t, err) + assert.Equal(t, uint64(5), lastIndex) + + // after compacting, entries should be: + // [ {Index: 3}, {Index: 4}, {Index: 5} ] + compacti := uint64(3) + err = raftStorage.Compact(compacti) + assert.NoError(t, err) + + + firstIndex, err = raftStorage.FirstIndex() + assert.NoError(t, err) + assert.Equal(t, compacti+1, firstIndex) +} diff --git a/server/etcdserver/server.go b/server/etcdserver/server.go index ba3a3f3ffe1..165a2a4c0cd 100644 --- a/server/etcdserver/server.go +++ b/server/etcdserver/server.go @@ -745,6 +745,7 @@ type etcdProgress struct { snapi uint64 appliedt uint64 appliedi uint64 + compacti uint64 } // raftReadyHandler contains a set of EtcdServer operations to be called by raftNode, @@ -764,6 +765,10 @@ func (s *EtcdServer) run() { if err != nil { lg.Panic("failed to get snapshot from Raft storage", zap.Error(err)) } + firstRaftIndex, err := s.r.raftStorage.FirstIndex() + if err != nil { + lg.Panic("failed to get first index from Raft storage", zap.Error(err)) + } // asynchronously accept toApply packets, dispatch progress in-order sched := schedule.NewFIFOScheduler(lg) @@ -813,6 +818,10 @@ func (s *EtcdServer) run() { snapi: sn.Metadata.Index, appliedt: sn.Metadata.Term, appliedi: sn.Metadata.Index, + // Compaction is inclusive, meaning compact index should be lower by one + // than the first index after compaction. + // This is validated by TestMemoryStorageCompaction. + compacti: firstRaftIndex - 1, } defer func() { @@ -980,6 +989,7 @@ func (s *EtcdServer) applyAll(ep *etcdProgress, apply *toApply) { <-apply.notifyc s.triggerSnapshot(ep) + s.maybeCompactRaftLog(ep) select { // snapshot requested via send() case m := <-s.r.msgSnapC: @@ -2170,6 +2180,22 @@ func (s *EtcdServer) snapshot(snapi uint64, confState raftpb.ConfState) { "saved snapshot", zap.Uint64("snapshot-index", snap.Metadata.Index), ) +} + +func (s *EtcdServer) maybeCompactRaftLog(ep *etcdProgress) { + // Retain all log entries up to the latest snapshot index to ensure any member can recover from that snapshot. + // Beyond the snapshot index, preserve the most recent s.Cfg.SnapshotCatchUpEntries entries in memory. + // This allows slow followers to catch up by synchronizing entries instead of requiring a full snapshot transfer. + if ep.snapi <= s.Cfg.SnapshotCatchUpEntries { + return + } + + compacti := ep.snapi - s.Cfg.SnapshotCatchUpEntries + if compacti <= ep.compacti { + return + } + + lg := s.Logger() // When sending a snapshot, etcd will pause compaction. // After receives a snapshot, the slow follower needs to get all the entries right after @@ -2181,13 +2207,7 @@ func (s *EtcdServer) snapshot(snapi uint64, confState raftpb.ConfState) { return } - // keep some in memory log entries for slow followers. - compacti := uint64(1) - if snapi > s.Cfg.SnapshotCatchUpEntries { - compacti = snapi - s.Cfg.SnapshotCatchUpEntries - } - - err = s.r.raftStorage.Compact(compacti) + err := s.r.raftStorage.Compact(compacti) if err != nil { // the compaction was done asynchronously with the progress of raft. // raft log might already been compact. @@ -2196,6 +2216,7 @@ func (s *EtcdServer) snapshot(snapi uint64, confState raftpb.ConfState) { } lg.Panic("failed to compact", zap.Error(err)) } + ep.compacti = compacti lg.Info( "compacted Raft logs", zap.Uint64("compact-index", compacti), diff --git a/tests/integration/raft_log_test.go b/tests/integration/raft_log_test.go new file mode 100644 index 00000000000..c38a15d8187 --- /dev/null +++ b/tests/integration/raft_log_test.go @@ -0,0 +1,114 @@ +// Copyright 2024 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package integration + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/stretchr/testify/assert" + + pb "go.etcd.io/etcd/api/v3/etcdserverpb" + "go.etcd.io/etcd/tests/v3/framework/integration" +) + +// TestRaftLogCompaction tests whether raft log snapshot and compaction work correctly. +func TestRaftLogCompaction(t *testing.T) { + integration.BeforeTest(t) + + clus := integration.NewCluster(t, &integration.ClusterConfig{ + Size: 1, + SnapshotCount: 10, + SnapshotCatchUpEntries: 5, + }) + defer clus.Terminate(t) + + mem := clus.Members[0] + + // Get applied index of raft log + endpoint := mem.Client.Endpoints()[0] + assert.NotEmpty(t, endpoint) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + status, _ := mem.Client.Status(ctx, endpoint) + appliedi := status.RaftAppliedIndex + // Assume applied index is less than 10, should be fine at this stage + assert.Less(t, appliedi, uint64(10)) + + kvc := integration.ToGRPC(mem.Client).KV + + // When applied index is a multiple of 11 (SnapshotCount+1), + // a snapshot is created, and entries are compacted. + // + // increase applied index to 10 + for ; appliedi < 10; appliedi++ { + _, err := kvc.Put(context.TODO(), &pb.PutRequest{Key: []byte("foo"), Value: []byte("bar")}) + if err != nil { + t.Errorf("#%d: couldn't put key (%v)", appliedi, err) + } + } + // The first snapshot and compaction shouldn't happen because applied index is less than 11 + logOccurredAtMostNTimes(t, mem, 5*time.Second, "saved snapshot", 0) + logOccurredAtMostNTimes(t, mem, time.Second, "compacted Raft logs", 0) + + // increase applied index to 11 + for ; appliedi < 11; appliedi++ { + _, err := kvc.Put(context.TODO(), &pb.PutRequest{Key: []byte("foo"), Value: []byte("bar")}) + if err != nil { + t.Errorf("#%d: couldn't put key (%v)", appliedi, err) + } + } + // The first snapshot and compaction should happen because applied index is 11 + logOccurredAtMostNTimes(t, mem, 5*time.Second, "saved snapshot", 1) + logOccurredAtMostNTimes(t, mem, time.Second, "compacted Raft logs", 1) + expectMemberLog(t, mem, time.Second, "\"compact-index\": 6", 1) + + // increase applied index to 1100 + for ; appliedi < 1100; appliedi++ { + _, err := kvc.Put(context.TODO(), &pb.PutRequest{Key: []byte("foo"), Value: []byte("bar")}) + if err != nil { + t.Errorf("#%d: couldn't put key (%v)", appliedi, err) + } + } + // With applied index at 1100, snapshot and compaction should happen 100 times. + logOccurredAtMostNTimes(t, mem, 5*time.Second, "saved snapshot", 100) + logOccurredAtMostNTimes(t, mem, time.Second, "compacted Raft logs", 100) + expectMemberLog(t, mem, time.Second, "\"compact-index\": 1095", 1) +} + +// logOccurredAtMostNTimes ensures that the log has exactly `count` occurrences of `s` before timing out, no more, no less. +func logOccurredAtMostNTimes(t *testing.T, m *integration.Member, timeout time.Duration, s string, count int) { + ctx, cancel := context.WithTimeout(context.TODO(), timeout) + defer cancel() + + // The log must have `count` occurrences before timeout + _, err := m.LogObserver.Expect(ctx, s, count) + if err != nil { + t.Fatalf("failed to expect(log:%s, count:%d): %v", s, count, err) + } + + // The log mustn't have `count+1` occurrences before timeout + lines, err := m.LogObserver.Expect(ctx, s, count+1) + if err != nil { + if errors.Is(err, context.DeadlineExceeded) { + return + } else { + t.Fatalf("failed to expect(log:%s, count:%d): %v", s, count+1, err) + } + } + t.Fatalf("failed: too many occurrences of %s, expect %d, got %d", s, count, len(lines)) +} \ No newline at end of file