Skip to content

Commit

Permalink
Merge pull request #5151 from emosbaugh/issue-5149-etcd-creates-new-c…
Browse files Browse the repository at this point in the history
…luster-rather-than-join-if-sync-fails

fix: join node creates new cluster when initial etcd sync config fails
  • Loading branch information
juanluisvaladas authored Dec 23, 2024
2 parents c84a0ea + e6d4e22 commit 3fdb4c5
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 4 deletions.
12 changes: 8 additions & 4 deletions cmd/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -673,15 +673,19 @@ func (c *command) startWorker(ctx context.Context, profile string, nodeConfig *v
return wc.Start(ctx)
}

// needToJoin reports whether this controller node still has to join a
// cluster, i.e. whether it has NOT already joined previously.
//
// For embedded etcd (etcd storage type without an external cluster) the main
// etcd data file is the source of truth: the node needs to join exactly when
// <EtcdDataDir>/member/snap/db does not exist. For external etcd and all other
// storage types, the presence of both ca.key and ca.crt in the certificate
// root directory is taken as evidence of a previous join.
func (c *command) needToJoin(nodeConfig *v1beta1.ClusterConfig) bool {
	if nodeConfig.Spec.Storage.Type == v1beta1.EtcdStorageType && !nodeConfig.Spec.Storage.Etcd.IsExternalClusterUsed() {
		// Use the main etcd data directory as the source of truth to determine if this node has already joined.
		// See https://etcd.io/docs/v3.5/learning/persistent-storage-files/#bbolt-btree-membersnapdb
		return !file.Exists(filepath.Join(c.K0sVars.EtcdDataDir, "member", "snap", "db"))
	}

	// Not embedded etcd: fall back to checking for the CA key pair.
	if file.Exists(filepath.Join(c.K0sVars.CertRootDir, "ca.key")) &&
		file.Exists(filepath.Join(c.K0sVars.CertRootDir, "ca.crt")) {
		return false
	}
	return true
}

Expand Down
7 changes: 7 additions & 0 deletions pkg/component/controller/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,11 @@ func (e *Etcd) syncEtcdConfig(ctx context.Context, etcdRequest v1beta1.EtcdReque
etcdResponse, err = e.JoinClient.JoinEtcd(ctx, etcdRequest)
return err
},
// When joining multiple nodes in parallel, etcd can lose consensus and will return 500 responses.
// Allow for more time to recover (~4 minutes = 0+1+2+4+8+16+32+60+60+60 seconds).
retry.Attempts(10),
retry.Delay(1*time.Second),
retry.MaxDelay(60*time.Second),
retry.Context(ctx),
retry.LastErrorOnly(true),
retry.OnRetry(func(attempt uint, err error) {
Expand Down Expand Up @@ -191,6 +196,8 @@ func (e *Etcd) Start(ctx context.Context) error {
"--enable-pprof": "false",
}

// Use the main etcd data directory as the source of truth to determine if this node has already joined
// See https://etcd.io/docs/v3.5/learning/persistent-storage-files/#bbolt-btree-membersnapdb
if file.Exists(filepath.Join(e.K0sVars.EtcdDataDir, "member", "snap", "db")) {
logrus.Warnf("etcd db file(s) already exist, not gonna run join process")
} else if e.JoinClient != nil {
Expand Down

0 comments on commit 3fdb4c5

Please sign in to comment.