[release-0.12] Fix panic when OpenStack server is deleted by an external agent #2477

Merged
3 changes: 3 additions & 0 deletions api/v1beta1/conditions_consts.go
@@ -44,6 +44,9 @@ const (
OpenStackErrorReason = "OpenStackError"
// DependencyFailedReason indicates that a dependent object failed.
DependencyFailedReason = "DependencyFailed"

// ServerUnexpectedDeletedMessage is the message used when a server is unexpectedly deleted by an external agent.
ServerUnexpectedDeletedMessage = "The server was unexpectedly deleted"
)

const (
30 changes: 17 additions & 13 deletions controllers/openstackmachine_controller.go
@@ -365,6 +365,11 @@ func (r *OpenStackMachineReconciler) reconcileNormal(ctx context.Context, scope
return ctrl.Result{}, err
}

result := r.reconcileMachineState(scope, openStackMachine, machine, machineServer)
Review comment (Contributor):
Given that we set openStackMachine.Status.Addresses at line 405, shouldn't we keep the reconcileMachineState call after that?
If we don't, I think we might end up returning the result without the addresses having been reconciled.

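As an illustration of the ordering concern, here is a minimal, self-contained Go sketch (toyMachine, toyResult, reconcileState, and the address value are invented stand-ins; only the control flow mirrors reconcileMachineState and Status.Addresses from the diff): an early non-nil result returns before the later status update ever runs.

package main

import "fmt"

// toyMachine and toyResult are illustrative stand-ins, not CAPO types.
type toyMachine struct {
	Addresses []string // stand-in for openStackMachine.Status.Addresses
}

type toyResult struct{}

// reconcileState mimics reconcileMachineState: a non-nil result means
// "stop this pass and requeue".
func reconcileState(serverReady bool) *toyResult {
	if !serverReady {
		return &toyResult{}
	}
	return nil
}

// reconcileNormal mimics the control flow in the diff above: the machine-state
// check runs first, the address update afterwards.
func reconcileNormal(m *toyMachine, serverReady bool) {
	if result := reconcileState(serverReady); result != nil {
		// Early return: the Addresses update below is skipped on this pass.
		return
	}
	m.Addresses = []string{"192.0.2.10"}
}

func main() {
	m := &toyMachine{}
	reconcileNormal(m, false)
	fmt.Printf("server not ready: addresses=%v (left empty until a later pass)\n", m.Addresses)
	reconcileNormal(m, true)
	fmt.Printf("server ready: addresses=%v\n", m.Addresses)
}

Whether that early return is acceptable depends on the addresses being reconciled again on the requeued pass, which is essentially the question raised above.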

if result != nil {
return *result, nil
}

computeService, err := compute.NewService(scope)
if err != nil {
return ctrl.Result{}, err
@@ -377,6 +382,12 @@ return ctrl.Result{}, err
return ctrl.Result{}, err
}

if instanceStatus == nil {
msg := "server has been unexpectedly deleted"
conditions.MarkFalse(openStackMachine, infrav1.InstanceReadyCondition, infrav1.InstanceDeletedReason, clusterv1.ConditionSeverityError, msg)
openStackMachine.SetFailure(capoerrors.DeprecatedCAPIUpdateMachineError, errors.New(msg))
}

instanceNS, err := instanceStatus.NetworkStatus()
if err != nil {
return ctrl.Result{}, fmt.Errorf("get network status: %w", err)
@@ -393,22 +404,15 @@ })
})
openStackMachine.Status.Addresses = addresses

result := r.reconcileMachineState(scope, openStackMachine, machine, machineServer)
if result != nil {
return *result, nil
}

if !util.IsControlPlaneMachine(machine) {
scope.Logger().Info("Not a Control plane machine, no floating ip reconcile needed, Reconciled Machine create successfully")
return ctrl.Result{}, nil
}
if util.IsControlPlaneMachine(machine) {
err = r.reconcileAPIServerLoadBalancer(scope, openStackCluster, openStackMachine, instanceStatus, instanceNS, clusterResourceName)
if err != nil {
return ctrl.Result{}, err
}

err = r.reconcileAPIServerLoadBalancer(scope, openStackCluster, openStackMachine, instanceStatus, instanceNS, clusterResourceName)
if err != nil {
return ctrl.Result{}, err
conditions.MarkTrue(openStackMachine, infrav1.APIServerIngressReadyCondition)
}

conditions.MarkTrue(openStackMachine, infrav1.APIServerIngressReadyCondition)
scope.Logger().Info("Reconciled Machine create successfully")
return ctrl.Result{}, nil
}
22 changes: 14 additions & 8 deletions controllers/openstackserver_controller.go
@@ -433,9 +433,18 @@ func (r *OpenStackServerReconciler) getOrCreateServer(ctx context.Context, logge

if openStackServer.Status.InstanceID != nil {
instanceStatus, err = computeService.GetInstanceStatus(*openStackServer.Status.InstanceID)
if err != nil {
logger.Info("Unable to get OpenStack instance", "name", openStackServer.Name)
conditions.MarkFalse(openStackServer, infrav1.InstanceReadyCondition, infrav1.OpenStackErrorReason, clusterv1.ConditionSeverityError, "%s", err.Error())
if err != nil || instanceStatus == nil {
logger.Info("Unable to get OpenStack instance", "name", openStackServer.Name, "id", *openStackServer.Status.InstanceID)
var msg string
var reason string
if err != nil {
msg = err.Error()
reason = infrav1.OpenStackErrorReason
} else {
msg = infrav1.ServerUnexpectedDeletedMessage
reason = infrav1.InstanceNotFoundReason
}
conditions.MarkFalse(openStackServer, infrav1.InstanceReadyCondition, reason, clusterv1.ConditionSeverityError, msg)
return nil, err
}
}
@@ -450,11 +459,6 @@ func (r *OpenStackServerReconciler) getOrCreateServer(ctx context.Context, logge
logger.Info("Server already exists", "name", openStackServer.Name, "id", instanceStatus.ID())
return instanceStatus, nil
}
if openStackServer.Status.InstanceID != nil {
logger.Info("Not reconciling server in failed state. The previously existing OpenStack instance is no longer available")
conditions.MarkFalse(openStackServer, infrav1.InstanceReadyCondition, infrav1.InstanceNotFoundReason, clusterv1.ConditionSeverityError, "virtual machine no longer exists")
return nil, nil
}

logger.Info("Server does not exist, creating Server", "name", openStackServer.Name)
instanceSpec, err := r.serverToInstanceSpec(ctx, openStackServer)
@@ -468,6 +472,8 @@ func (r *OpenStackServerReconciler) getOrCreateServer(ctx context.Context, logge
openStackServer.Status.InstanceState = &infrav1.InstanceStateError
return nil, fmt.Errorf("create OpenStack instance: %w", err)
}
// We reached a point where the server was created without error, but we can't yet predict its state, so we don't update conditions here.
// The actual state of the server is checked in subsequent reconcile loops.
return instanceStatus, nil
}
return instanceStatus, nil
184 changes: 184 additions & 0 deletions controllers/openstackserver_controller_test.go
@@ -31,8 +31,11 @@ import (
"github.com/gophercloud/gophercloud/v2/openstack/networking/v2/ports"
. "github.com/onsi/gomega" //nolint:revive
"go.uber.org/mock/gomock"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/cluster-api/util/conditions"

infrav1alpha1 "sigs.k8s.io/cluster-api-provider-openstack/api/v1alpha1"
infrav1 "sigs.k8s.io/cluster-api-provider-openstack/api/v1beta1"
@@ -548,3 +551,184 @@ func Test_OpenStackServerReconcileCreate(t *testing.T) {
})
}
}

func TestOpenStackServerReconciler_getOrCreateServer(t *testing.T) {
tests := []struct {
name string
openStackServer *infrav1alpha1.OpenStackServer
setupMocks func(r *recorders)
wantServer *servers.Server
wantErr bool
wantCondition *clusterv1.Condition
}{
{
name: "instanceID set in status but server not found",
openStackServer: &infrav1alpha1.OpenStackServer{
Status: infrav1alpha1.OpenStackServerStatus{
InstanceID: ptr.To(instanceUUID),
},
},
setupMocks: func(r *recorders) {
r.compute.GetServer(instanceUUID).Return(nil, gophercloud.ErrUnexpectedResponseCode{Actual: 404})
},
wantErr: false,
wantCondition: &clusterv1.Condition{
Type: infrav1.InstanceReadyCondition,
Status: corev1.ConditionFalse,
Reason: infrav1.InstanceNotFoundReason,
Message: infrav1.ServerUnexpectedDeletedMessage,
},
},
{
name: "instanceID set in status but server not found with error",
openStackServer: &infrav1alpha1.OpenStackServer{
Status: infrav1alpha1.OpenStackServerStatus{
InstanceID: ptr.To(instanceUUID),
},
},
setupMocks: func(r *recorders) {
r.compute.GetServer(instanceUUID).Return(nil, fmt.Errorf("error"))
},
wantErr: true,
wantCondition: &clusterv1.Condition{
Type: infrav1.InstanceReadyCondition,
Status: corev1.ConditionFalse,
Reason: infrav1.OpenStackErrorReason,
Message: "get server \"" + instanceUUID + "\" detail failed: error",
},
},
{
name: "instanceStatus is nil but server found with machine name",
openStackServer: &infrav1alpha1.OpenStackServer{
ObjectMeta: metav1.ObjectMeta{
Name: openStackServerName,
},
Status: infrav1alpha1.OpenStackServerStatus{},
},
setupMocks: func(r *recorders) {
r.compute.ListServers(servers.ListOpts{
Name: "^" + openStackServerName + "$",
}).Return([]servers.Server{{ID: instanceUUID}}, nil)
},
wantErr: false,
wantServer: &servers.Server{
ID: instanceUUID,
},
},
{
name: "instanceStatus is nil and server not found and then created",
openStackServer: &infrav1alpha1.OpenStackServer{
ObjectMeta: metav1.ObjectMeta{
Name: openStackServerName,
},
Status: infrav1alpha1.OpenStackServerStatus{
Resolved: &infrav1alpha1.ResolvedServerSpec{
ImageID: imageUUID,
FlavorID: flavorUUID,
Ports: defaultResolvedPorts,
},
},
},
setupMocks: func(r *recorders) {
r.compute.ListServers(servers.ListOpts{
Name: "^" + openStackServerName + "$",
}).Return([]servers.Server{}, nil)
r.compute.CreateServer(gomock.Any(), gomock.Any()).Return(&servers.Server{ID: instanceUUID}, nil)
},
wantErr: false,
wantServer: &servers.Server{
ID: instanceUUID,
},
// The server creation was kicked off, but no condition is set because its
// result is not yet known within this function.
},
{
name: "instanceStatus is nil and server not found and then created with error",
openStackServer: &infrav1alpha1.OpenStackServer{
ObjectMeta: metav1.ObjectMeta{
Name: openStackServerName,
},
Status: infrav1alpha1.OpenStackServerStatus{
Resolved: &infrav1alpha1.ResolvedServerSpec{
ImageID: imageUUID,
FlavorID: flavorUUID,
Ports: defaultResolvedPorts,
},
},
},
setupMocks: func(r *recorders) {
r.compute.ListServers(servers.ListOpts{
Name: "^" + openStackServerName + "$",
}).Return([]servers.Server{}, nil)
r.compute.CreateServer(gomock.Any(), gomock.Any()).Return(nil, fmt.Errorf("error"))
},
wantErr: true,
wantCondition: &clusterv1.Condition{
Type: infrav1.InstanceReadyCondition,
Status: corev1.ConditionFalse,
Reason: infrav1.InstanceCreateFailedReason,
Message: "error creating Openstack instance: " + "error",
},
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
g := NewGomegaWithT(t)
log := testr.New(t)

mockCtrl := gomock.NewController(t)
defer mockCtrl.Finish()

mockScopeFactory := scope.NewMockScopeFactory(mockCtrl, "")
scopeWithLogger := scope.NewWithLogger(mockScopeFactory, log)

computeRecorder := mockScopeFactory.ComputeClient.EXPECT()
imageRecorder := mockScopeFactory.ImageClient.EXPECT()
networkRecorder := mockScopeFactory.NetworkClient.EXPECT()
volumeRecorder := mockScopeFactory.VolumeClient.EXPECT()

recorders := &recorders{
compute: computeRecorder,
image: imageRecorder,
network: networkRecorder,
volume: volumeRecorder,
}

if tt.setupMocks != nil {
tt.setupMocks(recorders)
}

computeService, err := compute.NewService(scopeWithLogger)
g.Expect(err).ToNot(HaveOccurred())

reconciler := OpenStackServerReconciler{}
status, err := reconciler.getOrCreateServer(ctx, log, tt.openStackServer, computeService, []string{portUUID})

// Check error result
if tt.wantErr {
g.Expect(err).To(HaveOccurred())
} else {
g.Expect(err).ToNot(HaveOccurred())
}

// Check instance status
if tt.wantServer != nil {
g.Expect(status.ID()).To(Equal(tt.wantServer.ID))
}

// Check the condition is set correctly
if tt.wantCondition != nil {
// print openstackServer conditions
for _, condition := range tt.openStackServer.Status.Conditions {
t.Logf("Condition: %s, Status: %s, Reason: %s", condition.Type, condition.Status, condition.Reason)
}
conditionType := conditions.Get(tt.openStackServer, tt.wantCondition.Type)
g.Expect(conditionType).ToNot(BeNil())
g.Expect(conditionType.Status).To(Equal(tt.wantCondition.Status))
g.Expect(conditionType.Reason).To(Equal(tt.wantCondition.Reason))
g.Expect(conditionType.Message).To(Equal(tt.wantCondition.Message))
}
})
}
}
1 change: 1 addition & 0 deletions test/e2e/data/e2e_conf.yaml
@@ -201,6 +201,7 @@ intervals:
default/wait-control-plane: ["30m", "10s"]
default/wait-worker-nodes: ["30m", "10s"]
default/wait-delete-cluster: ["5m", "10s"]
default/wait-delete-machine: ["30m", "10s"]
default/wait-alt-az: ["20m", "30s"]
default/wait-machine-upgrade: ["30m", "10s"]
default/wait-nodes-ready: ["15m", "10s"]
15 changes: 15 additions & 0 deletions test/e2e/shared/openstack.go
@@ -846,6 +846,21 @@ func CreateOpenStackNetwork(e2eCtx *E2EContext, name, cidr string) (*networks.Ne
return net, nil
}

func DeleteOpenStackServer(ctx context.Context, e2eCtx *E2EContext, id string) error {
providerClient, clientOpts, _, err := GetTenantProviderClient(e2eCtx)
if err != nil {
_, _ = fmt.Fprintf(GinkgoWriter, "error creating provider client: %s\n", err)
return err
}

computeClient, err := openstack.NewComputeV2(providerClient, gophercloud.EndpointOpts{Region: clientOpts.RegionName})
if err != nil {
return fmt.Errorf("error creating compute client: %s", err)
}

return servers.Delete(ctx, computeClient, id).ExtractErr()
}

func DeleteOpenStackNetwork(ctx context.Context, e2eCtx *E2EContext, id string) error {
providerClient, clientOpts, _, err := GetTenantProviderClient(e2eCtx)
if err != nil {
22 changes: 22 additions & 0 deletions test/e2e/suites/e2e/e2e_test.go
@@ -1006,6 +1006,28 @@ var _ = Describe("e2e tests [PR-Blocking]", func() {
Expect(additionalVolume.AvailabilityZone).To(Equal(failureDomain))
Expect(additionalVolume.VolumeType).To(Equal(volumeTypeAlt))
}

// This last block tests a scenario where an external agent deletes a server (e.g. a user via Horizon).
// We want to ensure that the OpenStackMachine conditions are updated to reflect the server deletion.
// Context: https://github.com/kubernetes-sigs/cluster-api-provider-openstack/issues/2474
shared.Logf("Deleting a server")
serverToDelete := allServers[controlPlaneMachines[0].Spec.InfrastructureRef.Name]
err = shared.DeleteOpenStackServer(ctx, e2eCtx, serverToDelete.ID)
Expect(err).NotTo(HaveOccurred())
shared.Logf("Waiting for the OpenStackMachine to have a condition that the server has been unexpectedly deleted")
Eventually(func() bool {
openStackMachine := &infrav1.OpenStackMachine{}
err := e2eCtx.Environment.BootstrapClusterProxy.GetClient().Get(ctx, crclient.ObjectKey{Name: controlPlaneMachines[0].Name, Namespace: controlPlaneMachines[0].Namespace}, openStackMachine)
if err != nil {
return false
}
for _, condition := range openStackMachine.Status.Conditions {
if condition.Type == infrav1.InstanceReadyCondition && condition.Status == corev1.ConditionFalse && condition.Reason == infrav1.InstanceDeletedReason && condition.Message == "server has been unexpectedly deleted" {
return true
}
}
return false
}, e2eCtx.E2EConfig.GetIntervals(specName, "wait-delete-machine")...).Should(BeTrue())
})
})
})