From a4128136bed424f202c407908ad4b0028977acf4 Mon Sep 17 00:00:00 2001 From: Aditya Dani Date: Wed, 8 Nov 2017 21:02:25 +0000 Subject: [PATCH 1/5] Separate the mysql app from one single container to a service and deployment model. --- .../k8s/specs/mysql/px-mysql-app.yaml | 53 ++++++++++++++++--- 1 file changed, 45 insertions(+), 8 deletions(-) diff --git a/drivers/scheduler/k8s/specs/mysql/px-mysql-app.yaml b/drivers/scheduler/k8s/specs/mysql/px-mysql-app.yaml index edd71f01b..a0b6ef3d3 100644 --- a/drivers/scheduler/k8s/specs/mysql/px-mysql-app.yaml +++ b/drivers/scheduler/k8s/specs/mysql/px-mysql-app.yaml @@ -1,8 +1,25 @@ +apiVersion: v1 +kind: Service +metadata: + name: mysql + labels: + app: mysql +spec: + ports: + - port: 3306 + selector: + app: mysql +--- apiVersion: apps/v1beta2 kind: Deployment metadata: name: mysql + labels: + app: mysql spec: + selector: + matchLabels: + app: mysql strategy: rollingUpdate: maxSurge: 1 @@ -17,7 +34,6 @@ spec: metadata: labels: app: mysql - version: "1" spec: containers: - image: mysql:5.6 @@ -30,17 +46,38 @@ spec: volumeMounts: - name: mysql-persistent-storage mountPath: /var/lib/mysql + volumes: + - name: mysql-persistent-storage + persistentVolumeClaim: + claimName: mysql-data +--- +apiVersion: apps/v1beta2 +kind: Deployment +metadata: + name: mysqlslap + labels: + app: mysql +spec: + selector: + matchLabels: + app: mysql + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + replicas: 1 + template: + metadata: + labels: + app: mysql + spec: + containers: - name: mysqlslap image: adityadani/mysqlslap imagePullPolicy: "Always" env: - name: MYSQL_ROOT_PASSWORD value: password - - name: MYSQL_SERVICE_HOST - value: "127.0.0.1" - name: MYSQL_SERVICE_PORT - value: "3306" - volumes: - - name: mysql-persistent-storage - persistentVolumeClaim: - claimName: mysql-data + value: "3306" \ No newline at end of file From 5fee89f0122d830efe4be2f2f09104a0ebda9656 Mon Sep 17 00:00:00 2001 From: Aditya Dani Date: Wed, 8 Nov 2017 21:10:11 +0000 Subject: [PATCH 2/5] Add utilities in node driver and volume driver to Fail and Recover drives. - Add two new APIs in node driver -YankDrive and RecoverDrive - Add implementations for ssh driver. - Add two new APIs in volume driver - GetStorageDevices and RecoverDriver. --- drivers/node/errors.go | 20 ++++++ drivers/node/node.go | 21 +++++++ drivers/node/ssh/ssh.go | 60 ++++++++++++++++++ drivers/volume/portworx/errors.go | 16 ++++- drivers/volume/portworx/portworx.go | 94 ++++++++++++++++++++++++++++- drivers/volume/volume.go | 8 +++ 6 files changed, 216 insertions(+), 3 deletions(-) diff --git a/drivers/node/errors.go b/drivers/node/errors.go index ab92b8992..fde9f0963 100644 --- a/drivers/node/errors.go +++ b/drivers/node/errors.go @@ -63,3 +63,23 @@ type ErrFailedToRunCommand struct { func (e *ErrFailedToRunCommand) Error() string { return fmt.Sprintf("Failed to run command on: %v. Cause: %v", e.Addr, e.Cause) } + +// ErrFailedToYankDrive error type when we fail to simulate drive failure +type ErrFailedToYankDrive struct { + Node Node + Cause string +} + +func (e *ErrFailedToYankDrive) Error() string { + return fmt.Sprintf("Failed to yank a drive on: %v. Cause: %v", e.Node.Name, e.Cause) +} + +// ErrFailedToRecoverDrive error type when we fail to simulate drive failure +type ErrFailedToRecoverDrive struct { + Node Node + Cause string +} + +func (e *ErrFailedToRecoverDrive) Error() string { + return fmt.Sprintf("Failed to recover a drive on: %v. 
Cause: %v", e.Node.Name, e.Cause) +} diff --git a/drivers/node/node.go b/drivers/node/node.go index 513abebc5..66d164c27 100644 --- a/drivers/node/node.go +++ b/drivers/node/node.go @@ -90,6 +90,13 @@ type Driver interface { // TestConnection tests connection to given node. returns nil if driver can connect to given node TestConnection(node Node, options ConnectionOpts) error + + // YankDrive simulates a failure on the provided drive on the given node. + // It returns the UUID of the drive which can be used to recover it back + YankDrive(node Node, driveNameToFail string, options ConnectionOpts) (string, error) + + // RecoverDrive recovers the given drive from failure on the given node. + RecoverDrive(node Node, driveNameToRecover string, driveUUID string, options ConnectionOpts) error } // Register registers the given node driver @@ -158,6 +165,20 @@ func (d *notSupportedDriver) Systemctl(node Node, service string, options System } } +func (d *notSupportedDriver) YankDrive(node Node, driveToFail string, options ConnectionOpts) (string, error) { + return "", &errors.ErrNotSupported{ + Type: "Function", + Operation: "YankDrive()", + } +} + +func (d *notSupportedDriver) RecoverDrive(node Node, driveToRecover string, driveID string, options ConnectionOpts) error { + return &errors.ErrNotSupported{ + Type: "Function", + Operation: "RecoverDrive()", + } +} + func (d *notSupportedDriver) TestConnection(node Node, options ConnectionOpts) error { return &errors.ErrNotSupported{ Type: "Function", diff --git a/drivers/node/ssh/ssh.go b/drivers/node/ssh/ssh.go index f59dbfa88..ccf2e0196 100644 --- a/drivers/node/ssh/ssh.go +++ b/drivers/node/ssh/ssh.go @@ -5,6 +5,7 @@ import ( "io/ioutil" "os" "strconv" + "strings" "time" "github.com/portworx/sched-ops/task" @@ -181,6 +182,65 @@ func (s *ssh) ShutdownNode(n node.Node, options node.ShutdownNodeOpts) error { return nil } +func (s *ssh) YankDrive(n node.Node, driveNameToFail string, options node.ConnectionOpts) (string, error) { + // Currently only works for iSCSI drives + // TODO: Make it generic (Add support dev mapper devices) + addr, err := s.getAddrToConnect(n, options) + if err != nil { + return "", &node.ErrFailedToYankDrive{ + Node: n, + Cause: fmt.Sprintf("failed to get node address due to: %v", err), + } + } + + // Get the HBA number for the drive which would be then used to recover the drive + hbaCmd := "lsscsi | grep -n " + driveNameToFail + "| awk -F\":\" '{print $2}'" + "| awk -F\"[\" '{print $2}'" + driveID, err := s.doCmd(addr, hbaCmd, false) + if err != nil { + return "", &node.ErrFailedToYankDrive{ + Node: n, + Cause: fmt.Sprintf("unable to find HBA attribute of the drive %v due to: %v", driveNameToFail, err), + } + } + + driveID = strings.TrimRight(driveID, "\n") + driveNameToFail = strings.TrimRight(strings.TrimLeft(driveNameToFail, "/"), "/") + devices := strings.Split(driveNameToFail, "/") + + // Disable the block device, so that it returns IO errors + yankCommand := "echo 1 > /sys/block/" + devices[len(devices)-1] + "/device/delete" + + _, err = s.doCmd(addr, yankCommand, false) + if err != nil { + return "", &node.ErrFailedToYankDrive{ + Node: n, + Cause: fmt.Sprintf("failed to yank drive %v due to: %v", driveNameToFail, err), + } + } + return driveID, nil +} + +func (s *ssh) RecoverDrive(n node.Node, driveNameToRecover string, driveUUIDToRecover string, options node.ConnectionOpts) error { + addr, err := s.getAddrToConnect(n, options) + if err != nil { + return &node.ErrFailedToRecoverDrive{ + Node: n, + Cause: 
fmt.Sprintf("failed to get node address due to: %v", err), + } + } + + // Enable the drive by rescaning + recoverCmd := "echo \" - - -\" > /sys/class/scsi_host/host" + driveUUIDToRecover + "/scan" + _, err = s.doCmd(addr, recoverCmd, false) + if err != nil { + return &node.ErrFailedToRecoverDrive{ + Node: n, + Cause: fmt.Sprintf("Unable to rescan the drive (%v): %v", driveNameToRecover, err), + } + } + return nil +} + func (s *ssh) FindFiles(path string, n node.Node, options node.FindOpts) (string, error) { addr, err := s.getAddrToConnect(n, options.ConnectionOpts) if err != nil { diff --git a/drivers/volume/portworx/errors.go b/drivers/volume/portworx/errors.go index a205244be..89b7b650a 100644 --- a/drivers/volume/portworx/errors.go +++ b/drivers/volume/portworx/errors.go @@ -38,9 +38,9 @@ func (e *ErrFailedToDeleteVolume) Error() string { return fmt.Sprintf("Failed to delete volume: %v due to err: %v", e.ID, e.Cause) } -// ErrFailedToWaitForPx error type for failing to wait for px to be up on a node +// ErrFailedToWaitForPx error type for failing to wait for PX to be up on a node type ErrFailedToWaitForPx struct { - // Node is the node on which px was waited upon + // Node is the node on which PX was waited upon Node node.Node // Cause is the underlying cause of the error Cause string @@ -61,3 +61,15 @@ type ErrFailedToUpgradeVolumeDriver struct { func (e *ErrFailedToUpgradeVolumeDriver) Error() string { return fmt.Sprintf("Failed to upgrade volume driver to version: %v due to err: %v", e.Version, e.Cause) } + +// ErrFailedToRecoverDriver error type for failing to recover PX on a node +type ErrFailedToRecoverDriver struct { + // Node is the node on which PX failed to recover on + Node node.Node + // Cause is the underlying cause of the error + Cause string +} + +func (e *ErrFailedToRecoverDriver) Error() string { + return fmt.Sprintf("Failed to wait for px to be up on: %v due to err: %v", e.Node.Name, e.Cause) +} diff --git a/drivers/volume/portworx/portworx.go b/drivers/volume/portworx/portworx.go index ecbdcc6ef..569c7277d 100644 --- a/drivers/volume/portworx/portworx.go +++ b/drivers/volume/portworx/portworx.go @@ -8,6 +8,7 @@ import ( "time" "github.com/libopenstorage/openstorage/api" + "github.com/libopenstorage/openstorage/api/client" clusterclient "github.com/libopenstorage/openstorage/api/client/cluster" volumeclient "github.com/libopenstorage/openstorage/api/client/volume" "github.com/libopenstorage/openstorage/api/spec" @@ -27,6 +28,9 @@ const ( pxdClientSchedUserAgent = "pxd-sched" pxdRestPort = 9001 pxVersionLabel = "PX Version" + maintenanceOpRetries = 3 + enterMaintenancePath = "/entermaintenance" + exitMaintenancePath = "/exitmaintenance" ) type portworx struct { @@ -152,6 +156,78 @@ func (d *portworx) CleanupVolume(name string) error { return nil } +func (d *portworx) GetStorageDevices(n node.Node) ([]string, error) { + const ( + storageInfoKey = "STORAGE-INFO" + resourcesKey = "Resources" + pathKey = "path" + ) + pxNode, err := d.getClusterManager().Inspect(n.Name) + if err != nil { + return []string{}, err + } + devPaths := []string{} + + storageInfo, ok := pxNode.NodeData[storageInfoKey] + if !ok { + return []string{}, fmt.Errorf("Unable to find storage info for node: %v", n.Name) + } + storageInfoMap := storageInfo.(map[string]interface{}) + + resourcesMapIntf, ok := storageInfoMap[resourcesKey] + if !ok || resourcesMapIntf == nil { + return []string{}, fmt.Errorf("Unable to find resource info for node: %v", n.Name) + } + resourcesMap := 
resourcesMapIntf.(map[string]interface{}) + + for _, v := range resourcesMap { + resource := v.(map[string]interface{}) + path, _ := resource[pathKey] + if path == "" { + continue + } + devPaths = append(devPaths, path.(string)) + } + return devPaths, nil +} + +func (d *portworx) RecoverDriver(n node.Node) error { + + t := func() (interface{}, error) { + return nil, d.maintenanceOp(n, enterMaintenancePath) + } + + if _, err := task.DoRetryWithTimeout(t, 1*time.Minute, 10*time.Second); err != nil { + return err + } + t = func() (interface{}, error) { + apiNode, err := d.getClusterManager().Inspect(n.Name) + if err != nil { + return nil, err + } + if apiNode.Status == api.Status_STATUS_MAINTENANCE { + return nil, nil + } + return nil, fmt.Errorf("Node %v is not in Maintenance mode", n.Name) + } + + if _, err := task.DoRetryWithTimeout(t, 1*time.Minute, 10*time.Second); err != nil { + return &ErrFailedToRecoverDriver{ + Node: n, + Cause: err.Error(), + } + } + t = func() (interface{}, error) { + return nil, d.maintenanceOp(n, exitMaintenancePath) + } + + if _, err := task.DoRetryWithTimeout(t, 1*time.Minute, 10*time.Second); err != nil { + return err + } + + return nil +} + func (d *portworx) ValidateCreateVolume(name string, params map[string]string) error { t := func() (interface{}, error) { vols, err := d.getVolDriver().Inspect([]string{name}) @@ -476,7 +552,7 @@ func (d *portworx) setDriver() error { } func (d *portworx) testAndSetEndpoint(endpoint string) error { - pxEndpoint := fmt.Sprintf("http://%s:%d", endpoint, pxdRestPort) + pxEndpoint := d.constructURL(endpoint) cClient, err := clusterclient.NewClusterClient(pxEndpoint, "v1") if err != nil { return err @@ -533,6 +609,22 @@ func (d *portworx) getClusterManager() cluster.Cluster { d.setDriver() } return d.clusterManager + +} + +func (d *portworx) maintenanceOp(n node.Node, op string) error { + url := d.constructURL(n.Addresses[0]) + c, err := client.NewClient(url, "", "") + if err != nil { + return err + } + req := c.Get().Resource(op) + resp := req.Do() + return resp.Error() +} + +func (d *portworx) constructURL(ip string) string { + return fmt.Sprintf("http://%s:%d", ip, pxdRestPort) } func init() { diff --git a/drivers/volume/volume.go b/drivers/volume/volume.go index 28c612cf5..b12a66019 100644 --- a/drivers/volume/volume.go +++ b/drivers/volume/volume.go @@ -56,6 +56,14 @@ type Driver interface { // RandomizeVolumeName randomizes the volume name from the given name RandomizeVolumeName(name string) string + + // RecoverDriver will recover a volume driver from a failure/storage down state. + // This could be used by a volume driver to recover itself from any underlying storage + // failure. + RecoverDriver(n node.Node) error + + // GetStorageDevices returns the list of storage devices used by the given node. + GetStorageDevices(n node.Node) ([]string, error) } var ( From b3b3d2ed7d3dbfe31158ebcf26137bfd877c4d00 Mon Sep 17 00:00:00 2001 From: Aditya Dani Date: Wed, 8 Nov 2017 21:11:27 +0000 Subject: [PATCH 3/5] Add a ginkgo test to induce a drive failure and check apps. --- tests/drive_failure/drive_failure_test.go | 105 ++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 tests/drive_failure/drive_failure_test.go diff --git a/tests/drive_failure/drive_failure_test.go b/tests/drive_failure/drive_failure_test.go new file mode 100644 index 000000000..a81f8e7ef --- /dev/null +++ b/tests/drive_failure/drive_failure_test.go @@ -0,0 +1,105 @@ +package tests + +import ( + "fmt" + "testing" + "time" + + . 
"github.com/onsi/ginkgo" + . "github.com/onsi/gomega" + "github.com/portworx/torpedo/drivers/node" + . "github.com/portworx/torpedo/tests" +) + +func TestDriveFailure(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Torpedo: DriveFailure") +} + +var _ = BeforeSuite(func() { + InitInstance() +}) + +func driveFailureTest(testName string) { + It("has to schedule apps and induce a drive failure on one of the nodes", func() { + var err error + contexts := ScheduleAndValidate(testName) + + Step("get nodes for all apps in test and induce drive failure on one of the nodes", func() { + for _, ctx := range contexts { + var ( + drives []string + appNodes []node.Node + nodeWithDrive node.Node + driveToFail, driveID string + ) + + Step(fmt.Sprintf("get nodes where %s app is running", ctx.App.Key), func() { + appNodes, err = Inst().S.GetNodesForApp(ctx) + Expect(err).NotTo(HaveOccurred()) + Expect(appNodes).NotTo(BeEmpty()) + nodeWithDrive = appNodes[0] + }) + + Step(fmt.Sprintf("get drive from node %v", nodeWithDrive), func() { + drives, err = Inst().V.GetStorageDevices(nodeWithDrive) + Expect(err).NotTo(HaveOccurred()) + Expect(drives).NotTo(BeEmpty()) + driveToFail = drives[0] + }) + + Step(fmt.Sprintf("induce a drive failure on %v on node %v", driveToFail, nodeWithDrive), func() { + driveID, err = Inst().N.YankDrive(nodeWithDrive, driveToFail, node.ConnectionOpts{ + Timeout: 1 * time.Minute, + TimeBeforeRetry: 5 * time.Second, + }) + Expect(err).NotTo(HaveOccurred()) + + Step("wait for the drive to fail", func() { + time.Sleep(30 * time.Second) + }) + + Step(fmt.Sprintf("check if apps are running"), func() { + ValidateContext(ctx) + }) + + }) + + Step(fmt.Sprintf("recover drive and the storage driver"), func() { + err = Inst().N.RecoverDrive(nodeWithDrive, driveToFail, driveID, node.ConnectionOpts{ + Timeout: 1 * time.Minute, + TimeBeforeRetry: 5 * time.Second, + }) + Expect(err).NotTo(HaveOccurred()) + + err = Inst().V.RecoverDriver(nodeWithDrive) + Expect(err).NotTo(HaveOccurred()) + }) + + Step(fmt.Sprintf("check if volume driver is up"), func() { + err = Inst().V.WaitForNode(nodeWithDrive) + Expect(err).NotTo(HaveOccurred()) + }) + } + }) + + Step("validate and destroy apps", func() { + for _, ctx := range contexts { + ValidateAndDestroy(ctx, nil) + } + }) + + }) +} + +var _ = Describe("Induce drive failure on of the nodes", func() { + driveFailureTest("drivefailure") +}) + +var _ = AfterSuite(func() { + ValidateCleanup() +}) + +func init() { + ParseFlags() +} From 6765b8295dcf49b4aa619f0ce1c5b9d74033cdc9 Mon Sep 17 00:00:00 2001 From: Aditya Dani Date: Thu, 9 Nov 2017 18:32:01 +0000 Subject: [PATCH 4/5] Handle review comments --- drivers/node/ssh/ssh.go | 2 +- drivers/volume/portworx/portworx.go | 10 +++++----- tests/drive_failure/drive_failure_test.go | 7 ++----- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/node/ssh/ssh.go b/drivers/node/ssh/ssh.go index ccf2e0196..dc355c35c 100644 --- a/drivers/node/ssh/ssh.go +++ b/drivers/node/ssh/ssh.go @@ -204,7 +204,7 @@ func (s *ssh) YankDrive(n node.Node, driveNameToFail string, options node.Connec } driveID = strings.TrimRight(driveID, "\n") - driveNameToFail = strings.TrimRight(strings.TrimLeft(driveNameToFail, "/"), "/") + driveNameToFail = strings.Trim(driveNameToFail, "/") devices := strings.Split(driveNameToFail, "/") // Disable the block device, so that it returns IO errors diff --git a/drivers/volume/portworx/portworx.go b/drivers/volume/portworx/portworx.go index 569c7277d..112e724b9 100644 --- 
a/drivers/volume/portworx/portworx.go +++ b/drivers/volume/portworx/portworx.go @@ -162,24 +162,24 @@ func (d *portworx) GetStorageDevices(n node.Node) ([]string, error) { resourcesKey = "Resources" pathKey = "path" ) - pxNode, err := d.getClusterManager().Inspect(n.Name) + pxNode, err := d.getClusterManager().Inspect(n.VolDriverNodeID) if err != nil { - return []string{}, err + return nil, err } - devPaths := []string{} storageInfo, ok := pxNode.NodeData[storageInfoKey] if !ok { - return []string{}, fmt.Errorf("Unable to find storage info for node: %v", n.Name) + return nil, fmt.Errorf("Unable to find storage info for node: %v", n.Name) } storageInfoMap := storageInfo.(map[string]interface{}) resourcesMapIntf, ok := storageInfoMap[resourcesKey] if !ok || resourcesMapIntf == nil { - return []string{}, fmt.Errorf("Unable to find resource info for node: %v", n.Name) + return nil, fmt.Errorf("Unable to find resource info for node: %v", n.Name) } resourcesMap := resourcesMapIntf.(map[string]interface{}) + devPaths := []string{} for _, v := range resourcesMap { resource := v.(map[string]interface{}) path, _ := resource[pathKey] diff --git a/tests/drive_failure/drive_failure_test.go b/tests/drive_failure/drive_failure_test.go index a81f8e7ef..15d6bd052 100644 --- a/tests/drive_failure/drive_failure_test.go +++ b/tests/drive_failure/drive_failure_test.go @@ -20,7 +20,8 @@ var _ = BeforeSuite(func() { InitInstance() }) -func driveFailureTest(testName string) { +var _ = Describe("Induce drive failure on of the nodes", func() { + testName := "drivefailure" It("has to schedule apps and induce a drive failure on one of the nodes", func() { var err error contexts := ScheduleAndValidate(testName) @@ -90,10 +91,6 @@ func driveFailureTest(testName string) { }) }) -} - -var _ = Describe("Induce drive failure on of the nodes", func() { - driveFailureTest("drivefailure") }) var _ = AfterSuite(func() { From 89657a47e93ac0e1b5b94640b55483ae2b7265eb Mon Sep 17 00:00:00 2001 From: Aditya Dani Date: Mon, 11 Dec 2017 01:09:41 -0800 Subject: [PATCH 5/5] Handle review comments --- .../k8s/specs/mysql/px-mysql-storage.yaml | 2 +- tests/drive_failure/drive_failure_test.go | 44 +++++++++++-------- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/drivers/scheduler/k8s/specs/mysql/px-mysql-storage.yaml b/drivers/scheduler/k8s/specs/mysql/px-mysql-storage.yaml index 3c3a1f6b1..173c3f0cd 100644 --- a/drivers/scheduler/k8s/specs/mysql/px-mysql-storage.yaml +++ b/drivers/scheduler/k8s/specs/mysql/px-mysql-storage.yaml @@ -5,7 +5,7 @@ metadata: name: px-mysql-sc provisioner: kubernetes.io/portworx-volume parameters: - repl: "2" + repl: "3" --- ##### Portworx persistent volume claim kind: PersistentVolumeClaim diff --git a/tests/drive_failure/drive_failure_test.go b/tests/drive_failure/drive_failure_test.go index 15d6bd052..6f54b6462 100644 --- a/tests/drive_failure/drive_failure_test.go +++ b/tests/drive_failure/drive_failure_test.go @@ -20,7 +20,7 @@ var _ = BeforeSuite(func() { InitInstance() }) -var _ = Describe("Induce drive failure on of the nodes", func() { +var _ = Describe("Induce drive failure on one of the nodes", func() { testName := "drivefailure" It("has to schedule apps and induce a drive failure on one of the nodes", func() { var err error @@ -29,10 +29,9 @@ var _ = Describe("Induce drive failure on of the nodes", func() { Step("get nodes for all apps in test and induce drive failure on one of the nodes", func() { for _, ctx := range contexts { var ( - drives []string - appNodes []node.Node - 
nodeWithDrive node.Node - driveToFail, driveID string + drives []string + appNodes []node.Node + nodeWithDrive node.Node ) Step(fmt.Sprintf("get nodes where %s app is running", ctx.App.Key), func() { @@ -46,17 +45,19 @@ var _ = Describe("Induce drive failure on of the nodes", func() { drives, err = Inst().V.GetStorageDevices(nodeWithDrive) Expect(err).NotTo(HaveOccurred()) Expect(drives).NotTo(BeEmpty()) - driveToFail = drives[0] }) - Step(fmt.Sprintf("induce a drive failure on %v on node %v", driveToFail, nodeWithDrive), func() { - driveID, err = Inst().N.YankDrive(nodeWithDrive, driveToFail, node.ConnectionOpts{ - Timeout: 1 * time.Minute, - TimeBeforeRetry: 5 * time.Second, - }) - Expect(err).NotTo(HaveOccurred()) - - Step("wait for the drive to fail", func() { + driveInfoMap := make(map[string]string) + Step(fmt.Sprintf("induce a failure on all drives on the node %v", nodeWithDrive), func() { + for _, driveToFail := range drives { + driveID, err := Inst().N.YankDrive(nodeWithDrive, driveToFail, node.ConnectionOpts{ + Timeout: 1 * time.Minute, + TimeBeforeRetry: 5 * time.Second, + }) + driveInfoMap[driveToFail] = driveID + Expect(err).NotTo(HaveOccurred()) + } + Step("wait for the drives to fail", func() { time.Sleep(30 * time.Second) }) @@ -66,12 +67,17 @@ var _ = Describe("Induce drive failure on of the nodes", func() { }) - Step(fmt.Sprintf("recover drive and the storage driver"), func() { - err = Inst().N.RecoverDrive(nodeWithDrive, driveToFail, driveID, node.ConnectionOpts{ - Timeout: 1 * time.Minute, - TimeBeforeRetry: 5 * time.Second, + Step(fmt.Sprintf("recover all drives and the storage driver"), func() { + for _, driveToFail := range drives { + err = Inst().N.RecoverDrive(nodeWithDrive, driveToFail, driveInfoMap[driveToFail], node.ConnectionOpts{ + Timeout: 2 * time.Minute, + TimeBeforeRetry: 5 * time.Second, + }) + Expect(err).NotTo(HaveOccurred()) + } + Step("wait for the drives to recover", func() { + time.Sleep(30 * time.Second) }) - Expect(err).NotTo(HaveOccurred()) err = Inst().V.RecoverDriver(nodeWithDrive) Expect(err).NotTo(HaveOccurred())
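
The two failure/recovery mechanisms this series introduces boil down to a handful of primitives. The node driver's YankDrive/RecoverDrive (patch 2) are two sysfs writes issued over SSH: delete the block device so it starts returning IO errors, then rescan its SCSI host to bring it back. The standalone Go sketch below is illustrative only and is not part of the patches; the device name /dev/sdb and SCSI host number "2" are hypothetical stand-ins for what GetStorageDevices() and the lsscsi lookup in YankDrive() actually return.

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strings"
)

// yankDrive simulates a drive failure by deleting the block device from sysfs,
// mirroring the ssh driver's "echo 1 > /sys/block/<dev>/device/delete".
func yankDrive(devicePath string) error {
	dev := filepath.Base(strings.TrimRight(devicePath, "/")) // "/dev/sdb" -> "sdb"
	target := fmt.Sprintf("/sys/block/%s/device/delete", dev)
	return os.WriteFile(target, []byte("1\n"), 0200)
}

// recoverDrive re-adds the device by rescanning its SCSI host, mirroring
// "echo '- - -' > /sys/class/scsi_host/host<N>/scan" in RecoverDrive.
// hostNum is the HBA number that YankDrive extracts from lsscsi output.
func recoverDrive(hostNum string) error {
	target := fmt.Sprintf("/sys/class/scsi_host/host%s/scan", hostNum)
	return os.WriteFile(target, []byte("- - -\n"), 0200)
}

func main() {
	// Hypothetical device and host number; on a real node these come from
	// the volume driver's GetStorageDevices() and the lsscsi lookup above.
	if err := yankDrive("/dev/sdb"); err != nil {
		fmt.Fprintln(os.Stderr, "yank failed:", err)
	}
	if err := recoverDrive("2"); err != nil {
		fmt.Fprintln(os.Stderr, "rescan failed:", err)
	}
}
```

The volume driver's RecoverDriver (patch 2) instead cycles the node through Portworx maintenance mode using the REST paths added as constants (enterMaintenancePath, exitMaintenancePath) on the pxd REST port 9001. The real driver goes through the openstorage client (maintenanceOp uses client.NewClient and c.Get().Resource(op)); the plain net/http sketch below is an approximation under that assumption, with a hypothetical node address.

```go
package main

import (
	"fmt"
	"net/http"
)

// maintenanceOp hits the PX REST endpoint for entering or exiting maintenance,
// the same /entermaintenance and /exitmaintenance paths the portworx driver
// calls through the openstorage client before and after its status check.
func maintenanceOp(nodeAddr, op string) error {
	url := fmt.Sprintf("http://%s:9001%s", nodeAddr, op)
	resp, err := http.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("%s returned %s", url, resp.Status)
	}
	return nil
}

func main() {
	// Hypothetical node address; torpedo takes this from node.Addresses[0].
	addr := "192.0.2.10"
	for _, op := range []string{"/entermaintenance", "/exitmaintenance"} {
		if err := maintenanceOp(addr, op); err != nil {
			fmt.Println("maintenance op failed:", err)
		}
	}
}
```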