From 1ec7b48fde2c1deb02ba554c1ed964c467b58b87 Mon Sep 17 00:00:00 2001 From: agireesh Date: Mon, 2 Jun 2025 01:05:18 -0400 Subject: [PATCH] NetApp - Fixed Device busy error when multiple cinder volume clone created in parallel A few cinder volume clone operations failed during bulk clone creation. Added retry logic to ensure the NetApp driver retries any failed clone operations. Closes-Bug: #2112245 Change-Id: I734ade0fc9c938c15eb46e3b74e44c843c71d3e3 --- .../netapp/dataontap/test_block_cmode.py | 52 +++++++++++++ .../drivers/netapp/dataontap/block_cmode.py | 77 +++++++++++++++++-- cinder/volume/drivers/netapp/options.py | 14 +++- ...reation-device-issue-b9d82f7a826c9f2b.yaml | 8 ++ 4 files changed, 142 insertions(+), 9 deletions(-) create mode 100644 releasenotes/notes/bug-2112245-bulk-vms-creation-device-issue-b9d82f7a826c9f2b.yaml diff --git a/cinder/tests/unit/volume/drivers/netapp/dataontap/test_block_cmode.py b/cinder/tests/unit/volume/drivers/netapp/dataontap/test_block_cmode.py index 90851da7cbf..aebeff23db6 100644 --- a/cinder/tests/unit/volume/drivers/netapp/dataontap/test_block_cmode.py +++ b/cinder/tests/unit/volume/drivers/netapp/dataontap/test_block_cmode.py @@ -379,6 +379,58 @@ class NetAppBlockStorageCmodeLibraryTestCase(test.TestCase): qos_policy_group_is_adaptive=False, source_snapshot=None, is_snapshot=True) + def test_clone_lun_busy_exception(self): + """Test for when clone lun is throwing device busy error.""" + self.library._get_lun_attr = mock.Mock( + return_value={'Volume': 'fakeLUN'}) + self.library.zapi_client = mock.Mock() + lun = fake.FAKE_LUN_GET_ITER_RESULT + self.library.zapi_client.get_lun_by_args.return_value = lun + self.library._add_lun_to_table = mock.Mock() + msg = 'Device busy' + self.mock_object(self.library.zapi_client, + 'clone_lun', + mock.Mock(side_effect=netapp_api.NaApiError( + message=msg))) + self.mock_object(self.library, + '_retry_clone_lun', + mock.Mock(return_value=None) + ) + self.library._clone_lun('fakeLUN', 
'newFakeLUN', is_snapshot=True) + + self.library.zapi_client.clone_lun.assert_called_once_with( + 'fakeLUN', 'fakeLUN', 'newFakeLUN', 'true', block_count=0, + dest_block=0, src_block=0, qos_policy_group_name=None, + qos_policy_group_is_adaptive=False, + source_snapshot=None, is_snapshot=True) + + def test__retry_clone_lun_success(self): + self.library.zapi_client = mock.Mock() + self.library._retry_clone_lun('fakeSourceLUN', + 'fakeLUN', + 'newFakeLUN', + 'false', + ) + self.library.zapi_client.clone_lun.assert_called_once_with( + 'fakeSourceLUN', 'fakeLUN', 'newFakeLUN', 'false', block_count=0, + dest_block=0, src_block=0, qos_policy_group_name=None, + qos_policy_group_is_adaptive=False, source_snapshot=None, + is_snapshot=False) + + def test_retry_clone_lun_failure(self): + self.library.zapi_client = mock.Mock() + self.mock_object(self.library.zapi_client, + 'clone_lun', + mock.Mock( + side_effect=na_utils.NetAppDriverException), + ) + self.assertRaises( + na_utils.NetAppDriverException, + self.library._retry_clone_lun, + 'fakeLUN', 'fakeLUN', + 'newFakeLUN', 'false', + ) + def test_get_fc_target_wwpns(self): ports = [fake.FC_FORMATTED_TARGET_WWPNS[0], fake.FC_FORMATTED_TARGET_WWPNS[1]] diff --git a/cinder/volume/drivers/netapp/dataontap/block_cmode.py b/cinder/volume/drivers/netapp/dataontap/block_cmode.py index fb3e67ca070..b10c3f59491 100644 --- a/cinder/volume/drivers/netapp/dataontap/block_cmode.py +++ b/cinder/volume/drivers/netapp/dataontap/block_cmode.py @@ -23,6 +23,7 @@ """ Volume driver library for NetApp C-mode block storage systems. 
""" +import time from oslo_log import log as logging from oslo_service import loopingcall @@ -33,6 +34,7 @@ from cinder import exception from cinder.i18n import _ from cinder.objects import fields from cinder.volume.drivers.netapp.dataontap import block_base +from cinder.volume.drivers.netapp.dataontap.client import api as netapp_api from cinder.volume.drivers.netapp.dataontap.performance import perf_cmode from cinder.volume.drivers.netapp.dataontap.utils import capabilities from cinder.volume.drivers.netapp.dataontap.utils import data_motion @@ -235,14 +237,30 @@ class NetAppBlockStorageCmodeLibrary( metadata = self._get_lun_attr(name, 'metadata') volume = metadata['Volume'] - self.zapi_client.clone_lun( - volume, name, new_name, space_reserved, - qos_policy_group_name=qos_policy_group_name, - src_block=src_block, dest_block=dest_block, - block_count=block_count, - source_snapshot=source_snapshot, - is_snapshot=is_snapshot, - qos_policy_group_is_adaptive=qos_policy_group_is_adaptive) + try: + self.zapi_client.clone_lun( + volume, name, new_name, space_reserved, + qos_policy_group_name=qos_policy_group_name, + src_block=src_block, dest_block=dest_block, + block_count=block_count, + source_snapshot=source_snapshot, + is_snapshot=is_snapshot, + qos_policy_group_is_adaptive=qos_policy_group_is_adaptive, + ) + except netapp_api.NaApiError as e: + with excutils.save_and_reraise_exception() as exc_context: + if 'Device busy' in e.message: + self._retry_clone_lun( + volume, name, new_name, space_reserved, + qos_policy_group_name=qos_policy_group_name, + src_block=src_block, dest_block=dest_block, + block_count=block_count, + source_snapshot=source_snapshot, + is_snapshot=is_snapshot, + qos_policy_group_is_adaptive=( + qos_policy_group_is_adaptive), + ) + exc_context.reraise = False LOG.debug("Cloned LUN with new name %s", new_name) lun = self.zapi_client.get_lun_by_args(vserver=self.vserver, @@ -260,6 +278,49 @@ class NetAppBlockStorageCmodeLibrary( clone_lun['Size'], 
clone_lun)) + def _retry_clone_lun(self, volume, name, new_name, space_reserved, + qos_policy_group_name=None, src_block=0, + dest_block=0, block_count=0, + source_snapshot=None, is_snapshot=False, + qos_policy_group_is_adaptive=False): + """Retry lun clone creation when ONTAP throws device busy error""" + # timeout and interval are configurable parameters that the user can + # specify under the backend stanza. If the user does not set these + # values, default values will be used. For example, if timeout is set + # to 60 seconds and interval is set to 5 seconds, then this code will + # retry the LUN clone every 5 seconds until the 60-second timeout is + # reached. + timeout = self.configuration.safe_get('netapp_lun_clone_busy_timeout') + interval = self.configuration.safe_get( + 'netapp_lun_clone_busy_interval') + retries = int(timeout / interval) + + for attempt in range(1, retries + 1): + try: + self.zapi_client.clone_lun( + volume, name, new_name, space_reserved, + qos_policy_group_name=qos_policy_group_name, + src_block=src_block, dest_block=dest_block, + block_count=block_count, + source_snapshot=source_snapshot, + is_snapshot=is_snapshot, + qos_policy_group_is_adaptive=qos_policy_group_is_adaptive, + ) + LOG.info("LUN clone succeeded on attempt %s.", attempt) + break + except netapp_api.NaApiError as e: + if 'Device busy' in e.message: + LOG.debug("Attempt %s failed with device busy error." 
+ "Retrying after %s seconds...", attempt, + interval) + if attempt == retries: + msg = _("Timed out after %s retry for LUN clone" + " creation") + raise na_utils.NetAppDriverException(msg % retries) + time.sleep(interval) + else: + raise netapp_api.NaApiError(e.code, e.message) + def _get_fc_target_wwpns(self, include_partner=True): return self.zapi_client.get_fc_target_wwpns() diff --git a/cinder/volume/drivers/netapp/options.py b/cinder/volume/drivers/netapp/options.py index 7a365f73a15..ca3959a85fa 100644 --- a/cinder/volume/drivers/netapp/options.py +++ b/cinder/volume/drivers/netapp/options.py @@ -257,7 +257,19 @@ netapp_san_opts = [ 'applied to the names of objects from the storage ' 'backend which represent pools in Cinder. This option ' 'is only utilized when the storage protocol is ' - 'configured to use iSCSI or FC.')), ] + 'configured to use iSCSI or FC.')), + cfg.IntOpt('netapp_lun_clone_busy_timeout', + min=0, + default=30, + help='Specifies the maximum time (in seconds) to retry' + ' the LUN clone operation when an ONTAP "device busy"' + ' error occurs.'), + cfg.IntOpt('netapp_lun_clone_busy_interval', + min=0, + default=3, + help='Specifies the time interval (in seconds) to retry' + ' the LUN clone operation when an ONTAP "device busy"' + ' error occurs.')] netapp_replication_opts = [ cfg.MultiOpt('netapp_replication_aggregate_map', diff --git a/releasenotes/notes/bug-2112245-bulk-vms-creation-device-issue-b9d82f7a826c9f2b.yaml b/releasenotes/notes/bug-2112245-bulk-vms-creation-device-issue-b9d82f7a826c9f2b.yaml new file mode 100644 index 00000000000..5c47c51fda1 --- /dev/null +++ b/releasenotes/notes/bug-2112245-bulk-vms-creation-device-issue-b9d82f7a826c9f2b.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + NetApp Driver `bug #2112245 + <https://bugs.launchpad.net/cinder/+bug/2112245>`_: Fixed the issue where + a few cinder volume clone operations failed during bulk clone creation. + Added retry logic to ensure the NetApp driver retries any failed clone + operations.