Merge "NetApp - Fixed Device busy error when multiple cinder volume clone created in parallel"

This commit is contained in:
Zuul
2025-06-27 23:34:34 +00:00
committed by Gerrit Code Review
4 changed files with 142 additions and 9 deletions

View File

@@ -379,6 +379,58 @@ class NetAppBlockStorageCmodeLibraryTestCase(test.TestCase):
qos_policy_group_is_adaptive=False,
source_snapshot=None, is_snapshot=True)
def test_clone_lun_busy_exception(self):
    """Verify _clone_lun hands off to the retry helper on 'Device busy'.

    The ZAPI client's clone_lun is made to raise NaApiError with a
    'Device busy' message and _retry_clone_lun is stubbed out. The test
    checks that the initial clone is attempted exactly once and that the
    retry helper is then invoked with the same arguments.
    """
    self.library._get_lun_attr = mock.Mock(
        return_value={'Volume': 'fakeLUN'})
    self.library.zapi_client = mock.Mock()
    lun = fake.FAKE_LUN_GET_ITER_RESULT
    self.library.zapi_client.get_lun_by_args.return_value = lun
    self.library._add_lun_to_table = mock.Mock()
    msg = 'Device busy'
    self.mock_object(self.library.zapi_client,
                     'clone_lun',
                     mock.Mock(side_effect=netapp_api.NaApiError(
                         message=msg)))
    self.mock_object(self.library,
                     '_retry_clone_lun',
                     mock.Mock(return_value=None)
                     )

    self.library._clone_lun('fakeLUN', 'newFakeLUN', is_snapshot=True)

    self.library.zapi_client.clone_lun.assert_called_once_with(
        'fakeLUN', 'fakeLUN', 'newFakeLUN', 'true', block_count=0,
        dest_block=0, src_block=0, qos_policy_group_name=None,
        qos_policy_group_is_adaptive=False,
        source_snapshot=None, is_snapshot=True)
    # Without this check the test would still pass if the busy error were
    # swallowed and no retry was ever attempted.
    self.library._retry_clone_lun.assert_called_once_with(
        'fakeLUN', 'fakeLUN', 'newFakeLUN', 'true', block_count=0,
        dest_block=0, src_block=0, qos_policy_group_name=None,
        qos_policy_group_is_adaptive=False,
        source_snapshot=None, is_snapshot=True)
def test__retry_clone_lun_success(self):
    """Check that a first-attempt success calls clone_lun exactly once."""
    client = mock.Mock()
    self.library.zapi_client = client

    self.library._retry_clone_lun(
        'fakeSourceLUN', 'fakeLUN', 'newFakeLUN', 'false')

    # Only positional arguments were supplied above, so every keyword is
    # expected to arrive at the client with its default value.
    expected_kwargs = {
        'block_count': 0,
        'dest_block': 0,
        'src_block': 0,
        'qos_policy_group_name': None,
        'qos_policy_group_is_adaptive': False,
        'source_snapshot': None,
        'is_snapshot': False,
    }
    client.clone_lun.assert_called_once_with(
        'fakeSourceLUN', 'fakeLUN', 'newFakeLUN', 'false',
        **expected_kwargs)
def test_retry_clone_lun_failure(self):
    """Check that a NetAppDriverException from clone_lun propagates."""
    client = mock.Mock()
    self.library.zapi_client = client
    failing_clone = mock.Mock(side_effect=na_utils.NetAppDriverException)
    self.mock_object(client, 'clone_lun', failing_clone)

    retry_args = ('fakeLUN', 'fakeLUN', 'newFakeLUN', 'false')
    self.assertRaises(na_utils.NetAppDriverException,
                      self.library._retry_clone_lun,
                      *retry_args)
def test_get_fc_target_wwpns(self):
ports = [fake.FC_FORMATTED_TARGET_WWPNS[0],
fake.FC_FORMATTED_TARGET_WWPNS[1]]

View File

@@ -23,6 +23,7 @@
"""
Volume driver library for NetApp C-mode block storage systems.
"""
import time
from oslo_log import log as logging
from oslo_service import loopingcall
@@ -33,6 +34,7 @@ from cinder import exception
from cinder.i18n import _
from cinder.objects import fields
from cinder.volume.drivers.netapp.dataontap import block_base
from cinder.volume.drivers.netapp.dataontap.client import api as netapp_api
from cinder.volume.drivers.netapp.dataontap.performance import perf_cmode
from cinder.volume.drivers.netapp.dataontap.utils import capabilities
from cinder.volume.drivers.netapp.dataontap.utils import data_motion
@@ -235,14 +237,30 @@ class NetAppBlockStorageCmodeLibrary(
metadata = self._get_lun_attr(name, 'metadata')
volume = metadata['Volume']
self.zapi_client.clone_lun(
volume, name, new_name, space_reserved,
qos_policy_group_name=qos_policy_group_name,
src_block=src_block, dest_block=dest_block,
block_count=block_count,
source_snapshot=source_snapshot,
is_snapshot=is_snapshot,
qos_policy_group_is_adaptive=qos_policy_group_is_adaptive)
try:
self.zapi_client.clone_lun(
volume, name, new_name, space_reserved,
qos_policy_group_name=qos_policy_group_name,
src_block=src_block, dest_block=dest_block,
block_count=block_count,
source_snapshot=source_snapshot,
is_snapshot=is_snapshot,
qos_policy_group_is_adaptive=qos_policy_group_is_adaptive,
)
except netapp_api.NaApiError as e:
with excutils.save_and_reraise_exception() as exc_context:
if 'Device busy' in e.message:
self._retry_clone_lun(
volume, name, new_name, space_reserved,
qos_policy_group_name=qos_policy_group_name,
src_block=src_block, dest_block=dest_block,
block_count=block_count,
source_snapshot=source_snapshot,
is_snapshot=is_snapshot,
qos_policy_group_is_adaptive=(
qos_policy_group_is_adaptive),
)
exc_context.reraise = False
LOG.debug("Cloned LUN with new name %s", new_name)
lun = self.zapi_client.get_lun_by_args(vserver=self.vserver,
@@ -260,6 +278,49 @@ class NetAppBlockStorageCmodeLibrary(
clone_lun['Size'],
clone_lun))
def _retry_clone_lun(self, volume, name, new_name, space_reserved,
                     qos_policy_group_name=None, src_block=0,
                     dest_block=0, block_count=0,
                     source_snapshot=None, is_snapshot=False,
                     qos_policy_group_is_adaptive=False):
    """Retry LUN clone creation while ONTAP reports a device busy error.

    The number of attempts is derived from two backend options,
    ``netapp_lun_clone_busy_timeout`` and
    ``netapp_lun_clone_busy_interval``. For example, a timeout of 60
    seconds with an interval of 5 seconds gives 12 attempts, with a
    5-second sleep between consecutive attempts. At least one attempt is
    always made, so this method never returns without a successful
    clone.

    All arguments are forwarded unchanged to ``zapi_client.clone_lun``.

    :raises NetAppDriverException: if the last attempt still fails with
        a device busy error.
    :raises NaApiError: immediately, for any API error that is not a
        device busy error.
    """
    timeout = self.configuration.safe_get(
        'netapp_lun_clone_busy_timeout') or 0
    interval = self.configuration.safe_get(
        'netapp_lun_clone_busy_interval') or 0
    # Both options allow 0. Guard against dividing by a zero interval and
    # against a zero attempt count, which would skip the loop entirely
    # and make the caller believe the clone had been created.
    retries = max(1, int(timeout / interval)) if interval else 1

    for attempt in range(1, retries + 1):
        try:
            self.zapi_client.clone_lun(
                volume, name, new_name, space_reserved,
                qos_policy_group_name=qos_policy_group_name,
                src_block=src_block, dest_block=dest_block,
                block_count=block_count,
                source_snapshot=source_snapshot,
                is_snapshot=is_snapshot,
                qos_policy_group_is_adaptive=qos_policy_group_is_adaptive,
            )
        except netapp_api.NaApiError as e:
            if 'Device busy' not in (e.message or ''):
                # Not a busy condition: re-raise the original error with
                # its traceback intact instead of building a new one.
                raise
            if attempt == retries:
                msg = _("Timed out after %s retry for LUN clone"
                        " creation")
                raise na_utils.NetAppDriverException(msg % retries) from e
            LOG.debug("Attempt %s failed with device busy error. "
                      "Retrying after %s seconds...", attempt,
                      interval)
            time.sleep(interval)
        else:
            LOG.info("LUN clone succeeded on attempt %s.", attempt)
            return
def _get_fc_target_wwpns(self, include_partner=True):
return self.zapi_client.get_fc_target_wwpns()

View File

@@ -257,7 +257,19 @@ netapp_san_opts = [
'applied to the names of objects from the storage '
'backend which represent pools in Cinder. This option '
'is only utilized when the storage protocol is '
'configured to use iSCSI or FC.')), ]
'configured to use iSCSI or FC.')),
cfg.IntOpt('netapp_lun_clone_busy_timeout',
min=0,
default=30,
help='Specifies the maximum time (in seconds) to retry'
' the LUN clone operation when an ONTAP "device busy"'
' error occurs.'),
cfg.IntOpt('netapp_lun_clone_busy_interval',
min=0,
default=3,
help='Specifies the time interval (in seconds) to retry'
' the LUN clone operation when an ONTAP "device busy"'
' error occurs.')]
netapp_replication_opts = [
cfg.MultiOpt('netapp_replication_aggregate_map',

View File

@@ -0,0 +1,8 @@
---
fixes:
- |
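NetApp Driver `bug #2112245
<https://bugs.launchpad.net/cinder/+bug/2112245>`_: Fixed an issue where
some Cinder volume clone operations failed with an ONTAP "device busy"
error when many clones were created in parallel. The NetApp driver now
retries the LUN clone when this error occurs; the retry window and
interval are set with the new ``netapp_lun_clone_busy_timeout`` and
``netapp_lun_clone_busy_interval`` backend options.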