 da557011ec
			
		
	
	da557011ec
	
	
	
		
			
			The handoffs_first mode in the replicator has the useful behavior of processing all handoff parts across all disks until there aren't any handoffs anymore on the node [1] and then it seemingly tries to drop back into normal operation. In practice I've only ever heard of handoffs_first used while rebalancing and turned off as soon as the rebalance finishes - it's not recommended to run with handoffs_first mode turned on and it emits a warning on startup if the option is enabled. The handoffs_first mode on the reconstructor doesn't work - it was prioritizing handoffs *per-part* [2] - which is really unfortunate because in the reconstructor during a rebalance it's often *much* more attractive from a disk/network efficiency perspective to revert a partition from a handoff than it is to rebuild an entire partition from another primary using the other EC fragments in the cluster. This change deprecates handoffs_first in favor of handoffs_only in the reconstructor which is far more useful - and just like handoffs_first mode in the replicator - it gives the operator the option of forcing the consistency engine to focus on rebalance. The handoffs_only behavior is somewhat consistent with the replicator's handoffs_first option (any error on any handoff in the replicator will make it essentially handoff only forever) but the option does what you want and is named correctly in the reconstructor. For consistency with the replicator the reconstructor will mostly honor the handoffs_first option, but if you set handoffs_only in the config it always takes precedence. Having handoffs_first in your config always results in a warning, but if handoffs_only is not set and handoffs_first is true the reconstructor will assume you need handoffs_only and behave as such. When running in handoffs_only mode the reconstructor will start to log a warning every cycle if you leave it running in handoffs_only after it finishes reverting handoffs. 
However, you should be monitoring on-disk partitions and disable the option as soon as the cluster finishes the full rebalance cycle. 1. Ia324728d42c606e2f9e7d29b4ab5fcbff6e47aea fixed replicator handoffs_first "mode" 2. Unlike replication each partition in an EC policy can have a different kind of job per frag_index, but the cardinality of jobs is typically only one (either sync or revert) unless there's been a bunch of errors during write and then handoff partitions may hold a number of different fragments. Known-Issues: handoffs_only is not documented outside of the example config, see lp bug #1626290 Closes-Bug: #1653018 Change-Id: Idde4b6cf92fab6c45f2c0c2733277701eb436898
		
			
				
	
	
		
			429 lines
		
	
	
		
			16 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			429 lines
		
	
	
		
			16 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| [DEFAULT]
 | |
| # bind_ip = 0.0.0.0
 | |
| bind_port = 6200
 | |
| # bind_timeout = 30
 | |
| # backlog = 4096
 | |
| # user = swift
 | |
| # swift_dir = /etc/swift
 | |
| # devices = /srv/node
 | |
| # mount_check = true
 | |
| # disable_fallocate = false
 | |
| # expiring_objects_container_divisor = 86400
 | |
| # expiring_objects_account_name = expiring_objects
 | |
| #
 | |
| # Use an integer to override the number of pre-forked processes that will
 | |
| # accept connections.  NOTE: if servers_per_port is set, this setting is
 | |
| # ignored.
 | |
| # workers = auto
 | |
| #
 | |
| # Make object-server run this many worker processes per unique port of "local"
 | |
| # ring devices across all storage policies. The default value of 0 disables this
 | |
| # feature.
 | |
| # servers_per_port = 0
 | |
| #
 | |
| # Maximum concurrent requests per worker
 | |
| # max_clients = 1024
 | |
| #
 | |
| # You can specify default log routing here if you want:
 | |
| # log_name = swift
 | |
| # log_facility = LOG_LOCAL0
 | |
| # log_level = INFO
 | |
| # log_address = /dev/log
 | |
| # The following caps the length of log lines to the value given; no limit if
 | |
| # set to 0, the default.
 | |
| # log_max_line_length = 0
 | |
| #
 | |
| # comma separated list of functions to call to setup custom log handlers.
 | |
| # functions get passed: conf, name, log_to_console, log_route, fmt, logger,
 | |
| # adapted_logger
 | |
| # log_custom_handlers =
 | |
| #
 | |
| # If set, log_udp_host will override log_address
 | |
| # log_udp_host =
 | |
| # log_udp_port = 514
 | |
| #
 | |
| # You can enable StatsD logging here:
 | |
| # log_statsd_host =
 | |
| # log_statsd_port = 8125
 | |
| # log_statsd_default_sample_rate = 1.0
 | |
| # log_statsd_sample_rate_factor = 1.0
 | |
| # log_statsd_metric_prefix =
 | |
| #
 | |
| # eventlet_debug = false
 | |
| #
 | |
| # You can set fallocate_reserve to the number of bytes or percentage of disk
 | |
| # space you'd like fallocate to reserve, whether there is space for the given
 | |
| # file size or not. Percentage will be used if the value ends with a '%'.
 | |
| # fallocate_reserve = 1%
 | |
| #
 | |
| # Time to wait while attempting to connect to another backend node.
 | |
| # conn_timeout = 0.5
 | |
| # Time to wait while sending each chunk of data to another backend node.
 | |
| # node_timeout = 3
 | |
| # Time to wait while sending a container update on object update.
 | |
| # container_update_timeout = 1.0
 | |
| # Time to wait while receiving each chunk of data from a client or another
 | |
| # backend node.
 | |
| # client_timeout = 60
 | |
| #
 | |
| # network_chunk_size = 65536
 | |
| # disk_chunk_size = 65536
 | |
| #
 | |
| # Reclamation of tombstone files is performed primarily by the replicator and
 | |
| # the reconstructor but the object-server and object-auditor also reference
 | |
| # this value - it should be the same for all object services in the cluster,
 | |
| # and not greater than the container services reclaim_age
 | |
| # reclaim_age = 604800
 | |
| #
 | |
| # You can set scheduling priority of processes. Niceness values range from -20
 | |
| # (most favorable to the process) to 19 (least favorable to the process).
 | |
| # nice_priority =
 | |
| #
 | |
| # You can set I/O scheduling class and priority of processes. I/O niceness
 | |
| # class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
 | |
| # IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
 | |
| # 0 to 7. The higher the value, the lower the I/O priority of the process.
 | |
| # Work only with ionice_class.
 | |
| # ionice_class =
 | |
| # ionice_priority =
 | |
| 
 | |
| [pipeline:main]
 | |
| pipeline = healthcheck recon object-server
 | |
| 
 | |
| [app:object-server]
 | |
| use = egg:swift#object
 | |
| # You can override the default log routing for this app here:
 | |
| # set log_name = object-server
 | |
| # set log_facility = LOG_LOCAL0
 | |
| # set log_level = INFO
 | |
| # set log_requests = true
 | |
| # set log_address = /dev/log
 | |
| #
 | |
| # max_upload_time = 86400
 | |
| #
 | |
| # slow is the total amount of seconds an object PUT/DELETE request takes at
 | |
| # least. If it is faster, the object server will sleep this amount of time minus
 | |
| # the already passed transaction time.  This is only useful for simulating slow
 | |
| # devices on storage nodes during testing and development.
 | |
| # slow = 0
 | |
| #
 | |
| # Objects smaller than this are not evicted from the buffercache once read
 | |
| # keep_cache_size = 5242880
 | |
| #
 | |
| # If true, objects for authenticated GET requests may be kept in buffer cache
 | |
| # if small enough
 | |
| # keep_cache_private = false
 | |
| #
 | |
| # on PUTs, sync data every n MB
 | |
| # mb_per_sync = 512
 | |
| #
 | |
| # Comma separated list of headers that can be set in metadata on an object.
 | |
| # This list is in addition to X-Object-Meta-* headers and cannot include
 | |
| # Content-Type, etag, Content-Length, or deleted
 | |
| # allowed_headers = Content-Disposition, Content-Encoding, X-Delete-At, X-Object-Manifest, X-Static-Large-Object
 | |
| #
 | |
| # auto_create_account_prefix = .
 | |
| #
 | |
| # Configure parameter for creating specific server
 | |
| # To handle all verbs, including replication verbs, do not specify
 | |
| # "replication_server" (this is the default). To only handle replication,
 | |
| # set to a True value (e.g. "True" or "1"). To handle only non-replication
 | |
| # verbs, set to "False". Unless you have a separate replication network, you
 | |
| # should not specify any value for "replication_server".
 | |
| # replication_server = false
 | |
| #
 | |
| # Set to restrict the number of concurrent incoming SSYNC requests
 | |
| # Set to 0 for unlimited
 | |
| # Note that SSYNC requests are only used by the object reconstructor or the
 | |
| # object replicator when configured to use ssync.
 | |
| # replication_concurrency = 4
 | |
| #
 | |
| # Restricts incoming SSYNC requests to one per device,
 | |
| # replication_concurrency above allowing. This can help control I/O to each
 | |
| # device, but you may wish to set this to False to allow multiple SSYNC
 | |
| # requests (up to the above replication_concurrency setting) per device.
 | |
| # replication_one_per_device = True
 | |
| #
 | |
| # Number of seconds to wait for an existing replication device lock before
 | |
| # giving up.
 | |
| # replication_lock_timeout = 15
 | |
| #
 | |
| # These next two settings control when the SSYNC subrequest handler will
 | |
| # abort an incoming SSYNC attempt. An abort will occur if there are at
 | |
| # least threshold number of failures and the value of failures / successes
 | |
| # exceeds the ratio. The defaults of 100 and 1.0 means that at least 100
 | |
| # failures have to occur and there have to be more failures than successes for
 | |
| # an abort to occur.
 | |
| # replication_failure_threshold = 100
 | |
| # replication_failure_ratio = 1.0
 | |
| #
 | |
| # Use splice() for zero-copy object GETs. This requires Linux kernel
 | |
| # version 3.0 or greater. If you set "splice = yes" but the kernel
 | |
| # does not support it, error messages will appear in the object server
 | |
| # logs at startup, but your object servers should continue to function.
 | |
| #
 | |
| # splice = no
 | |
| #
 | |
| # You can set scheduling priority of processes. Niceness values range from -20
 | |
| # (most favorable to the process) to 19 (least favorable to the process).
 | |
| # nice_priority =
 | |
| #
 | |
| # You can set I/O scheduling class and priority of processes. I/O niceness
 | |
| # class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
 | |
| # IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
 | |
| # 0 to 7. The higher the value, the lower the I/O priority of the process.
 | |
| # Work only with ionice_class.
 | |
| # ionice_class =
 | |
| # ionice_priority =
 | |
| 
 | |
| [filter:healthcheck]
 | |
| use = egg:swift#healthcheck
 | |
| # An optional filesystem path, which if present, will cause the healthcheck
 | |
| # URL to return "503 Service Unavailable" with a body of "DISABLED BY FILE"
 | |
| # disable_path =
 | |
| 
 | |
| [filter:recon]
 | |
| use = egg:swift#recon
 | |
| #recon_cache_path = /var/cache/swift
 | |
| #recon_lock_path = /var/lock
 | |
| 
 | |
| [object-replicator]
 | |
| # You can override the default log routing for this app here (don't use set!):
 | |
| # log_name = object-replicator
 | |
| # log_facility = LOG_LOCAL0
 | |
| # log_level = INFO
 | |
| # log_address = /dev/log
 | |
| #
 | |
| # daemonize = on
 | |
| #
 | |
| # Time in seconds to wait between replication passes
 | |
| # interval = 30
 | |
| # run_pause is deprecated, use interval instead
 | |
| # run_pause = 30
 | |
| #
 | |
| # concurrency = 1
 | |
| # stats_interval = 300
 | |
| #
 | |
| # default is rsync, alternative is ssync
 | |
| # sync_method = rsync
 | |
| #
 | |
| # max duration of a partition rsync
 | |
| # rsync_timeout = 900
 | |
| #
 | |
| # bandwidth limit for rsync in kB/s. 0 means unlimited
 | |
| # rsync_bwlimit = 0
 | |
| #
 | |
| # passed to rsync for io op timeout
 | |
| # rsync_io_timeout = 30
 | |
| #
 | |
| # Allow rsync to compress data which is transmitted to destination node
 | |
| # during sync. However, this is applicable only when destination node is in
 | |
| # a different region than the local one.
 | |
| # NOTE: Objects that are already compressed (for example: .tar.gz, .mp3) might
 | |
| # slow down the syncing process.
 | |
| # rsync_compress = no
 | |
| #
 | |
| # Format of the rsync module where the replicator will send data. See
 | |
| # etc/rsyncd.conf-sample for some usage examples.
 | |
| # rsync_module = {replication_ip}::object
 | |
| #
 | |
| # node_timeout = <whatever's in the DEFAULT section or 10>
 | |
| # max duration of an http request; this is for REPLICATE finalization calls and
 | |
| # so should be longer than node_timeout
 | |
| # http_timeout = 60
 | |
| #
 | |
| # attempts to kill all workers if nothing replicates for lockup_timeout seconds
 | |
| # lockup_timeout = 1800
 | |
| #
 | |
| # ring_check_interval = 15
 | |
| # recon_cache_path = /var/cache/swift
 | |
| #
 | |
| # limits how long rsync error log lines are
 | |
| # 0 means to log the entire line
 | |
| # rsync_error_log_line_length = 0
 | |
| #
 | |
| # handoffs_first and handoff_delete are options for a special case
 | |
| # such as disk full in the cluster. These two options SHOULD NOT BE
 | |
| # CHANGED, except for such extreme situations. (e.g. disks filled up
 | |
| # or are about to fill up. Anyway, DO NOT let your drives fill up)
 | |
| # handoffs_first is the flag to replicate handoffs prior to canonical
 | |
| # partitions. It allows you to force syncing and deleting handoffs quickly.
 | |
| # If set to a True value (e.g. "True" or "1"), partitions
 | |
| # that are not supposed to be on the node will be replicated first.
 | |
| # handoffs_first = False
 | |
| #
 | |
| # handoff_delete is the number of replicas which are ensured in swift.
 | |
| # If a number less than the number of replicas is set, object-replicator
 | |
| # could delete local handoffs even if all replicas are not ensured in the
 | |
| # cluster. Object-replicator would remove local handoff partition directories
 | |
| # after syncing partition when the number of successful responses is greater
 | |
| # than or equal to this number. By default (auto), handoff partitions will be
 | |
| # removed when they have successfully replicated to all the canonical nodes.
 | |
| # handoff_delete = auto
 | |
| #
 | |
| # You can set scheduling priority of processes. Niceness values range from -20
 | |
| # (most favorable to the process) to 19 (least favorable to the process).
 | |
| # nice_priority =
 | |
| #
 | |
| # You can set I/O scheduling class and priority of processes. I/O niceness
 | |
| # class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
 | |
| # IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
 | |
| # 0 to 7. The higher the value, the lower the I/O priority of the process.
 | |
| # Work only with ionice_class.
 | |
| # ionice_class =
 | |
| # ionice_priority =
 | |
| 
 | |
| [object-reconstructor]
 | |
| # You can override the default log routing for this app here (don't use set!):
 | |
| # Unless otherwise noted, each setting below has the same meaning as described
 | |
| # in the [object-replicator] section, however these settings apply to the EC
 | |
| # reconstructor
 | |
| #
 | |
| # log_name = object-reconstructor
 | |
| # log_facility = LOG_LOCAL0
 | |
| # log_level = INFO
 | |
| # log_address = /dev/log
 | |
| #
 | |
| # daemonize = on
 | |
| #
 | |
| # Time in seconds to wait between reconstruction passes
 | |
| # interval = 30
 | |
| # run_pause is deprecated, use interval instead
 | |
| # run_pause = 30
 | |
| #
 | |
| # concurrency = 1
 | |
| # stats_interval = 300
 | |
| # node_timeout = 10
 | |
| # http_timeout = 60
 | |
| # lockup_timeout = 1800
 | |
| # ring_check_interval = 15
 | |
| # recon_cache_path = /var/cache/swift
 | |
| # The handoffs_only mode option is for special case emergency situations during
 | |
| # rebalance such as disk full in the cluster.  This option SHOULD NOT BE
 | |
| # CHANGED, except for extreme situations.  When handoffs_only mode is enabled
 | |
| # the reconstructor will *only* revert rebalance fragments to primaries and not
 | |
| # attempt to sync any primary parts with neighbor primaries.  This will force
 | |
| # the reconstructor to sync and delete handoff fragments more quickly and
 | |
| # minimize the time of the rebalance by limiting the number of rebuilds.  The
 | |
| # handoffs_only option is only for temporary use, it should be disabled as soon
 | |
| # as the emergency situation is resolved.  When handoffs_only is not set, the
 | |
| # deprecated handoffs_first option will be honored as a synonym, but may be
 | |
| # ignored in a future release.
 | |
| # handoffs_only = False
 | |
| #
 | |
| # You can set scheduling priority of processes. Niceness values range from -20
 | |
| # (most favorable to the process) to 19 (least favorable to the process).
 | |
| # nice_priority =
 | |
| #
 | |
| # You can set I/O scheduling class and priority of processes. I/O niceness
 | |
| # class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
 | |
| # IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
 | |
| # 0 to 7. The higher the value, the lower the I/O priority of the process.
 | |
| # Work only with ionice_class.
 | |
| # ionice_class =
 | |
| # ionice_priority =
 | |
| 
 | |
| [object-updater]
 | |
| # You can override the default log routing for this app here (don't use set!):
 | |
| # log_name = object-updater
 | |
| # log_facility = LOG_LOCAL0
 | |
| # log_level = INFO
 | |
| # log_address = /dev/log
 | |
| #
 | |
| # interval = 300
 | |
| # concurrency = 1
 | |
| # node_timeout = <whatever's in the DEFAULT section or 10>
 | |
| # slowdown will sleep that amount between objects
 | |
| # slowdown = 0.01
 | |
| #
 | |
| # recon_cache_path = /var/cache/swift
 | |
| #
 | |
| # You can set scheduling priority of processes. Niceness values range from -20
 | |
| # (most favorable to the process) to 19 (least favorable to the process).
 | |
| # nice_priority =
 | |
| #
 | |
| # You can set I/O scheduling class and priority of processes. I/O niceness
 | |
| # class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
 | |
| # IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
 | |
| # 0 to 7. The higher the value, the lower the I/O priority of the process.
 | |
| # Work only with ionice_class.
 | |
| # ionice_class =
 | |
| # ionice_priority =
 | |
| 
 | |
| [object-auditor]
 | |
| # You can override the default log routing for this app here (don't use set!):
 | |
| # log_name = object-auditor
 | |
| # log_facility = LOG_LOCAL0
 | |
| # log_level = INFO
 | |
| # log_address = /dev/log
 | |
| #
 | |
| # Time in seconds to wait between auditor passes
 | |
| # interval = 30
 | |
| #
 | |
| # You can set the disk chunk size that the auditor uses making it larger if
 | |
| # you like for more efficient local auditing of larger objects
 | |
| # disk_chunk_size = 65536
 | |
| # files_per_second = 20
 | |
| # concurrency = 1
 | |
| # bytes_per_second = 10000000
 | |
| # log_time = 3600
 | |
| # zero_byte_files_per_second = 50
 | |
| # recon_cache_path = /var/cache/swift
 | |
| 
 | |
| # Takes a comma separated list of ints. If set, the object auditor will
 | |
| # increment a counter for every object whose size is <= the given break
 | |
| # points and report the result after a full scan.
 | |
| # object_size_stats =
 | |
| #
 | |
| # You can set scheduling priority of processes. Niceness values range from -20
 | |
| # (most favorable to the process) to 19 (least favorable to the process).
 | |
| # nice_priority =
 | |
| #
 | |
| # You can set I/O scheduling class and priority of processes. I/O niceness
 | |
| # class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
 | |
| # IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
 | |
| # 0 to 7. The higher the value, the lower the I/O priority of the process.
 | |
| # Work only with ionice_class.
 | |
| # ionice_class =
 | |
| # ionice_priority =
 | |
| 
 | |
| # The auditor will cleanup old rsync tempfiles after they are "old
 | |
| # enough" to delete.  You can configure the time elapsed in seconds
 | |
| # before rsync tempfiles will be unlinked, or the default value of
 | |
| # "auto" will try to use object-replicator's rsync_timeout + 900 and fall back
 | |
| # to 86400 (1 day).
 | |
| # rsync_tempfile_timeout = auto
 | |
| 
 | |
| # Note: Put it at the beginning of the pipeline to profile all middleware. But
 | |
| # it is safer to put this after healthcheck.
 | |
| [filter:xprofile]
 | |
| use = egg:swift#xprofile
 | |
| # This option enables you to switch profilers which should inherit from python
 | |
| # standard profiler. Currently the supported value can be 'cProfile',
 | |
| # 'eventlet.green.profile' etc.
 | |
| # profile_module = eventlet.green.profile
 | |
| #
 | |
| # This prefix will be used to combine process ID and timestamp to name the
 | |
| # profile data file.  Make sure the executing user has permission to write
 | |
| # into this path (missing path segments will be created, if necessary).
 | |
| # If you enable profiling in more than one type of daemon, you must override
 | |
| # it with a unique value like: /var/log/swift/profile/object.profile
 | |
| # log_filename_prefix = /tmp/log/swift/profile/default.profile
 | |
| #
 | |
| # the profile data will be dumped to local disk based on above naming rule
 | |
| # in this interval.
 | |
| # dump_interval = 5.0
 | |
| #
 | |
| # Be careful, this option will enable profiler to dump data into the file with
 | |
| # time stamp which means there will be lots of files piled up in the directory.
 | |
| # dump_timestamp = false
 | |
| #
 | |
| # This is the path of the URL to access the mini web UI.
 | |
| # path = /__profile__
 | |
| #
 | |
| # Clear the data when the wsgi server shuts down.
 | |
| # flush_at_shutdown = false
 | |
| #
 | |
| # unwind the iterator of applications
 | |
| # unwind = false
 |