From 4735980723dbdc10e86dab15a34a2eab9073b693 Mon Sep 17 00:00:00 2001 From: Peter Portante Date: Tue, 20 Aug 2013 18:32:28 -0400 Subject: Fix infinite loop for temp file renames on ENOENT For whatever reason, it appears that GlusterFS, or perhaps FUSE can continuously return ENOENT on a rename system call even when we have double checked that there is no reason to do so. That is a bug for that sub system. However, our response to that bug can result in an infinite loop, which is bad. This code reduces that to 10 attempts. In addition, we restructed the open retry loop to match, providing module constants for the upper bounds of both retry loops. BUG: XXXXXX (https://bugzilla.redhat.com/show_bug.cgi?id=XXXXXX) Change-Id: Ia2d6dd427daba3ea0461863c5ffe3aef27c88f9b Signed-off-by: Peter Portante Reviewed-on: http://review.gluster.org/5670 Reviewed-by: Luis Pabon Tested-by: Luis Pabon --- gluster/swift/obj/diskfile.py | 86 ++++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 41 deletions(-) (limited to 'gluster/swift/obj/diskfile.py') diff --git a/gluster/swift/obj/diskfile.py b/gluster/swift/obj/diskfile.py index ce69b6d..ea39659 100644 --- a/gluster/swift/obj/diskfile.py +++ b/gluster/swift/obj/diskfile.py @@ -50,6 +50,9 @@ DEFAULT_BYTES_PER_SYNC = (512 * 1024 * 1024) # keep these lower-case DISALLOWED_HEADERS = set('content-length content-type deleted etag'.split()) +MAX_RENAME_ATTEMPTS = 10 +MAX_OPEN_ATTEMPTS = 10 + def _random_sleep(): sleep(random.uniform(0.5, 0.15)) @@ -233,15 +236,9 @@ if _fs_conf.read(os.path.join('/etc/swift', 'fs.conf')): in TRUE_VALUES except (NoSectionError, NoOptionError): _use_put_mount = False - try: - _relaxed_writes = _fs_conf.get('DEFAULT', 'relaxed_writes', "no") \ - in TRUE_VALUES - except (NoSectionError, NoOptionError): - _relaxed_writes = False else: _mkdir_locking = False _use_put_mount = False - _relaxed_writes = False if _mkdir_locking: make_directory = _make_directory_locked @@ -324,27 +321,28 @@ class DiskWriter(SwiftDiskWriter): # disk. write_metadata(self.fd, metadata) - if not _relaxed_writes: - # We call fsync() before calling drop_cache() to lower the - # amount of redundant work the drop cache code will perform on - # the pages (now that after fsync the pages will be all - # clean). - do_fsync(self.fd) - # From the Department of the Redundancy Department, make sure - # we call drop_cache() after fsync() to avoid redundant work - # (pages all clean). - drop_buffer_cache(self.fd, 0, self.upload_size) + # We call fsync() before calling drop_cache() to lower the + # amount of redundant work the drop cache code will perform on + # the pages (now that after fsync the pages will be all + # clean). + do_fsync(self.fd) + # From the Department of the Redundancy Department, make sure + # we call drop_cache() after fsync() to avoid redundant work + # (pages all clean). + drop_buffer_cache(self.fd, 0, self.upload_size) # At this point we know that the object's full directory path # exists, so we can just rename it directly without using Swift's # swift.common.utils.renamer(), which makes the directory path and # adds extra stat() calls. data_file = os.path.join(df.put_datadir, df._obj) + attempts = 1 while True: try: os.rename(self.tmppath, data_file) except OSError as err: - if err.errno in (errno.ENOENT, errno.EIO): + if err.errno in (errno.ENOENT, errno.EIO) \ + and attempts < MAX_RENAME_ATTEMPTS: # FIXME: Why either of these two error conditions is # happening is unknown at this point. This might be a # FUSE issue of some sort or a possible race @@ -368,7 +366,7 @@ class DiskWriter(SwiftDiskWriter): self.tmppath, data_file)) else: # Data file target name now has a bad path! - dfstats = do_stat(self.put_datadir) + dfstats = do_stat(df.put_datadir) if not dfstats: raise DiskFileError( 'DiskFile.put(): path to object, %s, no' @@ -393,6 +391,7 @@ class DiskWriter(SwiftDiskWriter): self.tmppath, data_file, str(err), df.put_datadir, dfstats)) + attempts += 1 continue else: raise GlusterFileSystemOSError( @@ -557,8 +556,7 @@ class DiskFile(SwiftDiskFile): see if the directory object already exists, that is left to the caller (this avoids a potentially duplicate stat() system call). - The "dir_path" must be relative to its container, - self._container_path. + The "dir_path" must be relative to its container, self._container_path. The "metadata" object is an optional set of metadata to apply to the newly created directory object. If not present, no initial metadata is @@ -624,7 +622,8 @@ class DiskFile(SwiftDiskFile): # Assume the full directory path exists to the file already, and # construct the proper name for the temporary file. - for i in range(0, 1000): + attempts = 1 + while True: tmpfile = '.' + self._obj + '.' + md5(self._obj + str(random.random())).hexdigest() tmppath = os.path.join(self.put_datadir, tmpfile) @@ -635,20 +634,29 @@ class DiskFile(SwiftDiskFile): if gerr.errno == errno.ENOSPC: # Raise DiskFileNoSpace to be handled by upper layers raise DiskFileNoSpace() + if gerr.errno not in (errno.ENOENT, errno.EEXIST, errno.EIO): + # FIXME: Other cases we should handle? + raise + if attempts >= MAX_OPEN_ATTEMPTS: + # We failed after N attempts to create the temporary + # file. + raise DiskFileError('DiskFile.mkstemp(): failed to' + ' successfully create a temporary file' + ' without running into a name conflict' + ' after %d of %d attempts for: %s' % ( + attempts, MAX_OPEN_ATTEMPTS, + data_file)) if gerr.errno == errno.EEXIST: # Retry with a different random number. - continue - if gerr.errno == errno.EIO: + attempts += 1 + elif gerr.errno == errno.EIO: # FIXME: Possible FUSE issue or race condition, let's # sleep on it and retry the operation. _random_sleep() logging.warn("DiskFile.mkstemp(): %s ... retrying in" " 0.1 secs", gerr) - continue - if gerr.errno != errno.ENOENT: - # FIXME: Other cases we should handle? - raise - if not self._obj_path: + attempts += 1 + elif not self._obj_path: # No directory hierarchy and the create failed telling us # the container or volume directory does not exist. This # could be a FUSE issue or some race condition, so let's @@ -656,26 +664,22 @@ class DiskFile(SwiftDiskFile): _random_sleep() logging.warn("DiskFile.mkstemp(): %s ... retrying in" " 0.1 secs", gerr) - continue - if i != 0: + attempts += 1 + elif attempts > 1: # Got ENOENT after previously making the path. This could # also be a FUSE issue or some race condition, nap and # retry. _random_sleep() logging.warn("DiskFile.mkstemp(): %s ... retrying in" " 0.1 secs" % gerr) - continue - # It looks like the path to the object does not already exist - self._create_dir_object(self._obj_path) - continue + attempts += 1 + else: + # It looks like the path to the object does not already + # exist; don't count this as an attempt, though, since + # we perform the open() system call optimistically. + self._create_dir_object(self._obj_path) else: break - else: - # We failed after 1,000 attempts to create the temporary file. - raise DiskFileError('DiskFile.mkstemp(): failed to successfully' - ' create a temporary file without running' - ' into a name conflict after 1,000 attempts' - ' for: %s' % (data_file,)) dw = None try: # Ensure it is properly owned before we make it available. @@ -769,8 +773,8 @@ class DiskFile(SwiftDiskFile): :raises DiskFileError: on file size mismatch. :raises DiskFileNotExist: on file not existing (including deleted) """ - #Marker directory. if self._is_dir: + # Directories have no size. return 0 try: file_size = 0 -- cgit