| field     | value                                                      | date                      |
|-----------|------------------------------------------------------------|---------------------------|
| author    | Petr Mladek <pmladek@suse.com>                             | 2021-11-02 10:39:27 +0100 |
| committer | Petr Mladek <pmladek@suse.com>                             | 2021-11-02 10:39:27 +0100 |
| commit    | 40e64a88dadcfa168914065baf7f035de957bbe0 (patch)           |                           |
| tree      | 06c8c4a9e6c1b478aa6851794c6a33bec1ce6ec4 /fs/xfs/xfs_log.c |                           |
| parent    | 24a1dffbecafeb00d8830985eb7a318e37aabc4e (diff)            |                           |
| parent    | 6a7ca80f4033c9cf3003625b2ef8b497f4ec44da (diff)            |                           |
Merge branch 'for-5.16-vsprintf-pgp' into for-linus
Diffstat (limited to 'fs/xfs/xfs_log.c')
-rw-r--r-- | fs/xfs/xfs_log.c | 721
1 file changed, 375 insertions, 346 deletions
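
Most of the churn in this diff comes from replacing the old `XLOG_FORCED_SHUTDOWN()` macro and the `XLOG_STATE_IOERROR` iclog state with a single atomic operational-state bit: `XLOG_IO_ERROR` in `log->l_opstate`, checked via `xlog_is_shutdown()` and set exactly once in the new `xlog_force_shutdown()` while holding `l_icloglock`. The snippet below is a minimal userspace sketch of that one-shot shutdown-flag idiom, assuming a pthread mutex and C11 atomics as stand-ins for the kernel's spinlock and `test_and_set_bit()`; the struct, function names, and simplified return value are illustrative, not the kernel implementation. The full diff follows.

```c
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for the XLOG_IO_ERROR bit in log->l_opstate. */
#define SKETCH_IO_ERROR_BIT 0UL

struct xlog_sketch {
	atomic_ulong	opstate;	/* analogue of log->l_opstate */
	pthread_mutex_t	icloglock;	/* analogue of log->l_icloglock */
};

/* Lockless check, analogous to xlog_is_shutdown(). */
static bool sketch_is_shutdown(struct xlog_sketch *log)
{
	return (atomic_load(&log->opstate) & (1UL << SKETCH_IO_ERROR_BIT)) != 0;
}

/*
 * One-shot shutdown, analogous to xlog_force_shutdown(): set the bit with
 * an atomic fetch-or while holding the lock, because the rest of the state
 * machine assumes the shutdown state cannot change under icloglock.
 * Returns true only for the caller that actually performed the shutdown.
 */
static bool sketch_force_shutdown(struct xlog_sketch *log)
{
	unsigned long old;

	pthread_mutex_lock(&log->icloglock);
	old = atomic_fetch_or(&log->opstate, 1UL << SKETCH_IO_ERROR_BIT);
	pthread_mutex_unlock(&log->icloglock);

	return (old & (1UL << SKETCH_IO_ERROR_BIT)) == 0;
}

int main(void)
{
	struct xlog_sketch log = {
		.opstate = 0,
		.icloglock = PTHREAD_MUTEX_INITIALIZER,
	};

	printf("first shutdown performed: %d\n", sketch_force_shutdown(&log));  /* 1 */
	printf("second shutdown performed: %d\n", sketch_force_shutdown(&log)); /* 0 */
	printf("is shutdown: %d\n", sketch_is_shutdown(&log));                  /* 1 */
	return 0;
}
```

Build with `cc -std=c11 -pthread`. Taking the lock around an already-atomic test-and-set mirrors the comment added in the diff: other log state transitions assume the shutdown state cannot change while they hold `l_icloglock`.
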
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 60ac5fd63f1e..f6cd2d4aa770 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -41,6 +41,8 @@ xlog_dealloc_log( /* local state machine functions */ STATIC void xlog_state_done_syncing( struct xlog_in_core *iclog); +STATIC void xlog_state_do_callback( + struct xlog *log); STATIC int xlog_state_get_iclog_space( struct xlog *log, @@ -50,11 +52,6 @@ xlog_state_get_iclog_space( int *continued_write, int *logoffsetp); STATIC void -xlog_state_switch_iclogs( - struct xlog *log, - struct xlog_in_core *iclog, - int eventual_size); -STATIC void xlog_grant_push_ail( struct xlog *log, int need_bytes); @@ -246,7 +243,7 @@ xlog_grant_head_wait( list_add_tail(&tic->t_queue, &head->waiters); do { - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) goto shutdown; xlog_grant_push_ail(log, need_bytes); @@ -260,7 +257,7 @@ xlog_grant_head_wait( trace_xfs_log_grant_wake(log, tic); spin_lock(&head->lock); - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) goto shutdown; } while (xlog_space_left(log, &head->grant) < need_bytes); @@ -298,7 +295,7 @@ xlog_grant_head_check( int free_bytes; int error = 0; - ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); + ASSERT(!xlog_in_recovery(log)); /* * If there are other waiters on the queue then give them a chance at @@ -359,13 +356,13 @@ xfs_log_writable( * mounts allow internal writes for log recovery and unmount purposes, * so don't restrict that case. */ - if (mp->m_flags & XFS_MOUNT_NORECOVERY) + if (xfs_has_norecovery(mp)) return false; if (xfs_readonly_buftarg(mp->m_ddev_targp)) return false; if (xfs_readonly_buftarg(mp->m_log->l_targ)) return false; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xlog_is_shutdown(mp->m_log)) return false; return true; } @@ -382,7 +379,7 @@ xfs_log_regrant( int need_bytes; int error = 0; - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) return -EIO; XFS_STATS_INC(mp, xs_try_logspace); @@ -450,7 +447,7 @@ xfs_log_reserve( ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) return -EIO; XFS_STATS_INC(mp, xs_try_logspace); @@ -487,6 +484,42 @@ out_error: } /* + * Run all the pending iclog callbacks and wake log force waiters and iclog + * space waiters so they can process the newly set shutdown state. We really + * don't care what order we process callbacks here because the log is shut down + * and so state cannot change on disk anymore. + * + * We avoid processing actively referenced iclogs so that we don't run callbacks + * while the iclog owner might still be preparing the iclog for IO submssion. + * These will be caught by xlog_state_iclog_release() and call this function + * again to process any callbacks that may have been added to that iclog. + */ +static void +xlog_state_shutdown_callbacks( + struct xlog *log) +{ + struct xlog_in_core *iclog; + LIST_HEAD(cb_list); + + spin_lock(&log->l_icloglock); + iclog = log->l_iclog; + do { + if (atomic_read(&iclog->ic_refcnt)) { + /* Reference holder will re-run iclog callbacks. */ + continue; + } + list_splice_init(&iclog->ic_callbacks, &cb_list); + wake_up_all(&iclog->ic_write_wait); + wake_up_all(&iclog->ic_force_wait); + } while ((iclog = iclog->ic_next) != log->l_iclog); + + wake_up_all(&log->l_flush_wait); + spin_unlock(&log->l_icloglock); + + xlog_cil_process_committed(&cb_list); +} + +/* * Flush iclog to disk if this is the last reference to the given iclog and the * it is in the WANT_SYNC state. 
* @@ -520,12 +553,11 @@ xlog_state_release_iclog( xfs_lsn_t old_tail_lsn) { xfs_lsn_t tail_lsn; + bool last_ref; + lockdep_assert_held(&log->l_icloglock); trace_xlog_iclog_release(iclog, _RET_IP_); - if (iclog->ic_state == XLOG_STATE_IOERROR) - return -EIO; - /* * Grabbing the current log tail needs to be atomic w.r.t. the writing * of the tail LSN into the iclog so we guarantee that the log tail does @@ -543,7 +575,23 @@ xlog_state_release_iclog( iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); } - if (!atomic_dec_and_test(&iclog->ic_refcnt)) + last_ref = atomic_dec_and_test(&iclog->ic_refcnt); + + if (xlog_is_shutdown(log)) { + /* + * If there are no more references to this iclog, process the + * pending iclog callbacks that were waiting on the release of + * this iclog. + */ + if (last_ref) { + spin_unlock(&log->l_icloglock); + xlog_state_shutdown_callbacks(log); + spin_lock(&log->l_icloglock); + } + return -EIO; + } + + if (!last_ref) return 0; if (iclog->ic_state != XLOG_STATE_WANT_SYNC) { @@ -580,25 +628,27 @@ xfs_log_mount( xfs_daddr_t blk_offset, int num_bblks) { - bool fatal = xfs_sb_version_hascrc(&mp->m_sb); + struct xlog *log; + bool fatal = xfs_has_crc(mp); int error = 0; int min_logfsbs; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + if (!xfs_has_norecovery(mp)) { xfs_notice(mp, "Mounting V%d Filesystem", XFS_SB_VERSION_NUM(&mp->m_sb)); } else { xfs_notice(mp, "Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", XFS_SB_VERSION_NUM(&mp->m_sb)); - ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + ASSERT(xfs_is_readonly(mp)); } - mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); - if (IS_ERR(mp->m_log)) { - error = PTR_ERR(mp->m_log); + log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); + if (IS_ERR(log)) { + error = PTR_ERR(log); goto out; } + mp->m_log = log; /* * Validate the given log space and drop a critical message via syslog @@ -663,51 +713,51 @@ xfs_log_mount( xfs_warn(mp, "AIL initialisation failed: error %d", error); goto out_free_log; } - mp->m_log->l_ailp = mp->m_ail; + log->l_ailp = mp->m_ail; /* * skip log recovery on a norecovery mount. pretend it all * just worked. */ - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { - int readonly = (mp->m_flags & XFS_MOUNT_RDONLY); - - if (readonly) - mp->m_flags &= ~XFS_MOUNT_RDONLY; - - error = xlog_recover(mp->m_log); - + if (!xfs_has_norecovery(mp)) { + /* + * log recovery ignores readonly state and so we need to clear + * mount-based read only state so it can write to disk. + */ + bool readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, + &mp->m_opstate); + error = xlog_recover(log); if (readonly) - mp->m_flags |= XFS_MOUNT_RDONLY; + set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); if (error) { xfs_warn(mp, "log mount/recovery failed: error %d", error); - xlog_recover_cancel(mp->m_log); + xlog_recover_cancel(log); goto out_destroy_ail; } } - error = xfs_sysfs_init(&mp->m_log->l_kobj, &xfs_log_ktype, &mp->m_kobj, + error = xfs_sysfs_init(&log->l_kobj, &xfs_log_ktype, &mp->m_kobj, "log"); if (error) goto out_destroy_ail; /* Normal transactions can now occur */ - mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; + clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); /* * Now the log has been fully initialised and we know were our * space grant counters are, we can initialise the permanent ticket * needed for delayed logging to work. 
*/ - xlog_cil_init_post_recovery(mp->m_log); + xlog_cil_init_post_recovery(log); return 0; out_destroy_ail: xfs_trans_ail_destroy(mp); out_free_log: - xlog_dealloc_log(mp->m_log); + xlog_dealloc_log(log); out: return error; } @@ -726,19 +776,22 @@ int xfs_log_mount_finish( struct xfs_mount *mp) { - int error = 0; - bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY); - bool recovered = mp->m_log->l_flags & XLOG_RECOVERY_NEEDED; + struct xlog *log = mp->m_log; + bool readonly; + int error = 0; - if (mp->m_flags & XFS_MOUNT_NORECOVERY) { - ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + if (xfs_has_norecovery(mp)) { + ASSERT(xfs_is_readonly(mp)); return 0; - } else if (readonly) { - /* Allow unlinked processing to proceed */ - mp->m_flags &= ~XFS_MOUNT_RDONLY; } /* + * log recovery ignores readonly state and so we need to clear + * mount-based read only state so it can write to disk. + */ + readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); + + /* * During the second phase of log recovery, we need iget and * iput to behave like they do for an active filesystem. * xfs_fs_drop_inode needs to be able to prevent the deletion @@ -759,7 +812,8 @@ xfs_log_mount_finish( * mount failure occurs. */ mp->m_super->s_flags |= SB_ACTIVE; - error = xlog_recover_finish(mp->m_log); + if (xlog_recovery_needed(log)) + error = xlog_recover_finish(log); if (!error) xfs_log_work_queue(mp); mp->m_super->s_flags &= ~SB_ACTIVE; @@ -774,17 +828,24 @@ xfs_log_mount_finish( * Don't push in the error case because the AIL may have pending intents * that aren't removed until recovery is cancelled. */ - if (!error && recovered) { - xfs_log_force(mp, XFS_LOG_SYNC); - xfs_ail_push_all_sync(mp->m_ail); + if (xlog_recovery_needed(log)) { + if (!error) { + xfs_log_force(mp, XFS_LOG_SYNC); + xfs_ail_push_all_sync(mp->m_ail); + } + xfs_notice(mp, "Ending recovery (logdev: %s)", + mp->m_logname ? mp->m_logname : "internal"); + } else { + xfs_info(mp, "Ending clean mount"); } xfs_buftarg_drain(mp->m_ddev_targp); + clear_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate); if (readonly) - mp->m_flags |= XFS_MOUNT_RDONLY; + set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); /* Make sure the log is dead if we're returning failure. */ - ASSERT(!error || (mp->m_log->l_flags & XLOG_IO_ERROR)); + ASSERT(!error || xlog_is_shutdown(log)); return error; } @@ -830,7 +891,7 @@ xlog_wait_on_iclog( struct xlog *log = iclog->ic_log; trace_xlog_iclog_wait_on(iclog, _RET_IP_); - if (!XLOG_FORCED_SHUTDOWN(log) && + if (!xlog_is_shutdown(log) && iclog->ic_state != XLOG_STATE_ACTIVE && iclog->ic_state != XLOG_STATE_DIRTY) { XFS_STATS_INC(log->l_mp, xs_log_force_sleep); @@ -839,7 +900,7 @@ xlog_wait_on_iclog( spin_unlock(&log->l_icloglock); } - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) return -EIO; return 0; } @@ -870,7 +931,7 @@ xlog_write_unmount_record( /* account for space used by record data */ ticket->t_curr_res -= sizeof(ulf); - return xlog_write(log, &vec, ticket, NULL, NULL, XLOG_UNMOUNT_TRANS); + return xlog_write(log, NULL, &vec, ticket, XLOG_UNMOUNT_TRANS); } /* @@ -893,7 +954,7 @@ xlog_unmount_write( error = xlog_write_unmount_record(log, tic); /* * At this point, we're umounting anyway, so there's no point in - * transitioning log state to IOERROR. Just continue... + * transitioning log state to shutdown. Just continue... 
*/ out_err: if (error) @@ -940,7 +1001,7 @@ xfs_log_unmount_write( xfs_log_force(mp, XFS_LOG_SYNC); - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) return; /* @@ -972,6 +1033,20 @@ int xfs_log_quiesce( struct xfs_mount *mp) { + /* + * Clear log incompat features since we're quiescing the log. Report + * failures, though it's not fatal to have a higher log feature + * protection level than the log contents actually require. + */ + if (xfs_clear_incompat_log_features(mp)) { + int error; + + error = xfs_sync_sb(mp, false); + if (error) + xfs_warn(mp, + "Failed to clear log incompat features on quiesce"); + } + cancel_delayed_work_sync(&mp->m_log->l_work); xfs_log_force(mp, XFS_LOG_SYNC); @@ -1049,11 +1124,11 @@ xfs_log_space_wake( struct xlog *log = mp->m_log; int free_bytes; - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) return; if (!list_empty_careful(&log->l_write_head.waiters)) { - ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); + ASSERT(!xlog_in_recovery(log)); spin_lock(&log->l_write_head.lock); free_bytes = xlog_space_left(log, &log->l_write_head.grant); @@ -1062,7 +1137,7 @@ xfs_log_space_wake( } if (!list_empty_careful(&log->l_reserve_head.waiters)) { - ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); + ASSERT(!xlog_in_recovery(log)); spin_lock(&log->l_reserve_head.lock); free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); @@ -1140,7 +1215,7 @@ xfs_log_cover( ASSERT((xlog_cil_empty(mp->m_log) && xlog_iclogs_empty(mp->m_log) && !xfs_ail_min_lsn(mp->m_log->l_ailp)) || - XFS_FORCED_SHUTDOWN(mp)); + xlog_is_shutdown(mp->m_log)); if (!xfs_log_writable(mp)) return 0; @@ -1157,7 +1232,7 @@ xfs_log_cover( * handles this for us. */ need_covered = xfs_log_need_covered(mp); - if (!need_covered && !xfs_sb_version_haslazysbcount(&mp->m_sb)) + if (!need_covered && !xfs_has_lazysbcount(mp)) return 0; /* @@ -1230,16 +1305,18 @@ xlog_assign_tail_lsn( * wrap the tail, we should blow up. Rather than catch this case here, * we depend on other ASSERTions in other parts of the code. XXXmiken * - * This code also handles the case where the reservation head is behind - * the tail. The details of this case are described below, but the end - * result is that we return the size of the log as the amount of space left. + * If reservation head is behind the tail, we have a problem. Warn about it, + * but then treat it as if the log is empty. + * + * If the log is shut down, the head and tail may be invalid or out of whack, so + * shortcut invalidity asserts in this case so that we don't trigger them + * falsely. */ STATIC int xlog_space_left( struct xlog *log, atomic64_t *head) { - int free_bytes; int tail_bytes; int tail_cycle; int head_cycle; @@ -1249,29 +1326,30 @@ xlog_space_left( xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes); tail_bytes = BBTOB(tail_bytes); if (tail_cycle == head_cycle && head_bytes >= tail_bytes) - free_bytes = log->l_logsize - (head_bytes - tail_bytes); - else if (tail_cycle + 1 < head_cycle) + return log->l_logsize - (head_bytes - tail_bytes); + if (tail_cycle + 1 < head_cycle) return 0; - else if (tail_cycle < head_cycle) { + + /* Ignore potential inconsistency when shutdown. */ + if (xlog_is_shutdown(log)) + return log->l_logsize; + + if (tail_cycle < head_cycle) { ASSERT(tail_cycle == (head_cycle - 1)); - free_bytes = tail_bytes - head_bytes; - } else { - /* - * The reservation head is behind the tail. - * In this case we just want to return the size of the - * log as the amount of space left. 
- */ - xfs_alert(log->l_mp, "xlog_space_left: head behind tail"); - xfs_alert(log->l_mp, - " tail_cycle = %d, tail_bytes = %d", - tail_cycle, tail_bytes); - xfs_alert(log->l_mp, - " GH cycle = %d, GH bytes = %d", - head_cycle, head_bytes); - ASSERT(0); - free_bytes = log->l_logsize; + return tail_bytes - head_bytes; } - return free_bytes; + + /* + * The reservation head is behind the tail. In this case we just want to + * return the size of the log as the amount of space left. + */ + xfs_alert(log->l_mp, "xlog_space_left: head behind tail"); + xfs_alert(log->l_mp, " tail_cycle = %d, tail_bytes = %d", + tail_cycle, tail_bytes); + xfs_alert(log->l_mp, " GH cycle = %d, GH bytes = %d", + head_cycle, head_bytes); + ASSERT(0); + return log->l_logsize; } @@ -1349,6 +1427,32 @@ xfs_log_work_queue( } /* + * Clear the log incompat flags if we have the opportunity. + * + * This only happens if we're about to log the second dummy transaction as part + * of covering the log and we can get the log incompat feature usage lock. + */ +static inline void +xlog_clear_incompat( + struct xlog *log) +{ + struct xfs_mount *mp = log->l_mp; + + if (!xfs_sb_has_incompat_log_feature(&mp->m_sb, + XFS_SB_FEAT_INCOMPAT_LOG_ALL)) + return; + + if (log->l_covered_state != XLOG_STATE_COVER_DONE2) + return; + + if (!down_write_trylock(&log->l_incompat_users)) + return; + + xfs_clear_incompat_log_features(mp); + up_write(&log->l_incompat_users); +} + +/* * Every sync period we need to unpin all items in the AIL and push them to * disk. If there is nothing dirty, then we might need to cover the log to * indicate that the filesystem is idle. @@ -1374,6 +1478,7 @@ xfs_log_worker( * synchronously log the superblock instead to ensure the * superblock is immediately unpinned and can be written back. 
*/ + xlog_clear_incompat(log); xfs_sync_sb(mp, true); } else xfs_log_force(mp, 0); @@ -1417,7 +1522,7 @@ xlog_alloc_log( log->l_logBBstart = blk_offset; log->l_logBBsize = num_bblks; log->l_covered_state = XLOG_STATE_COVER_IDLE; - log->l_flags |= XLOG_ACTIVE_RECOVERY; + set_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); log->l_prev_block = -1; @@ -1426,7 +1531,7 @@ xlog_alloc_log( xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ - if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) + if (xfs_has_logv2(mp) && mp->m_sb.sb_logsunit > 1) log->l_iclog_roundoff = mp->m_sb.sb_logsunit; else log->l_iclog_roundoff = BBSIZE; @@ -1435,7 +1540,7 @@ xlog_alloc_log( xlog_grant_head_init(&log->l_write_head); error = -EFSCORRUPTED; - if (xfs_sb_version_hassector(&mp->m_sb)) { + if (xfs_has_sector(mp)) { log2_size = mp->m_sb.sb_logsectlog; if (log2_size < BBSHIFT) { xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)", @@ -1452,7 +1557,7 @@ xlog_alloc_log( /* for larger sector sizes, must have v2 or external log */ if (log2_size && log->l_logBBstart > 0 && - !xfs_sb_version_haslogv2(&mp->m_sb)) { + !xfs_has_logv2(mp)) { xfs_warn(mp, "log sector size (0x%x) invalid for configuration.", log2_size); @@ -1461,6 +1566,8 @@ xlog_alloc_log( } log->l_sectBBsize = 1 << log2_size; + init_rwsem(&log->l_incompat_users); + xlog_get_iclog_buffer_size(mp, log); spin_lock_init(&log->l_icloglock); @@ -1476,7 +1583,6 @@ xlog_alloc_log( */ ASSERT(log->l_iclog_size >= 4096); for (i = 0; i < log->l_iclog_bufs; i++) { - int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp); size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * sizeof(struct bio_vec); @@ -1488,8 +1594,8 @@ xlog_alloc_log( iclog->ic_prev = prev_iclog; prev_iclog = iclog; - iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask, - KM_MAYFAIL | KM_ZERO); + iclog->ic_data = kvzalloc(log->l_iclog_size, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!iclog->ic_data) goto out_free_iclog; #ifdef DEBUG @@ -1499,7 +1605,7 @@ xlog_alloc_log( memset(head, 0, sizeof(xlog_rec_header_t)); head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); head->h_version = cpu_to_be32( - xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); + xfs_has_logv2(log->l_mp) ? 2 : 1); head->h_size = cpu_to_be32(log->l_iclog_size); /* new fields */ head->h_fmt = cpu_to_be32(XLOG_FMT); @@ -1551,37 +1657,6 @@ out: } /* xlog_alloc_log */ /* - * Write out the commit record of a transaction associated with the given - * ticket to close off a running log write. Return the lsn of the commit record. - */ -int -xlog_commit_record( - struct xlog *log, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - xfs_lsn_t *lsn) -{ - struct xfs_log_iovec reg = { - .i_addr = NULL, - .i_len = 0, - .i_type = XLOG_REG_TYPE_COMMIT, - }; - struct xfs_log_vec vec = { - .lv_niovecs = 1, - .lv_iovecp = ®, - }; - int error; - - if (XLOG_FORCED_SHUTDOWN(log)) - return -EIO; - - error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS); - if (error) - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); - return error; -} - -/* * Compute the LSN that we'd need to push the log tail towards in order to have * (a) enough on-disk log space to log the number of bytes specified, (b) at * least 25% of the log space free, and (c) at least 256 blocks free. 
If the @@ -1653,7 +1728,7 @@ xlog_grant_push_ail( xfs_lsn_t threshold_lsn; threshold_lsn = xlog_grant_push_threshold(log, need_bytes); - if (threshold_lsn == NULLCOMMITLSN || XLOG_FORCED_SHUTDOWN(log)) + if (threshold_lsn == NULLCOMMITLSN || xlog_is_shutdown(log)) return; /* @@ -1689,7 +1764,7 @@ xlog_pack_data( dp += BBSIZE; } - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + if (xfs_has_logv2(log->l_mp)) { xlog_in_core_2_t *xhdr = iclog->ic_data; for ( ; i < BTOBB(size); i++) { @@ -1726,7 +1801,7 @@ xlog_cksum( offsetof(struct xlog_rec_header, h_crc)); /* ... then for additional cycle data for v2 logs ... */ - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + if (xfs_has_logv2(log->l_mp)) { union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; int i; int xheads; @@ -1795,7 +1870,7 @@ xlog_write_iclog( * across the log IO to archieve that. */ down(&iclog->ic_sema); - if (unlikely(iclog->ic_state == XLOG_STATE_IOERROR)) { + if (xlog_is_shutdown(log)) { /* * It would seem logical to return EIO here, but we rely on * the log state machine to propagate I/O errors instead of @@ -1953,7 +2028,7 @@ xlog_sync( /* real byte length */ size = iclog->ic_offset; - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) + if (xfs_has_logv2(log->l_mp)) size += roundoff; iclog->ic_header.h_len = cpu_to_be32(size); @@ -2303,8 +2378,7 @@ xlog_write_copy_finish( int *data_cnt, int *partial_copy, int *partial_copy_len, - int log_offset, - struct xlog_in_core **commit_iclog) + int log_offset) { int error; @@ -2323,27 +2397,20 @@ xlog_write_copy_finish( *partial_copy = 0; *partial_copy_len = 0; - if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { - /* no more space in this iclog - push it. */ - spin_lock(&log->l_icloglock); - xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); - *record_cnt = 0; - *data_cnt = 0; - - if (iclog->ic_state == XLOG_STATE_ACTIVE) - xlog_state_switch_iclogs(log, iclog, 0); - else - ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || - iclog->ic_state == XLOG_STATE_IOERROR); - if (!commit_iclog) - goto release_iclog; - spin_unlock(&log->l_icloglock); - ASSERT(flags & XLOG_COMMIT_TRANS); - *commit_iclog = iclog; - } + if (iclog->ic_size - log_offset > sizeof(xlog_op_header_t)) + return 0; - return 0; + /* no more space in this iclog - push it. */ + spin_lock(&log->l_icloglock); + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + *record_cnt = 0; + *data_cnt = 0; + if (iclog->ic_state == XLOG_STATE_ACTIVE) + xlog_state_switch_iclogs(log, iclog, 0); + else + ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || + xlog_is_shutdown(log)); release_iclog: error = xlog_state_release_iclog(log, iclog, 0); spin_unlock(&log->l_icloglock); @@ -2393,10 +2460,9 @@ release_iclog: int xlog_write( struct xlog *log, + struct xfs_cil_ctx *ctx, struct xfs_log_vec *log_vector, struct xlog_ticket *ticket, - xfs_lsn_t *start_lsn, - struct xlog_in_core **commit_iclog, uint optype) { struct xlog_in_core *iclog = NULL; @@ -2426,8 +2492,6 @@ xlog_write( } len = xlog_write_calc_vec_length(ticket, log_vector, optype); - if (start_lsn) - *start_lsn = 0; while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { void *ptr; int log_offset; @@ -2440,9 +2504,15 @@ xlog_write( ASSERT(log_offset <= iclog->ic_size - 1); ptr = iclog->ic_datap + log_offset; - /* Start_lsn is the first lsn written to. 
*/ - if (start_lsn && !*start_lsn) - *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); + /* + * If we have a context pointer, pass it the first iclog we are + * writing to so it can record state needed for iclog write + * ordering. + */ + if (ctx) { + xlog_cil_set_ctx_write_state(ctx, iclog); + ctx = NULL; + } /* * This loop writes out as many regions as can fit in the amount @@ -2521,8 +2591,7 @@ xlog_write( &record_cnt, &data_cnt, &partial_copy, &partial_copy_len, - log_offset, - commit_iclog); + log_offset); if (error) return error; @@ -2560,12 +2629,7 @@ next_lv: spin_lock(&log->l_icloglock); xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - if (commit_iclog) { - ASSERT(optype & XLOG_COMMIT_TRANS); - *commit_iclog = iclog; - } else { - error = xlog_state_release_iclog(log, iclog, 0); - } + error = xlog_state_release_iclog(log, iclog, 0); spin_unlock(&log->l_icloglock); return error; @@ -2751,8 +2815,7 @@ xlog_state_set_callback( static bool xlog_state_iodone_process_iclog( struct xlog *log, - struct xlog_in_core *iclog, - bool *ioerror) + struct xlog_in_core *iclog) { xfs_lsn_t lowest_lsn; xfs_lsn_t header_lsn; @@ -2764,15 +2827,6 @@ xlog_state_iodone_process_iclog( * Skip all iclogs in the ACTIVE & DIRTY states: */ return false; - case XLOG_STATE_IOERROR: - /* - * Between marking a filesystem SHUTDOWN and stopping the log, - * we do flush all iclogs to disk (if there wasn't a log I/O - * error). So, we do want things to go smoothly in case of just - * a SHUTDOWN w/o a LOG_IO_ERROR. - */ - *ioerror = true; - return false; case XLOG_STATE_DONE_SYNC: /* * Now that we have an iclog that is in the DONE_SYNC state, do @@ -2796,72 +2850,75 @@ xlog_state_iodone_process_iclog( } } -STATIC void -xlog_state_do_callback( +/* + * Loop over all the iclogs, running attached callbacks on them. Return true if + * we ran any callbacks, indicating that we dropped the icloglock. We don't need + * to handle transient shutdown state here at all because + * xlog_state_shutdown_callbacks() will be run to do the necessary shutdown + * cleanup of the callbacks. + */ +static bool +xlog_state_do_iclog_callbacks( struct xlog *log) + __releases(&log->l_icloglock) + __acquires(&log->l_icloglock) { - struct xlog_in_core *iclog; - struct xlog_in_core *first_iclog; - bool cycled_icloglock; - bool ioerror; - int flushcnt = 0; - int repeats = 0; + struct xlog_in_core *first_iclog = log->l_iclog; + struct xlog_in_core *iclog = first_iclog; + bool ran_callback = false; - spin_lock(&log->l_icloglock); do { - /* - * Scan all iclogs starting with the one pointed to by the - * log. Reset this starting point each time the log is - * unlocked (during callbacks). - * - * Keep looping through iclogs until one full pass is made - * without running any callbacks. 
- */ - first_iclog = log->l_iclog; - iclog = log->l_iclog; - cycled_icloglock = false; - ioerror = false; - repeats++; + LIST_HEAD(cb_list); - do { - LIST_HEAD(cb_list); + if (xlog_state_iodone_process_iclog(log, iclog)) + break; + if (iclog->ic_state != XLOG_STATE_CALLBACK) { + iclog = iclog->ic_next; + continue; + } + list_splice_init(&iclog->ic_callbacks, &cb_list); + spin_unlock(&log->l_icloglock); - if (xlog_state_iodone_process_iclog(log, iclog, - &ioerror)) - break; + trace_xlog_iclog_callbacks_start(iclog, _RET_IP_); + xlog_cil_process_committed(&cb_list); + trace_xlog_iclog_callbacks_done(iclog, _RET_IP_); + ran_callback = true; - if (iclog->ic_state != XLOG_STATE_CALLBACK && - iclog->ic_state != XLOG_STATE_IOERROR) { - iclog = iclog->ic_next; - continue; - } - list_splice_init(&iclog->ic_callbacks, &cb_list); - spin_unlock(&log->l_icloglock); + spin_lock(&log->l_icloglock); + xlog_state_clean_iclog(log, iclog); + iclog = iclog->ic_next; + } while (iclog != first_iclog); + + return ran_callback; +} - trace_xlog_iclog_callbacks_start(iclog, _RET_IP_); - xlog_cil_process_committed(&cb_list); - trace_xlog_iclog_callbacks_done(iclog, _RET_IP_); - cycled_icloglock = true; - spin_lock(&log->l_icloglock); - if (XLOG_FORCED_SHUTDOWN(log)) - wake_up_all(&iclog->ic_force_wait); - else - xlog_state_clean_iclog(log, iclog); - iclog = iclog->ic_next; - } while (first_iclog != iclog); +/* + * Loop running iclog completion callbacks until there are no more iclogs in a + * state that can run callbacks. + */ +STATIC void +xlog_state_do_callback( + struct xlog *log) +{ + int flushcnt = 0; + int repeats = 0; + + spin_lock(&log->l_icloglock); + while (xlog_state_do_iclog_callbacks(log)) { + if (xlog_is_shutdown(log)) + break; - if (repeats > 5000) { + if (++repeats > 5000) { flushcnt += repeats; repeats = 0; xfs_warn(log->l_mp, "%s: possible infinite loop (%d iterations)", __func__, flushcnt); } - } while (!ioerror && cycled_icloglock); + } - if (log->l_iclog->ic_state == XLOG_STATE_ACTIVE || - log->l_iclog->ic_state == XLOG_STATE_IOERROR) + if (log->l_iclog->ic_state == XLOG_STATE_ACTIVE) wake_up_all(&log->l_flush_wait); spin_unlock(&log->l_icloglock); @@ -2871,13 +2928,6 @@ xlog_state_do_callback( /* * Finish transitioning this iclog to the dirty state. * - * Make sure that we completely execute this routine only when this is - * the last call to the iclog. There is a good chance that iclog flushes, - * when we reach the end of the physical log, get turned into 2 separate - * calls to bwrite. Hence, one iclog flush could generate two calls to this - * routine. By using the reference count bwritecnt, we guarantee that only - * the second completion goes through. - * * Callbacks could take time, so they are done outside the scope of the * global state machine log lock. */ @@ -2896,7 +2946,7 @@ xlog_state_done_syncing( * split log writes, on the second, we shut down the file system and * no iclogs should ever be attempted to be written to disk again. */ - if (!XLOG_FORCED_SHUTDOWN(log)) { + if (!xlog_is_shutdown(log)) { ASSERT(iclog->ic_state == XLOG_STATE_SYNCING); iclog->ic_state = XLOG_STATE_DONE_SYNC; } @@ -2944,7 +2994,7 @@ xlog_state_get_iclog_space( restart: spin_lock(&log->l_icloglock); - if (XLOG_FORCED_SHUTDOWN(log)) { + if (xlog_is_shutdown(log)) { spin_unlock(&log->l_icloglock); return -EIO; } @@ -3122,7 +3172,7 @@ xfs_log_ticket_ungrant( * This routine will mark the current iclog in the ring as WANT_SYNC and move * the current iclog pointer to the next iclog in the ring. 
*/ -STATIC void +void xlog_state_switch_iclogs( struct xlog *log, struct xlog_in_core *iclog, @@ -3237,10 +3287,10 @@ xfs_log_force( xlog_cil_force(log); spin_lock(&log->l_icloglock); - iclog = log->l_iclog; - if (iclog->ic_state == XLOG_STATE_IOERROR) + if (xlog_is_shutdown(log)) goto out_error; + iclog = log->l_iclog; trace_xlog_iclog_force(iclog, _RET_IP_); if (iclog->ic_state == XLOG_STATE_DIRTY || @@ -3294,6 +3344,20 @@ out_error: return -EIO; } +/* + * Force the log to a specific LSN. + * + * If an iclog with that lsn can be found: + * If it is in the DIRTY state, just return. + * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC + * state and go to sleep or return. + * If it is in any other state, go to sleep or return. + * + * Synchronous forces are implemented with a wait queue. All callers trying + * to force a given lsn to disk must wait on the queue attached to the + * specific in-core log. When given in-core log finally completes its write + * to disk, that thread will wake up all threads waiting on the queue. + */ static int xlog_force_lsn( struct xlog *log, @@ -3306,10 +3370,10 @@ xlog_force_lsn( bool completed; spin_lock(&log->l_icloglock); - iclog = log->l_iclog; - if (iclog->ic_state == XLOG_STATE_IOERROR) + if (xlog_is_shutdown(log)) goto out_error; + iclog = log->l_iclog; while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { trace_xlog_iclog_force_lsn(iclog, _RET_IP_); iclog = iclog->ic_next; @@ -3379,18 +3443,13 @@ out_error: } /* - * Force the in-core log to disk for a specific LSN. - * - * Find in-core log with lsn. - * If it is in the DIRTY state, just return. - * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC - * state and go to sleep or return. - * If it is in any other state, go to sleep or return. + * Force the log to a specific checkpoint sequence. * - * Synchronous forces are implemented with a wait queue. All callers trying - * to force a given lsn to disk must wait on the queue attached to the - * specific in-core log. When given in-core log finally completes its write - * to disk, that thread will wake up all threads waiting on the queue. + * First force the CIL so that all the required changes have been flushed to the + * iclogs. If the CIL force completed it will return a commit LSN that indicates + * the iclog that needs to be flushed to stable storage. If the caller needs + * a synchronous log force, we will wait on the iclog with the LSN returned by + * xlog_cil_force_seq() to be completed. */ int xfs_log_force_seq( @@ -3619,17 +3678,15 @@ xlog_verify_grant_tail( xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); if (tail_cycle != cycle) { if (cycle - 1 != tail_cycle && - !(log->l_flags & XLOG_TAIL_WARN)) { + !test_and_set_bit(XLOG_TAIL_WARN, &log->l_opstate)) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "%s: cycle - 1 != tail_cycle", __func__); - log->l_flags |= XLOG_TAIL_WARN; } if (space > BBTOB(tail_blocks) && - !(log->l_flags & XLOG_TAIL_WARN)) { + !test_and_set_bit(XLOG_TAIL_WARN, &log->l_opstate)) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "%s: space > BBTOB(tail_blocks)", __func__); - log->l_flags |= XLOG_TAIL_WARN; } } } @@ -3765,105 +3822,66 @@ xlog_verify_iclog( #endif /* - * Mark all iclogs IOERROR. l_icloglock is held by the caller. - */ -STATIC int -xlog_state_ioerror( - struct xlog *log) -{ - xlog_in_core_t *iclog, *ic; - - iclog = log->l_iclog; - if (iclog->ic_state != XLOG_STATE_IOERROR) { - /* - * Mark all the incore logs IOERROR. - * From now on, no log flushes will result. 
- */ - ic = iclog; - do { - ic->ic_state = XLOG_STATE_IOERROR; - ic = ic->ic_next; - } while (ic != iclog); - return 0; - } - /* - * Return non-zero, if state transition has already happened. - */ - return 1; -} - -/* - * This is called from xfs_force_shutdown, when we're forcibly - * shutting down the filesystem, typically because of an IO error. + * Perform a forced shutdown on the log. This should be called once and once + * only by the high level filesystem shutdown code to shut the log subsystem + * down cleanly. + * * Our main objectives here are to make sure that: - * a. if !logerror, flush the logs to disk. Anything modified - * after this is ignored. - * b. the filesystem gets marked 'SHUTDOWN' for all interested - * parties to find out, 'atomically'. - * c. those who're sleeping on log reservations, pinned objects and - * other resources get woken up, and be told the bad news. - * d. nothing new gets queued up after (b) and (c) are done. + * a. if the shutdown was not due to a log IO error, flush the logs to + * disk. Anything modified after this is ignored. + * b. the log gets atomically marked 'XLOG_IO_ERROR' for all interested + * parties to find out. Nothing new gets queued after this is done. + * c. Tasks sleeping on log reservations, pinned objects and + * other resources get woken up. * - * Note: for the !logerror case we need to flush the regions held in memory out - * to disk first. This needs to be done before the log is marked as shutdown, - * otherwise the iclog writes will fail. + * Return true if the shutdown cause was a log IO error and we actually shut the + * log down. */ -int -xfs_log_force_umount( - struct xfs_mount *mp, - int logerror) +bool +xlog_force_shutdown( + struct xlog *log, + int shutdown_flags) { - struct xlog *log; - int retval; - - log = mp->m_log; + bool log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR); /* - * If this happens during log recovery, don't worry about - * locking; the log isn't open for business yet. + * If this happens during log recovery then we aren't using the runtime + * log mechanisms yet so there's nothing to shut down. */ - if (!log || - log->l_flags & XLOG_ACTIVE_RECOVERY) { - mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; - if (mp->m_sb_bp) - mp->m_sb_bp->b_flags |= XBF_DONE; - return 0; - } + if (!log || xlog_in_recovery(log)) + return false; - /* - * Somebody could've already done the hard work for us. - * No need to get locks for this. - */ - if (logerror && log->l_iclog->ic_state == XLOG_STATE_IOERROR) { - ASSERT(XLOG_FORCED_SHUTDOWN(log)); - return 1; - } + ASSERT(!xlog_is_shutdown(log)); /* * Flush all the completed transactions to disk before marking the log - * being shut down. We need to do it in this order to ensure that - * completed operations are safely on disk before we shut down, and that - * we don't have to issue any buffer IO after the shutdown flags are set - * to guarantee this. + * being shut down. We need to do this first as shutting down the log + * before the force will prevent the log force from flushing the iclogs + * to disk. + * + * Re-entry due to a log IO error shutdown during the log force is + * prevented by the atomicity of higher level shutdown code. */ - if (!logerror) - xfs_log_force(mp, XFS_LOG_SYNC); + if (!log_error) + xfs_log_force(log->l_mp, XFS_LOG_SYNC); /* - * mark the filesystem and the as in a shutdown state and wake - * everybody up to tell them the bad news. + * Atomically set the shutdown state. 
If the shutdown state is already + * set, there someone else is performing the shutdown and so we are done + * here. This should never happen because we should only ever get called + * once by the first shutdown caller. + * + * Much of the log state machine transitions assume that shutdown state + * cannot change once they hold the log->l_icloglock. Hence we need to + * hold that lock here, even though we use the atomic test_and_set_bit() + * operation to set the shutdown state. */ spin_lock(&log->l_icloglock); - mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; - if (mp->m_sb_bp) - mp->m_sb_bp->b_flags |= XBF_DONE; - - /* - * Mark the log and the iclogs with IO error flags to prevent any - * further log IO from being issued or completed. - */ - log->l_flags |= XLOG_IO_ERROR; - retval = xlog_state_ioerror(log); + if (test_and_set_bit(XLOG_IO_ERROR, &log->l_opstate)) { + spin_unlock(&log->l_icloglock); + ASSERT(0); + return false; + } spin_unlock(&log->l_icloglock); /* @@ -3883,12 +3901,12 @@ xfs_log_force_umount( * avoid races. */ spin_lock(&log->l_cilp->xc_push_lock); + wake_up_all(&log->l_cilp->xc_start_wait); wake_up_all(&log->l_cilp->xc_commit_wait); spin_unlock(&log->l_cilp->xc_push_lock); - xlog_state_do_callback(log); + xlog_state_shutdown_callbacks(log); - /* return non-zero if log IOERROR transition had already happened */ - return retval; + return log_error; } STATIC int @@ -3926,7 +3944,7 @@ xfs_log_check_lsn( * resets the in-core LSN. We can't validate in this mode, but * modifications are not allowed anyways so just return true. */ - if (mp->m_flags & XFS_MOUNT_NORECOVERY) + if (xfs_has_norecovery(mp)) return true; /* @@ -3952,11 +3970,22 @@ xfs_log_check_lsn( return valid; } -bool -xfs_log_in_recovery( - struct xfs_mount *mp) +/* + * Notify the log that we're about to start using a feature that is protected + * by a log incompat feature flag. This will prevent log covering from + * clearing those flags. + */ +void +xlog_use_incompat_feat( + struct xlog *log) { - struct xlog *log = mp->m_log; + down_read(&log->l_incompat_users); +} - return log->l_flags & XLOG_ACTIVE_RECOVERY; +/* Notify the log that we've finished using log incompat features. */ +void +xlog_drop_incompat_feat( + struct xlog *log) +{ + up_read(&log->l_incompat_users); } |
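
Besides the shutdown rework, the diff adds a reader/writer gate for log-incompat feature flags: `init_rwsem(&log->l_incompat_users)` in `xlog_alloc_log()`, `xlog_use_incompat_feat()`/`xlog_drop_incompat_feat()` taking the rwsem for read while a feature is in use, and `xlog_clear_incompat()` clearing the superblock flags only when `down_write_trylock()` succeeds during log covering. Below is a rough userspace analogue of that trylock-to-clear pattern, assuming a pthread rwlock and a plain boolean in place of the on-disk feature bits; names are illustrative stand-ins, not the kernel API.

```c
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for log->l_incompat_users and the sb feature bits. */
static pthread_rwlock_t incompat_users = PTHREAD_RWLOCK_INITIALIZER;
static bool incompat_flags_set = true;

/* Analogue of xlog_use_incompat_feat(): pin the flags while a user is active. */
static void use_incompat_feat(void)
{
	pthread_rwlock_rdlock(&incompat_users);
}

/* Analogue of xlog_drop_incompat_feat(). */
static void drop_incompat_feat(void)
{
	pthread_rwlock_unlock(&incompat_users);
}

/*
 * Analogue of xlog_clear_incompat(): clear the flags only if no one is
 * currently relying on them, i.e. the write lock can be taken without
 * blocking; otherwise leave them set and retry on a later covering cycle.
 */
static void clear_incompat(void)
{
	if (pthread_rwlock_trywrlock(&incompat_users) != 0)
		return;
	incompat_flags_set = false;
	pthread_rwlock_unlock(&incompat_users);
}

int main(void)
{
	use_incompat_feat();
	clear_incompat();		/* skipped: a reader still holds the rwlock */
	printf("flags set: %d\n", incompat_flags_set);	/* 1 */
	drop_incompat_feat();

	clear_incompat();		/* succeeds now that the last user is gone */
	printf("flags set: %d\n", incompat_flags_set);	/* 0 */
	return 0;
}
```

The design point matches the comment in `xlog_clear_incompat()`: clearing the flags is opportunistic, and it is harmless to carry a higher protection level than the log contents require if a reader still holds the semaphore.
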