Commit 110b9384 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'for-linus' of git://oss.sgi.com/xfs/xfs

* 'for-linus' of git://oss.sgi.com/xfs/xfs:
  xfs: Ensure inode allocation buffers are fully replayed
  xfs: enable background pushing of the CIL
  xfs: forced unmounts need to push the CIL
  xfs: Introduce delayed logging core code
  xfs: Delayed logging design documentation
  xfs: Improve scalability of busy extent tracking
  xfs: make the log ticket ID available outside the log infrastructure
  xfs: clean up log ticket overrun debug output
  xfs: Clean up XFS_BLI_* flag namespace
  xfs: modify buffer item reference counting
  xfs: allow log ticket allocation to take allocation flags
  xfs: Don't reuse the same transaction ID for duplicated transactions.
parents 4961ab93 88e88374
This diff is collapsed.
......@@ -77,6 +77,7 @@ xfs-y += xfs_alloc.o \
xfs_itable.o \
xfs_dfrag.o \
xfs_log.o \
xfs_log_cil.o \
xfs_log_recover.o \
xfs_mount.o \
xfs_mru_cache.o \
......
......@@ -37,6 +37,7 @@
#include "xfs_sb.h"
#include "xfs_inum.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
......@@ -850,6 +851,12 @@ xfs_buf_lock_value(
* Note that this in no way locks the underlying pages, so it is only
* useful for synchronizing concurrent use of buffer objects, not for
* synchronizing independent access to the underlying pages.
*
* If we come across a stale, pinned, locked buffer, we know that we
* are being asked to lock a buffer that has been reallocated. Because
* it is pinned, we know that the log has not been pushed to disk and
* hence it will still be locked. Rather than sleeping until someone
* else pushes the log, push it ourselves before trying to get the lock.
*/
void
xfs_buf_lock(
......@@ -857,6 +864,8 @@ xfs_buf_lock(
{
trace_xfs_buf_lock(bp, _RET_IP_);
if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
xfs_log_force(bp->b_mount, 0);
if (atomic_read(&bp->b_io_remaining))
blk_run_address_space(bp->b_target->bt_mapping);
down(&bp->b_sema);
......
......@@ -19,6 +19,7 @@
#include "xfs_dmapi.h"
#include "xfs_sb.h"
#include "xfs_inum.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_quota.h"
......
......@@ -119,6 +119,8 @@ mempool_t *xfs_ioend_pool;
#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */
#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */
#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */
#define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */
#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */
/*
* Table driven mount option parser.
......@@ -374,6 +376,13 @@ xfs_parseargs(
mp->m_flags |= XFS_MOUNT_DMAPI;
} else if (!strcmp(this_char, MNTOPT_DMI)) {
mp->m_flags |= XFS_MOUNT_DMAPI;
} else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
mp->m_flags |= XFS_MOUNT_DELAYLOG;
cmn_err(CE_WARN,
"Enabling EXPERIMENTAL delayed logging feature "
"- use at your own risk.\n");
} else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
} else if (!strcmp(this_char, "ihashsize")) {
cmn_err(CE_WARN,
"XFS: ihashsize no longer used, option is deprecated.");
......@@ -535,6 +544,7 @@ xfs_showargs(
{ XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
{ XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI },
{ XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
{ XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
{ 0, NULL }
};
static struct proc_xfs_info xfs_info_unset[] = {
......@@ -1755,7 +1765,7 @@ xfs_init_zones(void)
* but it is much faster.
*/
xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) +
(((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) /
(((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
NBWORD) * sizeof(int))), "xfs_buf_item");
if (!xfs_buf_item_zone)
goto out_destroy_trans_zone;
......
......@@ -1059,83 +1059,112 @@ TRACE_EVENT(xfs_bunmap,
);
#define XFS_BUSY_SYNC \
{ 0, "async" }, \
{ 1, "sync" }
TRACE_EVENT(xfs_alloc_busy,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
xfs_extlen_t len, int slot),
TP_ARGS(mp, agno, agbno, len, slot),
TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
xfs_agblock_t agbno, xfs_extlen_t len, int sync),
TP_ARGS(trans, agno, agbno, len, sync),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(struct xfs_trans *, tp)
__field(int, tid)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, len)
__field(int, slot)
__field(int, sync)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->dev = trans->t_mountp->m_super->s_dev;
__entry->tp = trans;
__entry->tid = trans->t_ticket->t_tid;
__entry->agno = agno;
__entry->agbno = agbno;
__entry->len = len;
__entry->slot = slot;
__entry->sync = sync;
),
TP_printk("dev %d:%d agno %u agbno %u len %u slot %d",
TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->tp,
__entry->tid,
__entry->agno,
__entry->agbno,
__entry->len,
__entry->slot)
__print_symbolic(__entry->sync, XFS_BUSY_SYNC))
);
#define XFS_BUSY_STATES \
{ 0, "found" }, \
{ 1, "missing" }
TRACE_EVENT(xfs_alloc_unbusy,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
int slot, int found),
TP_ARGS(mp, agno, slot, found),
xfs_agblock_t agbno, xfs_extlen_t len),
TP_ARGS(mp, agno, agbno, len),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
__field(int, slot)
__field(int, found)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, len)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
__entry->slot = slot;
__entry->found = found;
__entry->agbno = agbno;
__entry->len = len;
),
TP_printk("dev %d:%d agno %u slot %d %s",
TP_printk("dev %d:%d agno %u agbno %u len %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->slot,
__print_symbolic(__entry->found, XFS_BUSY_STATES))
__entry->agbno,
__entry->len)
);
#define XFS_BUSY_STATES \
{ 0, "missing" }, \
{ 1, "found" }
TRACE_EVENT(xfs_alloc_busysearch,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
xfs_extlen_t len, xfs_lsn_t lsn),
TP_ARGS(mp, agno, agbno, len, lsn),
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agblock_t agbno, xfs_extlen_t len, int found),
TP_ARGS(mp, agno, agbno, len, found),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, len)
__field(xfs_lsn_t, lsn)
__field(int, found)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
__entry->agbno = agbno;
__entry->len = len;
__entry->lsn = lsn;
__entry->found = found;
),
TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx",
TP_printk("dev %d:%d agno %u agbno %u len %u %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
__entry->len,
__print_symbolic(__entry->found, XFS_BUSY_STATES))
);
TRACE_EVENT(xfs_trans_commit_lsn,
TP_PROTO(struct xfs_trans *trans),
TP_ARGS(trans),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(struct xfs_trans *, tp)
__field(xfs_lsn_t, lsn)
),
TP_fast_assign(
__entry->dev = trans->t_mountp->m_super->s_dev;
__entry->tp = trans;
__entry->lsn = trans->t_commit_lsn;
),
TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->tp,
__entry->lsn)
);
......
......@@ -344,9 +344,9 @@ xfs_qm_init_dquot_blk(
for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++)
xfs_qm_dqinit_core(curid, type, d);
xfs_trans_dquot_buf(tp, bp,
(type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF :
((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF :
XFS_BLI_GDQUOT_BUF)));
(type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
XFS_BLF_GDQUOT_BUF)));
xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
}
......
......@@ -175,14 +175,20 @@ typedef struct xfs_agfl {
} xfs_agfl_t;
/*
* Busy block/extent entry. Used in perag to mark blocks that have been freed
* but whose transactions aren't committed to disk yet.
* Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
* have been freed but whose transactions aren't committed to disk yet.
*
* Note that we use the transaction ID to record the transaction, not the
* transaction structure itself. See xfs_alloc_busy_insert() for details.
*/
typedef struct xfs_perag_busy {
xfs_agblock_t busy_start;
xfs_extlen_t busy_length;
struct xfs_trans *busy_tp; /* transaction that did the free */
} xfs_perag_busy_t;
struct xfs_busy_extent {
struct rb_node rb_node; /* ag by-bno indexed search tree */
struct list_head list; /* transaction busy extent list */
xfs_agnumber_t agno;
xfs_agblock_t bno;
xfs_extlen_t length;
xlog_tid_t tid; /* transaction that created this */
};
/*
* Per-ag incore structure, copies of information in agf and agi,
......@@ -216,7 +222,8 @@ typedef struct xfs_perag {
xfs_agino_t pagl_leftrec;
xfs_agino_t pagl_rightrec;
#ifdef __KERNEL__
spinlock_t pagb_lock; /* lock for pagb_list */
spinlock_t pagb_lock; /* lock for pagb_tree */
struct rb_root pagb_tree; /* ordered tree of busy extents */
atomic_t pagf_fstrms; /* # of filestreams active in this AG */
......@@ -226,7 +233,6 @@ typedef struct xfs_perag {
int pag_ici_reclaimable; /* reclaimable inodes */
#endif
int pagb_count; /* pagb slots in use */
xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */
} xfs_perag_t;
/*
......
......@@ -46,11 +46,9 @@
#define XFSA_FIXUP_BNO_OK 1
#define XFSA_FIXUP_CNT_OK 2
STATIC void
xfs_alloc_search_busy(xfs_trans_t *tp,
xfs_agnumber_t agno,
xfs_agblock_t bno,
xfs_extlen_t len);
static int
xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agblock_t bno, xfs_extlen_t len);
/*
* Prototypes for per-ag allocation routines
......@@ -540,9 +538,16 @@ xfs_alloc_ag_vextent(
be32_to_cpu(agf->agf_length));
xfs_alloc_log_agf(args->tp, args->agbp,
XFS_AGF_FREEBLKS);
/* search the busylist for these blocks */
xfs_alloc_search_busy(args->tp, args->agno,
args->agbno, args->len);
/*
* Search the busylist for these blocks and mark the
* transaction as synchronous if blocks are found. This
* avoids the need to block due to a synchronous log
* force to ensure correct ordering as the synchronous
* transaction will guarantee that for us.
*/
if (xfs_alloc_busy_search(args->mp, args->agno,
args->agbno, args->len))
xfs_trans_set_sync(args->tp);
}
if (!args->isfl)
xfs_trans_mod_sb(args->tp,
......@@ -1693,7 +1698,7 @@ xfs_free_ag_extent(
* when the iclog commits to disk. If a busy block is allocated,
* the iclog is pushed up to the LSN that freed the block.
*/
xfs_alloc_mark_busy(tp, agno, bno, len);
xfs_alloc_busy_insert(tp, agno, bno, len);
return 0;
error0:
......@@ -1989,14 +1994,20 @@ xfs_alloc_get_freelist(
*bnop = bno;
/*
* As blocks are freed, they are added to the per-ag busy list
* and remain there until the freeing transaction is committed to
* disk. Now that we have allocated blocks, this list must be
* searched to see if a block is being reused. If one is, then
* the freeing transaction must be pushed to disk NOW by forcing
* to disk all iclogs up that transaction's LSN.
* As blocks are freed, they are added to the per-ag busy list and
* remain there until the freeing transaction is committed to disk.
* Now that we have allocated blocks, this list must be searched to see
* if a block is being reused. If one is, then the freeing transaction
* must be pushed to disk before this transaction.
*
* We do this by setting the current transaction to a sync transaction
* which guarantees that the freeing transaction is on disk before this
* transaction. This is done instead of a synchronous log force here so
* that we don't sit and wait with the AGF locked in the transaction
* during the log force.
*/
xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1);
if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
xfs_trans_set_sync(tp);
return 0;
}
......@@ -2201,7 +2212,7 @@ xfs_alloc_read_agf(
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
spin_lock_init(&pag->pagb_lock);
pag->pagb_count = 0;
memset(pag->pagb_list, 0, sizeof(pag->pagb_list));
pag->pagb_tree = RB_ROOT;
pag->pagf_init = 1;
}
#ifdef DEBUG
......@@ -2479,127 +2490,263 @@ error0:
* list is reused, the transaction that freed it must be forced to disk
* before continuing to use the block.
*
* xfs_alloc_mark_busy - add to the per-ag busy list
* xfs_alloc_clear_busy - remove an item from the per-ag busy list
* xfs_alloc_busy_insert - add to the per-ag busy list
* xfs_alloc_busy_clear - remove an item from the per-ag busy list
* xfs_alloc_busy_search - search for a busy extent
*/
/*
* Insert a new extent into the busy tree.
*
* The busy extent tree is indexed by the start block of the busy extent.
* there can be multiple overlapping ranges in the busy extent tree but only
* ever one entry at a given start block. The reason for this is that
* multi-block extents can be freed, then smaller chunks of that extent
* allocated and freed again before the first transaction commit is on disk.
* If the exact same start block is freed a second time, we have to wait for
* that busy extent to pass out of the tree before the new extent is inserted.
* There are two main cases we have to handle here.
*
* The first case is a transaction that triggers a "free - allocate - free"
* cycle. This can occur during btree manipulations as a btree block is freed
* to the freelist, then allocated from the free list, then freed again. In
* this case, the second extxpnet free is what triggers the duplicate and as
* such the transaction IDs should match. Because the extent was allocated in
* this transaction, the transaction must be marked as synchronous. This is
* true for all cases where the free/alloc/free occurs in the one transaction,
* hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
* This serves to catch violations of the second case quite effectively.
*
* The second case is where the free/alloc/free occur in different
* transactions. In this case, the thread freeing the extent the second time
* can't mark the extent busy immediately because it is already tracked in a
* transaction that may be committing. When the log commit for the existing
* busy extent completes, the busy extent will be removed from the tree. If we
* allow the second busy insert to continue using that busy extent structure,
* it can be freed before this transaction is safely in the log. Hence our
* only option in this case is to force the log to remove the existing busy
* extent from the list before we insert the new one with the current
* transaction ID.
*
* The problem we are trying to avoid in the free-alloc-free in separate
* transactions is most easily described with a timeline:
*
* Thread 1 Thread 2 Thread 3 xfslogd
* xact alloc
* free X
* mark busy
* commit xact
* free xact
* xact alloc
* alloc X
* busy search
* mark xact sync
* commit xact
* free xact
* force log
* checkpoint starts
* ....
* xact alloc
* free X
* mark busy
* finds match
* *** KABOOM! ***
* ....
* log IO completes
* unbusy X
* checkpoint completes
*
* By issuing a log force in thread 3 @ "KABOOM", the thread will block until
* the checkpoint completes, and the busy extent it matched will have been
* removed from the tree when it is woken. Hence it can then continue safely.
*
* However, to ensure this matching process is robust, we need to use the
* transaction ID for identifying transaction, as delayed logging results in
* the busy extent and transaction lifecycles being different. i.e. the busy
* extent is active for a lot longer than the transaction. Hence the
* transaction structure can be freed and reallocated, then mark the same
* extent busy again in the new transaction. In this case the new transaction
* will have a different tid but can have the same address, and hence we need
* to check against the tid.
*
* Future: for delayed logging, we could avoid the log force if the extent was
* first freed in the current checkpoint sequence. This, however, requires the
* ability to pin the current checkpoint in memory until this transaction
* commits to ensure that both the original free and the current one combine
* logically into the one checkpoint. If the checkpoint sequences are
* different, however, we still need to wait on a log force.
*/
void
xfs_alloc_mark_busy(xfs_trans_t *tp,
xfs_agnumber_t agno,
xfs_agblock_t bno,
xfs_extlen_t len)
xfs_alloc_busy_insert(
struct xfs_trans *tp,
xfs_agnumber_t agno,
xfs_agblock_t bno,
xfs_extlen_t len)
{
xfs_perag_busy_t *bsy;
struct xfs_busy_extent *new;
struct xfs_busy_extent *busyp;
struct xfs_perag *pag;
int n;
struct rb_node **rbp;
struct rb_node *parent;
int match;
pag = xfs_perag_get(tp->t_mountp, agno);
spin_lock(&pag->pagb_lock);
/* search pagb_list for an open slot */
for (bsy = pag->pagb_list, n = 0;
n < XFS_PAGB_NUM_SLOTS;
bsy++, n++) {
if (bsy->busy_tp == NULL) {
break;
}
new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
if (!new) {
/*
* No Memory! Since it is now not possible to track the free
* block, make this a synchronous transaction to insure that
* the block is not reused before this transaction commits.
*/
trace_xfs_alloc_busy(tp, agno, bno, len, 1);
xfs_trans_set_sync(tp);
return;
}
trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n);
new->agno = agno;
new->bno = bno;
new->length = len;
new->tid = xfs_log_get_trans_ident(tp);
if (n < XFS_PAGB_NUM_SLOTS) {
bsy = &pag->pagb_list[n];
pag->pagb_count++;
bsy->busy_start = bno;
bsy->busy_length = len;
bsy->busy_tp = tp;
xfs_trans_add_busy(tp, agno, n);
} else {
INIT_LIST_HEAD(&new->list);
/* trace before insert to be able to see failed inserts */
trace_xfs_alloc_busy(tp, agno, bno, len, 0);
pag = xfs_perag_get(tp->t_mountp, new->agno);
restart:
spin_lock(&pag->pagb_lock);
rbp = &pag->pagb_tree.rb_node;
parent = NULL;
busyp = NULL;
match = 0;
while (*rbp && match >= 0) {
parent = *rbp;
busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
if (new->bno < busyp->bno) {
/* may overlap, but exact start block is lower */
rbp = &(*rbp)->rb_left;
if (new->bno + new->length > busyp->bno)
match = busyp->tid == new->tid ? 1 : -1;
} else if (new->bno > busyp->bno) {
/* may overlap, but exact start block is higher */
rbp = &(*rbp)->rb_right;
if (bno < busyp->bno + busyp->length)
match = busyp->tid == new->tid ? 1 : -1;
} else {
match = busyp->tid == new->tid ? 1 : -1;
break;
}
}
if (match < 0) {
/* overlap marked busy in different transaction */
spin_unlock(&pag->pagb_lock);
xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
goto restart;
}
if (match > 0) {
/*
* The busy list is full! Since it is now not possible to
* track the free block, make this a synchronous transaction
* to insure that the block is not reused before this
* transaction commits.
* overlap marked busy in same transaction. Update if exact
* start block match, otherwise combine the busy extents into
* a single range.
*/
xfs_trans_set_sync(tp);
}
if (busyp->bno == new->bno) {
busyp->length = max(busyp->length, new->length);
spin_unlock(&pag->pagb_lock);
ASSERT(tp->t_flags & XFS_TRANS_SYNC);
xfs_perag_put(pag);
kmem_free(new);
return;
}
rb_erase(&busyp->rb_node, &pag->pagb_tree);
new->length = max(busyp->bno + busyp->length,
new->bno + new->length) -
min(busyp->bno, new->bno);
new->bno = min(busyp->bno, new->bno);
} else
busyp = NULL;
rb_link_node(&new->rb_node, parent, rbp);
rb_insert_color(&new->rb_node, &pag->pagb_tree);
list_add(&new->list, &tp->t_busy);
spin_unlock(&pag->pagb_lock);
xfs_perag_put(pag);
kmem_free(busyp);
}
void
xfs_alloc_clear_busy(xfs_trans_t *tp,
xfs_agnumber_t agno,
int idx)
/*
* Search for a busy extent within the range of the extent we are about to
* allocate. You need to be holding the busy extent tree lock when calling
* xfs_alloc_busy_search(). This function returns 0 for no overlapping busy
* extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
* match. This is done so that a non-zero return indicates an overlap that
* will require a synchronous transaction, but it can still be
* used to distinguish between a partial or exact match.
*/
static int
xfs_alloc_busy_search(
struct xfs_mount *mp,
xfs_agnumber_t agno,
xfs_agblock_t bno,
xfs_extlen_t len)
{
struct xfs_perag *pag;
xfs_perag_busy_t *list;
struct rb_node *rbp;
struct xfs_busy_extent *busyp;
int match = 0;