/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes.  ie: data writeback.  Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002	Andrew Morton
 *		Split out of fs/inode.c
 *		Additions for address_space-based writeback
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include "internal.h"

#define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)

/*
 * We don't actually have pdflush, but this one is exported through /proc...
 */
int nr_pdflush_threads;

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */
struct wb_writeback_args {
	long nr_pages;
	struct super_block *sb;
	enum writeback_sync_modes sync_mode;
	int for_kupdate;
	int range_cyclic;
};

/*
 * Work items for the bdi_writeback threads
 */
struct bdi_work {
	struct list_head list;
	struct rcu_head rcu_head;

	unsigned long seen;
	atomic_t pending;

	struct wb_writeback_args args;

	unsigned long state;
};

enum {
	WS_USED_B = 0,
	WS_ONSTACK_B,
};

#define WS_USED (1 << WS_USED_B)
#define WS_ONSTACK (1 << WS_ONSTACK_B)

static inline bool bdi_work_on_stack(struct bdi_work *work)
{
	return test_bit(WS_ONSTACK_B, &work->state);
}

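/*
 * Initialize a bdi_work item from the writeback_control describing the
 * request. The item starts out in the WS_USED state; the caller sets
 * WS_ONSTACK separately when the item lives on its stack.
 */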
static inline void bdi_work_init(struct bdi_work *work,
				 struct writeback_control *wbc)
{
	INIT_RCU_HEAD(&work->rcu_head);
	work->args.sb = wbc->sb;
	work->args.nr_pages = wbc->nr_to_write;
	work->args.sync_mode = wbc->sync_mode;
	work->args.range_cyclic = wbc->range_cyclic;
	work->args.for_kupdate = 0;
	work->state = WS_USED;
}

/**
 * writeback_in_progress - determine whether there is writeback in progress
 * @bdi: the device's backing_dev_info structure.
 *
 * Determine whether there is writeback waiting to be handled against a
 * backing device.
 */
int writeback_in_progress(struct backing_dev_info *bdi)
{
	return !list_empty(&bdi->work_list);
}

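/*
 * Mark a bdi_work item as no longer used and wake up anyone sleeping in
 * bdi_wait_on_work_clear() on that bit.
 */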
static void bdi_work_clear(struct bdi_work *work)
{
	clear_bit(WS_USED_B, &work->state);
	smp_mb__after_clear_bit();
	wake_up_bit(&work->state, WS_USED_B);
}

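/*
 * RCU callback: free a dynamically allocated work item, or just clear the
 * used bit for on-stack items so the waiting submitter can proceed.
 */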
static void bdi_work_free(struct rcu_head *head)
{
	struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);

	if (!bdi_work_on_stack(work))
		kfree(work);
	else
		bdi_work_clear(work);
}

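/*
 * Finish a work item on behalf of the writeback thread. The clear/free is
 * deferred via RCU where needed, as explained in the comment below.
 */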
static void wb_work_complete(struct bdi_work *work)
{
	const enum writeback_sync_modes sync_mode = work->args.sync_mode;

	/*
	 * For allocated work, we can clear the done/seen bit right here.
	 * For on-stack work, we need to postpone both the clear and free
	 * to after the RCU grace period, since the stack could be invalidated
	 * as soon as bdi_work_clear() has done the wakeup.
	 */
	if (!bdi_work_on_stack(work))
		bdi_work_clear(work);
	if (sync_mode == WB_SYNC_NONE || bdi_work_on_stack(work))
		call_rcu(&work->rcu_head, bdi_work_free);
}

static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
{
	/*
	 * The caller has retrieved the work arguments from this work, so
	 * drop our reference. If this is the last ref, delete and free it.
	 */
	if (atomic_dec_and_test(&work->pending)) {
		struct backing_dev_info *bdi = wb->bdi;

		spin_lock(&bdi->wb_lock);
		list_del_rcu(&work->list);
		spin_unlock(&bdi->wb_lock);

		wb_work_complete(work);
	}
}

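/*
 * Publish a work item on the bdi's work list and kick a thread to handle
 * it. Each registered writeback thread gets a bit in ->seen and a
 * reference in ->pending, so the item stays queued until each of them has
 * processed it.
 */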
static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
{
	work->seen = bdi->wb_mask;
	BUG_ON(!work->seen);
	atomic_set(&work->pending, bdi->wb_cnt);
	BUG_ON(!bdi->wb_cnt);

	/*
	 * Make sure stores are seen before it appears on the list
	 */
	smp_mb();

	spin_lock(&bdi->wb_lock);
	list_add_tail_rcu(&work->list, &bdi->work_list);
	spin_unlock(&bdi->wb_lock);

	/*
	 * If the default thread isn't there, make sure we add it. When
	 * it gets created and wakes up, we'll run this work.
	 */
	if (unlikely(list_empty_careful(&bdi->wb_list)))
		wake_up_process(default_backing_dev_info.wb.task);
	else {
		struct bdi_writeback *wb = &bdi->wb;

		/*
		 * End work now if this wb has no dirty IO pending. Otherwise
		 * wakeup the handling thread
		 */
		if (!wb_has_dirty_io(wb))
			wb_clear_pending(wb, work);
		else if (wb->task)
			wake_up_process(wb->task);
	}
}

/*
 * Used for on-stack allocated work items. The caller needs to wait until
 * the wb threads have acked the work before it's safe to continue.
 */
static void bdi_wait_on_work_clear(struct bdi_work *work)
{
	wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
		    TASK_UNINTERRUPTIBLE);
}

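/*
 * Queue writeback work without blocking. This is only used for
 * WB_SYNC_NONE requests, so if the allocation fails we can simply fall
 * back to waking the flusher thread for its periodic old-data writeback.
 */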
static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
				 struct writeback_control *wbc)
{
	struct bdi_work *work;

	/*
	 * This is WB_SYNC_NONE writeback, so if allocation fails just
	 * wakeup the thread for old dirty data writeback
	 */
	work = kmalloc(sizeof(*work), GFP_ATOMIC);
	if (work) {
		bdi_work_init(work, wbc);
		bdi_queue_work(bdi, work);
	} else {
		struct bdi_writeback *wb = &bdi->wb;

		if (wb->task)
			wake_up_process(wb->task);
	}
}

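/*
 * Kick off writeback against wbc->bdi. WB_SYNC_NONE work is queued
 * asynchronously; WB_SYNC_ALL work is placed on the stack and waited on,
 * since the caller needs the data-integrity guarantee.
 */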
void bdi_start_writeback(struct writeback_control *wbc)
{
	/*
	 * WB_SYNC_NONE is opportunistic writeback. If this allocation fails,
	 * bdi_queue_work() will wake up the thread and flush old data. This
	 * should ensure some amount of progress in freeing memory.
	 */
	if (wbc->sync_mode != WB_SYNC_ALL)
		bdi_alloc_queue_work(wbc->bdi, wbc);
	else {
		struct bdi_work work;

		bdi_work_init(&work, wbc);
		work.state |= WS_ONSTACK;

		bdi_queue_work(wbc->bdi, &work);
		bdi_wait_on_work_clear(&work);
	}
}

/*
 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
 * furthest end of its superblock's dirty-inode list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * already the most-recently-dirtied inode on the b_dirty list.  If that is
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */
static void redirty_tail(struct inode *inode)
{
	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

	if (!list_empty(&wb->b_dirty)) {
		struct inode *tail;

		tail = list_entry(wb->b_dirty.next, struct inode, i_list);
		if (time_before(inode->dirtied_when, tail->dirtied_when))
			inode->dirtied_when = jiffies;
	}
	list_move(&inode->i_list, &wb->b_dirty);
}

/*
 * requeue inode for re-scanning after bdi->b_io list is exhausted.
 */
static void requeue_io(struct inode *inode)
{
	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

	list_move(&inode->i_list, &wb->b_more_io);
}

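/*
 * Wake up anyone waiting on __I_SYNC once I_SYNC has been cleared on this
 * inode.
 */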
static void inode_sync_complete(struct inode *inode)
{
	/*
	 * Prevent speculative execution through spin_unlock(&inode_lock);
	 */
	smp_mb();
	wake_up_bit(&inode->i_state, __I_SYNC);
}

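/*
 * Return true if @inode was dirtied after time @t, guarding against
 * jiffies wraparound on 32-bit systems (see below).
 */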
static bool inode_dirtied_after(struct inode *inode, unsigned long t)
{
	bool ret = time_after(inode->dirtied_when, t);
#ifndef CONFIG_64BIT
	/*
	 * For inodes being constantly redirtied, dirtied_when can get stuck.
	 * It _appears_ to be in the future, but is actually in distant past.
	 * This test is necessary to prevent such wrapped-around relative times
	 * from permanently stopping the whole pdflush writeback.
	 */
	ret = ret && time_before_eq(inode->dirtied_when, jiffies);
#endif
	return ret;
}

/*
 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
 */
static void move_expired_inodes(struct list_head *delaying_queue,
			       struct list_head *dispatch_queue,
				unsigned long *older_than_this)
{
	while (!list_empty(delaying_queue)) {
		struct inode *inode = list_entry(delaying_queue->prev,
						struct inode, i_list);
		if (older_than_this &&
		    inode_dirtied_after(inode, *older_than_this))
			break;
		list_move(&inode->i_list, dispatch_queue);
	}
}

/*
 * Queue all expired dirty inodes for io, eldest first.
 */
static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
{
	list_splice_init(&wb->b_more_io, wb->b_io.prev);
	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
}

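/*
 * Write the inode itself out through the superblock's ->write_inode
 * method, if one exists and the inode has not been marked bad.
 */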
static int write_inode(struct inode *inode, int sync)
{
	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
		return inode->i_sb->s_op->write_inode(inode, sync);
	return 0;
}

/*
 * Wait for writeback on an inode to complete.
 */
static void inode_wait_for_writeback(struct inode *inode)
{
	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
	wait_queue_head_t *wqh;

	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
	do {
		spin_unlock(&inode_lock);
		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
		spin_lock(&inode_lock);
	} while (inode->i_state & I_SYNC);
}

/*
 * Write out an inode's dirty pages.  Called under inode_lock.  Either the
 * caller has ref on the inode (either via __iget or via syscall against an fd)
 * or the inode has I_WILL_FREE set (via generic_forget_inode)
 *
 * If `wait' is set, wait on the writeout.
 *
 * The whole writeout design is quite complex and fragile.  We want to avoid
 * starvation of particular inodes when others are being redirtied, prevent
 * livelocks, etc.
 *
 * Called under inode_lock.
 */
static int
writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
	struct address_space *mapping = inode->i_mapping;
	int wait = wbc->sync_mode == WB_SYNC_ALL;
	unsigned dirty;
	int ret;

	if (!atomic_read(&inode->i_count))
		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
	else
		WARN_ON(inode->i_state & I_WILL_FREE);

	if (inode->i_state & I_SYNC) {
		/*
		 * If this inode is locked for writeback and we are not doing
		 * writeback-for-data-integrity, move it to b_more_io so that
		 * writeback can proceed with the other inodes on s_io.
		 *
		 * We'll have another go at writing back this inode when we
		 * completed a full scan of b_io.
		 */
		if (!wait) {
			requeue_io(inode);
			return 0;
		}

		/*
		 * It's a data-integrity sync.  We must wait.
		 */
		inode_wait_for_writeback(inode);
	}

	BUG_ON(inode->i_state & I_SYNC);

	/* Set I_SYNC, reset I_DIRTY */
	dirty = inode->i_state & I_DIRTY;
	inode->i_state |= I_SYNC;
	inode->i_state &= ~I_DIRTY;

	spin_unlock(&inode_lock);

	ret = do_writepages(mapping, wbc);

	/* Don't write the inode if only I_DIRTY_PAGES was set */
	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
		int err = write_inode(inode, wait);
		if (ret == 0)
			ret = err;
	}

	if (wait) {
		int err = filemap_fdatawait(mapping);
		if (ret == 0)
			ret = err;
	}

	spin_lock(&inode_lock);
	inode->i_state &= ~I_SYNC;
	if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
		if (!(inode->i_state & I_DIRTY) &&
		    mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
			/*
			 * We didn't write back all the pages.  nfs_writepages()
			 * sometimes bales out without doing anything. Redirty
			 * the inode; Move it from b_io onto b_more_io/b_dirty.
			 */
			/*
			 * akpm: if the caller was the kupdate function we put
			 * this inode at the head of b_dirty so it gets first
			 * consideration.  Otherwise, move it to the tail, for
			 * the reasons described there.  I'm not really sure
			 * how much sense this makes.  Presumably I had good
			 * reasons for doing it this way, and I'd rather not
			 * muck with it at present.
			 */
			if (wbc->for_kupdate) {
				/*
				 * For the kupdate function we move the inode
				 * to b_more_io so it will get more writeout as
				 * soon as the queue becomes uncongested.
				 */
				inode->i_state |= I_DIRTY_PAGES;
				if (wbc->nr_to_write <= 0) {
					/*
					 * slice used up: queue for next turn
					 */
					requeue_io(inode);
				} else {
					/*
					 * somehow blocked: retry later
					 */
					redirty_tail(inode);
				}
			} else {
				/*
				 * Otherwise fully redirty the inode so that
				 * other inodes on this superblock will get some
				 * writeout.  Otherwise heavy writing to one
				 * file would indefinitely suspend writeout of
				 * all the other files.
				 */
				inode->i_state |= I_DIRTY_PAGES;
				redirty_tail(inode);
			}
		} else if (inode->i_state & I_DIRTY) {
			/*
			 * Someone redirtied the inode while we were writing back
			 * the pages.
			 */
			redirty_tail(inode);
		} else if (atomic_read(&inode->i_count)) {
			/*
			 * The inode is clean, inuse
			 */
			list_move(&inode->i_list, &inode_in_use);
		} else {
			/*
			 * The inode is clean, unused
			 */
			list_move(&inode->i_list, &inode_unused);
		}
	}
	inode_sync_complete(inode);
	return ret;
}

/*
 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
 * before calling writeback. So make sure that we do pin it, so it doesn't
 * go away while we are writing inodes from it.
 *
 * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
 * 1 if we failed.
 */
static int pin_sb_for_writeback(struct writeback_control *wbc,
				   struct inode *inode)
{
	struct super_block *sb = inode->i_sb;

	/*
	 * Caller must already hold the ref for this
	 */
	if (wbc->sync_mode == WB_SYNC_ALL) {
		WARN_ON(!rwsem_is_locked(&sb->s_umount));
		return 0;
	}

	spin_lock(&sb_lock);
	sb->s_count++;
	if (down_read_trylock(&sb->s_umount)) {
		if (sb->s_root) {
			spin_unlock(&sb_lock);
			return 0;
		}
		/*
		 * umounted, drop rwsem again and fall through to failure
		 */
		up_read(&sb->s_umount);
	}

	sb->s_count--;
	spin_unlock(&sb_lock);
	return 1;
}

static void unpin_sb_for_writeback(struct writeback_control *wbc,
				   struct inode *inode)
{
	struct super_block *sb = inode->i_sb;

	if (wbc->sync_mode == WB_SYNC_ALL)
		return;

	up_read(&sb->s_umount);
	put_super(sb);
}

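/*
 * Write back the dirty inodes queued on this bdi_writeback until
 * wbc->nr_to_write pages have been written or no eligible inodes remain.
 * Called without locks held; inode_lock is taken and dropped internally.
 */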
static void writeback_inodes_wb(struct bdi_writeback *wb,
				struct writeback_control *wbc)
{
	struct super_block *sb = wbc->sb;
	const int is_blkdev_sb = sb_is_blkdev_sb(sb);
	const unsigned long start = jiffies;	/* livelock avoidance */

	spin_lock(&inode_lock);

	if (!wbc->for_kupdate || list_empty(&wb->b_io))
		queue_io(wb, wbc->older_than_this);

	while (!list_empty(&wb->b_io)) {
		struct inode *inode = list_entry(wb->b_io.prev,
						struct inode, i_list);
		long pages_skipped;

		/*
		 * super block given and doesn't match, skip this inode
		 */
		if (sb && sb != inode->i_sb) {
			redirty_tail(inode);
			continue;
		}

		if (!bdi_cap_writeback_dirty(wb->bdi)) {
			redirty_tail(inode);
			if (is_blkdev_sb) {
				/*
				 * Dirty memory-backed blockdev: the ramdisk
				 * driver does this.  Skip just this inode
				 */
				continue;
			}
			/*
			 * Dirty memory-backed inode against a filesystem other
			 * than the kernel-internal bdev filesystem.  Skip the
			 * entire superblock.
			 */
			break;
		}

		if (inode->i_state & (I_NEW | I_WILL_FREE)) {
			requeue_io(inode);
			continue;
		}

		if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
			wbc->encountered_congestion = 1;
			if (!is_blkdev_sb)
				break;		/* Skip a congested fs */
			requeue_io(inode);
			continue;		/* Skip a congested blockdev */
		}

		/*
		 * Was this inode dirtied after sync_sb_inodes was called?
		 * This keeps sync from extra jobs and livelock.
		 */
		if (inode_dirtied_after(inode, start))
			break;

		if (pin_sb_for_writeback(wbc, inode)) {
			requeue_io(inode);
			continue;
		}

		BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
		__iget(inode);
		pages_skipped = wbc->pages_skipped;
		writeback_single_inode(inode, wbc);
		unpin_sb_for_writeback(wbc, inode);
		if (wbc->pages_skipped != pages_skipped) {
			/*
			 * writeback is not making progress due to locked
			 * buffers.  Skip this inode for now.
			 */
			redirty_tail(inode);
		}
		spin_unlock(&inode_lock);
		iput(inode);
		cond_resched();
		spin_lock(&inode_lock);
		if (wbc->nr_to_write <= 0) {
			wbc->more_io = 1;
			break;
		}
		if (!list_empty(&wb->b_more_io))
			wbc->more_io = 1;
	}

	spin_unlock(&inode_lock);
	/* Leave any unwritten inodes on b_io */
}

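/*
 * Convenience wrapper: write back inodes through the default writeback
 * state embedded in wbc->bdi.
 */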
void writeback_inodes_wbc(struct writeback_control *wbc)
{
	struct backing_dev_info *bdi = wbc->bdi;

	writeback_inodes_wb(&bdi->wb, wbc);
}

/*
 * The maximum number of pages to writeout in a single bdi flush/kupdate
 * operation.  We do this so we don't hold I_SYNC against an inode for
 * enormous amounts of time, which would block a userspace task which has
 * been forced to throttle against that inode.  Also, the code reevaluates
 * the dirty state each time it has written this many pages.
 */
#define MAX_WRITEBACK_PAGES     1024

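/*
 * Return true if the global count of dirty pages (plus unstable NFS
 * pages) is at or above the background writeback threshold.
 */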
static inline bool over_bground_thresh(void)
{
	unsigned long background_thresh, dirty_thresh;

	get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);

	return (global_page_state(NR_FILE_DIRTY) +
		global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
}

/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space.  So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval.  But if a writeback event
 * takes longer than a dirty_writeback_interval interval, then leave a
 * one-second gap.
 *
 * older_than_this takes precedence over nr_to_write.  So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 */
static long wb_writeback(struct bdi_writeback *wb,
			 struct wb_writeback_args *args)
{
	struct writeback_control wbc = {
		.bdi			= wb->bdi,
		.sb			= args->sb,
		.sync_mode		= args->sync_mode,
		.older_than_this	= NULL,
		.for_kupdate		= args->for_kupdate,
		.range_cyclic		= args->range_cyclic,
	};
	unsigned long oldest_jif;
	long wrote = 0;

	if (wbc.for_kupdate) {
		wbc.older_than_this = &oldest_jif;
		oldest_jif = jiffies -
				msecs_to_jiffies(dirty_expire_interval * 10);
	}
	if (!wbc.range_cyclic) {
		wbc.range_start = 0;
		wbc.range_end = LLONG_MAX;
	}

	for (;;) {
		/*
		 * Don't flush anything for non-integrity writeback where
		 * no nr_pages was given
		 */
		if (!args->for_kupdate && args->nr_pages <= 0 &&
		     args->sync_mode == WB_SYNC_NONE)
			break;

		/*
		 * If no specific pages were given and this is just a
		 * periodic background writeout and we are below the
		 * background dirty threshold, don't do anything
		 */
		if (args->for_kupdate && args->nr_pages <= 0 &&
		    !over_bground_thresh())
			break;

		wbc.more_io = 0;
		wbc.encountered_congestion = 0;
		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
		wbc.pages_skipped = 0;
		writeback_inodes_wb(wb, &wbc);
		args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;

		/*
		 * If we ran out of stuff to write, bail unless more_io got set
		 */
		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
			if (wbc.more_io && !wbc.for_kupdate)
				continue;
			break;
		}
	}

	return wrote;
}

/*
 * Return the next bdi_work struct that hasn't been processed by this
 * wb thread yet
 */
static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
					   struct bdi_writeback *wb)
{
	struct bdi_work *work, *ret = NULL;

	rcu_read_lock();

	list_for_each_entry_rcu(work, &bdi->work_list, list) {
		if (!test_and_clear_bit(wb->nr, &work->seen))
			continue;

		ret = work;
		break;
	}

	rcu_read_unlock();
	return ret;
}

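/*
 * kupdated()-style periodic writeback: if dirty_writeback_interval has
 * elapsed since the last old-data flush, write back old dirty data and
 * return the number of pages written.
 */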
static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
	unsigned long expired;
	long nr_pages;

	expired = wb->last_old_flush +
			msecs_to_jiffies(dirty_writeback_interval * 10);
	if (time_before(jiffies, expired))
		return 0;

	wb->last_old_flush = jiffies;
	nr_pages = global_page_state(NR_FILE_DIRTY) +
			global_page_state(NR_UNSTABLE_NFS) +
			(inodes_stat.nr_inodes - inodes_stat.nr_unused);

	if (nr_pages) {
		struct wb_writeback_args args = {
			.nr_pages	= nr_pages,
			.sync_mode	= WB_SYNC_NONE,
			.for_kupdate	= 1,
			.range_cyclic	= 1,
		};

		return wb_writeback(wb, &args);
	}

	return 0;
}

/*
 * Retrieve work items and do the writeback they describe
 */
long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{
	struct backing_dev_info *bdi = wb->bdi;
	struct bdi_work *work;
	long wrote = 0;

	while ((work = get_next_work_item(bdi, wb)) != NULL) {
		struct wb_writeback_args args = work->args;

		/*
		 * Override sync mode, in case we must wait for completion
		 */
		if (force_wait)
			work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;

		/*
		 * If this isn't a data integrity operation, just notify
		 * that we have seen this work and we are now starting it.
		 */
		if (args.sync_mode == WB_SYNC_NONE)
			wb_clear_pending(wb, work);

		wrote += wb_writeback(wb, &args);

		/*
		 * This is a data integrity writeback, so only do the
		 * notification when we have completed the work.
		 */
		if (args.sync_mode == WB_SYNC_ALL)
			wb_clear_pending(wb, work);
	}

	/*
	 * Check for periodic writeback, kupdated() style
	 */
	wrote += wb_check_old_data_flush(wb);

	return wrote;
}

/*
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * wakes up periodically and does kupdated style flushing.
 */
int bdi_writeback_task(struct bdi_writeback *wb)
{
	unsigned long last_active = jiffies;
	unsigned long wait_jiffies = -1UL;
	long pages_written;

	while (!kthread_should_stop()) {
		pages_written = wb_do_writeback(wb, 0);

		if (pages_written)
			last_active = jiffies;
		else if (wait_jiffies != -1UL) {
			unsigned long max_idle;

			/*
			 * Longest period of inactivity that we tolerate. If we
			 * see dirty data again later, the task will get
			 * recreated automatically.
			 */
			max_idle = max(5UL * 60 * HZ, wait_jiffies);
			if (time_after(jiffies, max_idle + last_active))
				break;
		}

		wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
		set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(wait_jiffies);
		try_to_freeze();
	}

	return 0;
}

/*
 * Schedule writeback for all backing devices. Can only be used for
 * WB_SYNC_NONE writeback, WB_SYNC_ALL should use bdi_start_writeback()
 * and pass in the superblock.
 */
static void bdi_writeback_all(struct writeback_control *wbc)
{
	struct backing_dev_info *bdi;

	WARN_ON(wbc->sync_mode == WB_SYNC_ALL);

	rcu_read_lock();

	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
		if (!bdi_has_dirty_io(bdi))
			continue;

		bdi_alloc_queue_work(bdi, wbc);
	}

	rcu_read_unlock();
}

/*
 * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
 * the whole world.
 */
void wakeup_flusher_threads(long nr_pages)
{
	struct writeback_control wbc = {
		.sync_mode	= WB_SYNC_NONE,
		.older_than_this = NULL,
		.range_cyclic	= 1,
	};

	if (nr_pages == 0)
		nr_pages = global_page_state(NR_FILE_DIRTY) +
				global_page_state(NR_UNSTABLE_NFS);
	wbc.nr_to_write = nr_pages;
	bdi_writeback_all(&wbc);
}

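/*
 * When block_dump is enabled, log which task dirtied which inode to the
 * kernel log. Internal bdev inodes with a zero inode number are skipped.
 */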
static noinline void block_dump___mark_inode_dirty(struct inode *inode)
{
	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
		struct dentry *dentry;
		const char *name = "?";

		dentry = d_find_alias(inode);
		if (dentry) {
			spin_lock(&dentry->d_lock);
			name = (const char *) dentry->d_name.name;
		}
		printk(KERN_DEBUG
		       "%s(%d): dirtied inode %lu (%s) on %s\n",
		       current->comm, task_pid_nr(current), inode->i_ino,
		       name, inode->i_sb->s_id);
		if (dentry) {
			spin_unlock(&dentry->d_lock);
			dput(dentry);
		}
	}
}

/**
 *	__mark_inode_dirty -	internal function
 *	@inode: inode to mark
 *	@flags: what kind of dirty (i.e. I_DIRTY_SYNC)
 *	Mark an inode as dirty. Callers should use mark_inode_dirty or
 *  	mark_inode_dirty_sync.
 *
 * Put the inode on the super block's dirty list.
 *
 * CAREFUL! We mark it dirty unconditionally, but move it onto the
 * dirty list only if it is hashed or if it refers to a blockdev.
 * If it was not hashed, it will never be added to the dirty list
 * even if it is later hashed, as it will have been marked dirty already.
 *
 * In short, make sure you hash any inodes _before_ you start marking
 * them dirty.
 *
 * This function *must* be atomic for the I_DIRTY_PAGES case -
 * set_page_dirty() is called under spinlock in several places.
 *
 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
 * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
 * the kernel-internal blockdev inode represents the dirtying time of the
 * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
 * page->mapping->host, so the page-dirtying time is recorded in the internal
 * blockdev inode.
 */
void __mark_inode_dirty(struct inode *inode, int flags)
{
	struct super_block *sb = inode->i_sb;

	/*
	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
	 * dirty the inode itself
	 */
	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
		if (sb->s_op->dirty_inode)
			sb->s_op->dirty_inode(inode);
	}

	/*
	 * make sure that changes are seen by all cpus before we test i_state
	 * -- mikulas
	 */
	smp_mb();

	/* avoid the locking if we can */
	if ((inode->i_state & flags) == flags)
		return;

	if (unlikely(block_dump))
		block_dump___mark_inode_dirty(inode);

	spin_lock(&inode_lock);
	if ((inode->i_state & flags) != flags) {
		const int was_dirty = inode->i_state & I_DIRTY;

		inode->i_state |= flags;

		/*
		 * If the inode is being synced, just update its dirty state.
		 * The unlocker will place the inode on the appropriate
		 * superblock list, based upon its state.
		 */
		if (inode->i_state & I_SYNC)
			goto out;

		/*
		 * Only add valid (hashed) inodes to the superblock's
		 * dirty list.  Add blockdev inodes as well.
		 */
		if (!S_ISBLK(inode->i_mode)) {