rd.c 14 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
/*
 * ramdisk.c - Multiple RAM disk driver - gzip-loading version - v. 0.8 beta.
 *
 * (C) Chad Page, Theodore Ts'o, et. al, 1995.
 *
 * This RAM disk is designed to have filesystems created on it and mounted
 * just like a regular floppy disk.
 *
 * It also does something suggested by Linus: use the buffer cache as the
 * RAM disk data.  This makes it possible to dynamically allocate the RAM disk
 * buffer - with some consequences I have to deal with as I write this.
 *
 * This code is based on the original ramdisk.c, written mostly by
 * Theodore Ts'o (TYT) in 1991.  The code was largely rewritten by
 * Chad Page to use the buffer cache to store the RAM disk data in
 * 1995; Theodore then took over the driver again, and cleaned it up
 * for inclusion in the mainline kernel.
 *
 * The original CRAMDISK code was written by Richard Lyons, and
 * adapted by Chad Page to use the new RAM disk interface.  Theodore
 * Ts'o rewrote it so that both the compressed RAM disk loader and the
 * kernel decompressor uses the same inflate.c codebase.  The RAM disk
 * loader now also loads into a dynamic (buffer cache based) RAM disk,
 * not the old static RAM disk.  Support for the old static RAM disk has
 * been completely removed.
 *
 * Loadable module support added by Tom Dyas.
 *
 * Further cleanups by Chad Page (page0588@sundance.sjsu.edu):
 *	Cosmetic changes in #ifdef MODULE, code movement, etc.
 * 	When the RAM disk module is removed, free the protected buffers
 * 	Default RAM disk size changed to 2.88 MB
 *
 *  Added initrd: Werner Almesberger & Hans Lermen, Feb '96
 *
 * 4/25/96 : Made RAM disk size a parameter (default is now 4 MB)
 *		- Chad Page
 *
 * Add support for fs images split across >1 disk, Paul Gortmaker, Mar '98
 *
 * Make block size and block size shift for RAM disks a global macro
 * and set blk_size for -ENOSPC,     Werner Fink <werner@suse.de>, Apr '99
 */

#include <linux/string.h>
#include <linux/slab.h>
#include <asm/atomic.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>
#include <linux/buffer_head.h>		/* for invalidate_bdev() */
#include <linux/backing-dev.h>
#include <linux/blkpg.h>
#include <linux/writeback.h>

#include <asm/uaccess.h>

/* Various static variables go here.  Most are used only in the RAM disk code.
 */

static struct gendisk *rd_disks[CONFIG_BLK_DEV_RAM_COUNT];
static struct block_device *rd_bdev[CONFIG_BLK_DEV_RAM_COUNT];/* Protected device data */
static struct request_queue *rd_queue[CONFIG_BLK_DEV_RAM_COUNT];

/*
 * Parameters for the boot-loading of the RAM disk.  These are set by
 * init/main.c (from arguments to the kernel command line) or from the
 * architecture-specific setup routine (from the stored boot sector
 * information).
 */
75
int rd_size = CONFIG_BLK_DEV_RAM_SIZE;		/* Size of the RAM disks */
Linus Torvalds's avatar
Linus Torvalds committed
76
77
78
79
80
81
82
83
84
85
86
/*
 * It would be very desirable to have a soft-blocksize (that in the case
 * of the ramdisk driver is also the hardblocksize ;) of PAGE_SIZE because
 * doing that we'll achieve a far better MM footprint. Using a rd_blocksize of
 * BLOCK_SIZE in the worst case we'll make PAGE_SIZE/BLOCK_SIZE buffer-pages
 * unfreeable. With a rd_blocksize of PAGE_SIZE instead we are sure that only
 * 1 page will be protected. Depending on the size of the ramdisk you
 * may want to change the ramdisk blocksize to achieve a better or worse MM
 * behaviour. The default is still BLOCK_SIZE (needed by rd_load_image that
 * supposes the filesystem in the image uses a BLOCK_SIZE blocksize).
 */
87
static int rd_blocksize = CONFIG_BLK_DEV_RAM_BLOCKSIZE;
Linus Torvalds's avatar
Linus Torvalds committed
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154

/*
 * Copyright (C) 2000 Linus Torvalds.
 *               2000 Transmeta Corp.
 * aops copied from ramfs.
 */

/*
 * If a ramdisk page has buffers, some may be uptodate and some may be not.
 * To bring the page uptodate we zero out the non-uptodate buffers.  The
 * page must be locked.
 */
static void make_page_uptodate(struct page *page)
{
	if (page_has_buffers(page)) {
		struct buffer_head *bh = page_buffers(page);
		struct buffer_head *head = bh;

		do {
			if (!buffer_uptodate(bh)) {
				memset(bh->b_data, 0, bh->b_size);
				/*
				 * akpm: I'm totally undecided about this.  The
				 * buffer has just been magically brought "up to
				 * date", but nobody should want to be reading
				 * it anyway, because it hasn't been used for
				 * anything yet.  It is still in a "not read
				 * from disk yet" state.
				 *
				 * But non-uptodate buffers against an uptodate
				 * page are against the rules.  So do it anyway.
				 */
				 set_buffer_uptodate(bh);
			}
		} while ((bh = bh->b_this_page) != head);
	} else {
		memset(page_address(page), 0, PAGE_CACHE_SIZE);
	}
	flush_dcache_page(page);
	SetPageUptodate(page);
}

static int ramdisk_readpage(struct file *file, struct page *page)
{
	if (!PageUptodate(page))
		make_page_uptodate(page);
	unlock_page(page);
	return 0;
}

static int ramdisk_prepare_write(struct file *file, struct page *page,
				unsigned offset, unsigned to)
{
	if (!PageUptodate(page))
		make_page_uptodate(page);
	return 0;
}

static int ramdisk_commit_write(struct file *file, struct page *page,
				unsigned offset, unsigned to)
{
	set_page_dirty(page);
	return 0;
}

/*
 * ->writepage to the the blockdev's mapping has to redirty the page so that the
155
 * VM doesn't go and steal it.  We return AOP_WRITEPAGE_ACTIVATE so that the VM
Linus Torvalds's avatar
Linus Torvalds committed
156
157
158
159
160
161
162
163
164
165
 * won't try to (pointlessly) write the page again for a while.
 *
 * Really, these pages should not be on the LRU at all.
 */
static int ramdisk_writepage(struct page *page, struct writeback_control *wbc)
{
	if (!PageUptodate(page))
		make_page_uptodate(page);
	SetPageDirty(page);
	if (wbc->for_reclaim)
166
		return AOP_WRITEPAGE_ACTIVATE;
Linus Torvalds's avatar
Linus Torvalds committed
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
	unlock_page(page);
	return 0;
}

/*
 * This is a little speedup thing: short-circuit attempts to write back the
 * ramdisk blockdev inode to its non-existent backing store.
 */
static int ramdisk_writepages(struct address_space *mapping,
				struct writeback_control *wbc)
{
	return 0;
}

/*
 * ramdisk blockdev pages have their own ->set_page_dirty() because we don't
 * want them to contribute to dirty memory accounting.
 */
static int ramdisk_set_page_dirty(struct page *page)
{
187
188
	if (!TestSetPageDirty(page))
		return 1;
Linus Torvalds's avatar
Linus Torvalds committed
189
190
191
	return 0;
}

192
static const struct address_space_operations ramdisk_aops = {
Linus Torvalds's avatar
Linus Torvalds committed
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
	.readpage	= ramdisk_readpage,
	.prepare_write	= ramdisk_prepare_write,
	.commit_write	= ramdisk_commit_write,
	.writepage	= ramdisk_writepage,
	.set_page_dirty	= ramdisk_set_page_dirty,
	.writepages	= ramdisk_writepages,
};

static int rd_blkdev_pagecache_IO(int rw, struct bio_vec *vec, sector_t sector,
				struct address_space *mapping)
{
	pgoff_t index = sector >> (PAGE_CACHE_SHIFT - 9);
	unsigned int vec_offset = vec->bv_offset;
	int offset = (sector << 9) & ~PAGE_CACHE_MASK;
	int size = vec->bv_len;
	int err = 0;

	do {
		int count;
		struct page *page;
		char *src;
		char *dst;

		count = PAGE_CACHE_SIZE - offset;
		if (count > size)
			count = size;
		size -= count;

		page = grab_cache_page(mapping, index);
		if (!page) {
			err = -ENOMEM;
			goto out;
		}

		if (!PageUptodate(page))
			make_page_uptodate(page);

		index++;

		if (rw == READ) {
			src = kmap_atomic(page, KM_USER0) + offset;
			dst = kmap_atomic(vec->bv_page, KM_USER1) + vec_offset;
		} else {
			src = kmap_atomic(vec->bv_page, KM_USER0) + vec_offset;
			dst = kmap_atomic(page, KM_USER1) + offset;
		}
		offset = 0;
		vec_offset += count;

		memcpy(dst, src, count);

		kunmap_atomic(src, KM_USER0);
		kunmap_atomic(dst, KM_USER1);

		if (rw == READ)
			flush_dcache_page(vec->bv_page);
		else
			set_page_dirty(page);
		unlock_page(page);
		put_page(page);
	} while (size);

 out:
	return err;
}

/*
 *  Basically, my strategy here is to set up a buffer-head which can't be
 *  deleted, and make that my Ramdisk.  If the request is outside of the
 *  allocated size, we must get rid of it...
 *
 * 19-JAN-1998  Richard Gooch <rgooch@atnf.csiro.au>  Added devfs support
 *
 */
static int rd_make_request(request_queue_t *q, struct bio *bio)
{
	struct block_device *bdev = bio->bi_bdev;
	struct address_space * mapping = bdev->bd_inode->i_mapping;
	sector_t sector = bio->bi_sector;
	unsigned long len = bio->bi_size >> 9;
	int rw = bio_data_dir(bio);
	struct bio_vec *bvec;
	int ret = 0, i;

	if (sector + len > get_capacity(bdev->bd_disk))
		goto fail;

	if (rw==READA)
		rw=READ;

	bio_for_each_segment(bvec, bio, i) {
		ret |= rd_blkdev_pagecache_IO(rw, bvec, sector, mapping);
		sector += bvec->bv_len >> 9;
	}
	if (ret)
		goto fail;

	bio_endio(bio, bio->bi_size, 0);
	return 0;
fail:
	bio_io_error(bio, bio->bi_size);
	return 0;
} 

static int rd_ioctl(struct inode *inode, struct file *file,
			unsigned int cmd, unsigned long arg)
{
	int error;
	struct block_device *bdev = inode->i_bdev;

	if (cmd != BLKFLSBUF)
		return -ENOTTY;

	/*
	 * special: we want to release the ramdisk memory, it's not like with
	 * the other blockdevices where this ioctl only flushes away the buffer
	 * cache
	 */
	error = -EBUSY;
312
	mutex_lock(&bdev->bd_mutex);
Linus Torvalds's avatar
Linus Torvalds committed
313
314
315
316
	if (bdev->bd_openers <= 2) {
		truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
		error = 0;
	}
317
	mutex_unlock(&bdev->bd_mutex);
Linus Torvalds's avatar
Linus Torvalds committed
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
	return error;
}

/*
 * This is the backing_dev_info for the blockdev inode itself.  It doesn't need
 * writeback and it does not contribute to dirty memory accounting.
 */
static struct backing_dev_info rd_backing_dev_info = {
	.ra_pages	= 0,	/* No readahead */
	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK | BDI_CAP_MAP_COPY,
	.unplug_io_fn	= default_unplug_io_fn,
};

/*
 * This is the backing_dev_info for the files which live atop the ramdisk
 * "device".  These files do need writeback and they do contribute to dirty
 * memory accounting.
 */
static struct backing_dev_info rd_file_backing_dev_info = {
	.ra_pages	= 0,	/* No readahead */
	.capabilities	= BDI_CAP_MAP_COPY,	/* Does contribute to dirty memory */
	.unplug_io_fn	= default_unplug_io_fn,
};

static int rd_open(struct inode *inode, struct file *filp)
{
	unsigned unit = iminor(inode);

	if (rd_bdev[unit] == NULL) {
		struct block_device *bdev = inode->i_bdev;
		struct address_space *mapping;
		unsigned bsize;
350
		gfp_t gfp_mask;
Linus Torvalds's avatar
Linus Torvalds committed
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434

		inode = igrab(bdev->bd_inode);
		rd_bdev[unit] = bdev;
		bdev->bd_openers++;
		bsize = bdev_hardsect_size(bdev);
		bdev->bd_block_size = bsize;
		inode->i_blkbits = blksize_bits(bsize);
		inode->i_size = get_capacity(bdev->bd_disk)<<9;

		mapping = inode->i_mapping;
		mapping->a_ops = &ramdisk_aops;
		mapping->backing_dev_info = &rd_backing_dev_info;
		bdev->bd_inode_backing_dev_info = &rd_file_backing_dev_info;

		/*
		 * Deep badness.  rd_blkdev_pagecache_IO() needs to allocate
		 * pagecache pages within a request_fn.  We cannot recur back
		 * into the filesytem which is mounted atop the ramdisk, because
		 * that would deadlock on fs locks.  And we really don't want
		 * to reenter rd_blkdev_pagecache_IO when we're already within
		 * that function.
		 *
		 * So we turn off __GFP_FS and __GFP_IO.
		 *
		 * And to give this thing a hope of working, turn on __GFP_HIGH.
		 * Hopefully, there's enough regular memory allocation going on
		 * for the page allocator emergency pools to keep the ramdisk
		 * driver happy.
		 */
		gfp_mask = mapping_gfp_mask(mapping);
		gfp_mask &= ~(__GFP_FS|__GFP_IO);
		gfp_mask |= __GFP_HIGH;
		mapping_set_gfp_mask(mapping, gfp_mask);
	}

	return 0;
}

static struct block_device_operations rd_bd_op = {
	.owner =	THIS_MODULE,
	.open =		rd_open,
	.ioctl =	rd_ioctl,
};

/*
 * Before freeing the module, invalidate all of the protected buffers!
 */
static void __exit rd_cleanup(void)
{
	int i;

	for (i = 0; i < CONFIG_BLK_DEV_RAM_COUNT; i++) {
		struct block_device *bdev = rd_bdev[i];
		rd_bdev[i] = NULL;
		if (bdev) {
			invalidate_bdev(bdev, 1);
			blkdev_put(bdev);
		}
		del_gendisk(rd_disks[i]);
		put_disk(rd_disks[i]);
		blk_cleanup_queue(rd_queue[i]);
	}
	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
}

/*
 * This is the registration and initialization section of the RAM disk driver
 */
static int __init rd_init(void)
{
	int i;
	int err = -ENOMEM;

	if (rd_blocksize > PAGE_SIZE || rd_blocksize < 512 ||
			(rd_blocksize & (rd_blocksize-1))) {
		printk("RAMDISK: wrong blocksize %d, reverting to defaults\n",
		       rd_blocksize);
		rd_blocksize = BLOCK_SIZE;
	}

	for (i = 0; i < CONFIG_BLK_DEV_RAM_COUNT; i++) {
		rd_disks[i] = alloc_disk(1);
		if (!rd_disks[i])
			goto out;
435
436
437
438
439
440

		rd_queue[i] = blk_alloc_queue(GFP_KERNEL);
		if (!rd_queue[i]) {
			put_disk(rd_disks[i]);
			goto out;
		}
Linus Torvalds's avatar
Linus Torvalds committed
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
	}

	if (register_blkdev(RAMDISK_MAJOR, "ramdisk")) {
		err = -EIO;
		goto out;
	}

	for (i = 0; i < CONFIG_BLK_DEV_RAM_COUNT; i++) {
		struct gendisk *disk = rd_disks[i];

		blk_queue_make_request(rd_queue[i], &rd_make_request);
		blk_queue_hardsect_size(rd_queue[i], rd_blocksize);

		/* rd_size is given in kB */
		disk->major = RAMDISK_MAJOR;
		disk->first_minor = i;
		disk->fops = &rd_bd_op;
		disk->queue = rd_queue[i];
		disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
		sprintf(disk->disk_name, "ram%d", i);
		set_capacity(disk, rd_size * 2);
		add_disk(rd_disks[i]);
	}

	/* rd_size is given in kB */
	printk("RAMDISK driver initialized: "
		"%d RAM disks of %dK size %d blocksize\n",
		CONFIG_BLK_DEV_RAM_COUNT, rd_size, rd_blocksize);

	return 0;
out:
	while (i--) {
		put_disk(rd_disks[i]);
		blk_cleanup_queue(rd_queue[i]);
	}
	return err;
}

module_init(rd_init);
module_exit(rd_cleanup);

/* options - nonmodular */
#ifndef MODULE
static int __init ramdisk_size(char *str)
{
	rd_size = simple_strtol(str,NULL,0);
	return 1;
}
static int __init ramdisk_size2(char *str)	/* kludge */
{
	return ramdisk_size(str);
}
static int __init ramdisk_blocksize(char *str)
{
	rd_blocksize = simple_strtol(str,NULL,0);
	return 1;
}
__setup("ramdisk=", ramdisk_size);
__setup("ramdisk_size=", ramdisk_size2);
__setup("ramdisk_blocksize=", ramdisk_blocksize);
#endif

/* options - modular */
module_param(rd_size, int, 0);
MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
module_param(rd_blocksize, int, 0);
MODULE_PARM_DESC(rd_blocksize, "Blocksize of each RAM disk in bytes.");
MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);

MODULE_LICENSE("GPL");