ctree.h 73.4 KB
Newer Older
Chris Mason's avatar
Chris Mason committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

Chris Mason's avatar
Chris Mason committed
19
20
#ifndef __BTRFS_CTREE__
#define __BTRFS_CTREE__
21

22
#include <linux/version.h>
23
24
#include <linux/mm.h>
#include <linux/highmem.h>
Chris Mason's avatar
Chris Mason committed
25
#include <linux/fs.h>
26
#include <linux/completion.h>
Chris Mason's avatar
Chris Mason committed
27
#include <linux/backing-dev.h>
28
#include <linux/wait.h>
29
#include <asm/kmap_types.h>
30
#include "extent_io.h"
31
#include "extent_map.h"
32
#include "async-thread.h"
Chris Mason's avatar
Chris Mason committed
33

34
struct btrfs_trans_handle;
Chris Mason's avatar
Chris Mason committed
35
struct btrfs_transaction;
36
37
38
extern struct kmem_cache *btrfs_trans_handle_cachep;
extern struct kmem_cache *btrfs_transaction_cachep;
extern struct kmem_cache *btrfs_bit_radix_cachep;
Chris Mason's avatar
Chris Mason committed
39
extern struct kmem_cache *btrfs_path_cachep;
40
struct btrfs_ordered_sum;
41

42
#define BTRFS_MAGIC "_BHRfS_M"
43

44
#define BTRFS_MAX_LEVEL 8
45

46
47
#define BTRFS_COMPAT_EXTENT_TREE_V0

48
49
50
51
52
53
54
/*
 * files bigger than this get some pre-flushing when they are added
 * to the ordered operations list.  That way we limit the total
 * work done by the commit
 */
#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)

55
/* holds pointers to all of the tree roots */
56
#define BTRFS_ROOT_TREE_OBJECTID 1ULL
57
58

/* stores information about which extents are in use, and reference counts */
Chris Mason's avatar
Chris Mason committed
59
#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
60
61
62
63
64

/*
 * chunk tree stores translations from logical -> physical block numbering
 * the super block points to the chunk tree
 */
65
#define BTRFS_CHUNK_TREE_OBJECTID 3ULL
66
67
68
69
70

/*
 * stores information about which areas of a given device are in use.
 * one per device.  The tree of tree roots points to the device tree
 */
71
72
73
74
75
76
77
#define BTRFS_DEV_TREE_OBJECTID 4ULL

/* one per subvolume, storing files and directories */
#define BTRFS_FS_TREE_OBJECTID 5ULL

/* directory objectid inside the root tree */
#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
78

79
80
81
/* holds checksums of all the data extents */
#define BTRFS_CSUM_TREE_OBJECTID 7ULL

82
83
84
/* orhpan objectid for tracking unlinked/truncated files */
#define BTRFS_ORPHAN_OBJECTID -5ULL

85
86
87
88
/* does write ahead logging to speed up fsyncs */
#define BTRFS_TREE_LOG_OBJECTID -6ULL
#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL

Zheng Yan's avatar
Zheng Yan committed
89
90
91
92
/* for space balancing */
#define BTRFS_TREE_RELOC_OBJECTID -8ULL
#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL

93
94
95
96
97
98
99
/*
 * extent checksums all have this objectid
 * this allows them to share the logging tree
 * for fsyncs
 */
#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL

Zheng Yan's avatar
Zheng Yan committed
100
101
102
/* dummy objectid represents multiple objectids */
#define BTRFS_MULTIPLE_OBJECTIDS -255ULL

103
/*
104
 * All files have objectids in this range.
105
 */
106
#define BTRFS_FIRST_FREE_OBJECTID 256ULL
107
#define BTRFS_LAST_FREE_OBJECTID -256ULL
108
#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
109

110
111
112
113
114
115
116

/*
 * the device items go into the chunk tree.  The key is in the form
 * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
 */
#define BTRFS_DEV_ITEMS_OBJECTID 1ULL

Chris Mason's avatar
Chris Mason committed
117
118
119
120
121
122
/*
 * we can actually store much bigger names, but lets not confuse the rest
 * of linux
 */
#define BTRFS_NAME_LEN 255

Chris Mason's avatar
Chris Mason committed
123
124
/* 32 bytes in various csum fields */
#define BTRFS_CSUM_SIZE 32
125
126
127
128
129
130

/* csum types */
#define BTRFS_CSUM_TYPE_CRC32	0

static int btrfs_csum_sizes[] = { 4, 0 };

131
/* four bytes for CRC32 */
132
#define BTRFS_EMPTY_DIR_SIZE 0
Chris Mason's avatar
Chris Mason committed
133

Chris Mason's avatar
Chris Mason committed
134
135
136
137
138
139
140
141
#define BTRFS_FT_UNKNOWN	0
#define BTRFS_FT_REG_FILE	1
#define BTRFS_FT_DIR		2
#define BTRFS_FT_CHRDEV		3
#define BTRFS_FT_BLKDEV		4
#define BTRFS_FT_FIFO		5
#define BTRFS_FT_SOCK		6
#define BTRFS_FT_SYMLINK	7
Josef Bacik's avatar
Josef Bacik committed
142
143
#define BTRFS_FT_XATTR		8
#define BTRFS_FT_MAX		9
Chris Mason's avatar
Chris Mason committed
144

145
/*
Wu Fengguang's avatar
Wu Fengguang committed
146
147
148
149
150
151
152
153
154
 * The key defines the order in the tree, and so it also defines (optimal)
 * block layout.
 *
 * objectid corresponds to the inode number.
 *
 * type tells us things about the object, and is a kind of stream selector.
 * so for a given inode, keys with type of 1 might refer to the inode data,
 * type of 2 may point to file data in the btree and type == 3 may point to
 * extents.
155
156
 *
 * offset is the starting byte offset for this key in the stream.
Chris Mason's avatar
Chris Mason committed
157
158
159
160
 *
 * btrfs_disk_key is in disk byte order.  struct btrfs_key is always
 * in cpu native order.  Otherwise they are identical and their sizes
 * should be the same (ie both packed)
161
 */
Chris Mason's avatar
Chris Mason committed
162
163
struct btrfs_disk_key {
	__le64 objectid;
164
	u8 type;
165
	__le64 offset;
Chris Mason's avatar
Chris Mason committed
166
167
168
} __attribute__ ((__packed__));

struct btrfs_key {
169
	u64 objectid;
170
	u8 type;
171
	u64 offset;
172
173
} __attribute__ ((__packed__));

174
175
176
177
struct btrfs_mapping_tree {
	struct extent_map_tree map_tree;
};

178
#define BTRFS_UUID_SIZE 16
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
struct btrfs_dev_item {
	/* the internal btrfs device id */
	__le64 devid;

	/* size of the device */
	__le64 total_bytes;

	/* bytes used */
	__le64 bytes_used;

	/* optimal io alignment for this device */
	__le32 io_align;

	/* optimal io width for this device */
	__le32 io_width;

	/* minimal io size for this device */
	__le32 sector_size;

	/* type and info about this device */
	__le64 type;

Yan Zheng's avatar
Yan Zheng committed
201
202
203
	/* expected generation for this device */
	__le64 generation;

204
205
	/*
	 * starting byte of this partition on the device,
Wu Fengguang's avatar
Wu Fengguang committed
206
	 * to allow for stripe alignment in the future
207
208
209
	 */
	__le64 start_offset;

210
211
212
213
214
215
216
217
218
	/* grouping information for allocation decisions */
	__le32 dev_group;

	/* seek speed 0-100 where 100 is fastest */
	u8 seek_speed;

	/* bandwidth 0-100 where 100 is fastest */
	u8 bandwidth;

219
	/* btrfs generated uuid for this device */
220
	u8 uuid[BTRFS_UUID_SIZE];
Yan Zheng's avatar
Yan Zheng committed
221
222
223

	/* uuid of FS who owns this device */
	u8 fsid[BTRFS_UUID_SIZE];
224
225
226
227
228
} __attribute__ ((__packed__));

struct btrfs_stripe {
	__le64 devid;
	__le64 offset;
229
	u8 dev_uuid[BTRFS_UUID_SIZE];
230
231
232
} __attribute__ ((__packed__));

struct btrfs_chunk {
233
234
235
236
	/* size of this chunk in bytes */
	__le64 length;

	/* objectid of the root referencing this chunk */
237
	__le64 owner;
238

239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
	__le64 stripe_len;
	__le64 type;

	/* optimal io alignment for this chunk */
	__le32 io_align;

	/* optimal io width for this chunk */
	__le32 io_width;

	/* minimal io size for this chunk */
	__le32 sector_size;

	/* 2^16 stripes is quite a lot, a second limit is the size of a single
	 * item in the btree
	 */
	__le16 num_stripes;
Chris Mason's avatar
Chris Mason committed
255
256
257

	/* sub stripes only matter for raid10 */
	__le16 sub_stripes;
258
259
260
261
262
263
264
265
266
267
268
	struct btrfs_stripe stripe;
	/* additional stripes go here */
} __attribute__ ((__packed__));

static inline unsigned long btrfs_chunk_item_size(int num_stripes)
{
	BUG_ON(num_stripes == 0);
	return sizeof(struct btrfs_chunk) +
		sizeof(struct btrfs_stripe) * (num_stripes - 1);
}

269
#define BTRFS_FSID_SIZE 16
270
271
272
273
274
275
276
277
278
279
280
281
#define BTRFS_HEADER_FLAG_WRITTEN	(1ULL << 0)
#define BTRFS_HEADER_FLAG_RELOC		(1ULL << 1)
#define BTRFS_SUPER_FLAG_SEEDING	(1ULL << 32)
#define BTRFS_SUPER_FLAG_METADUMP	(1ULL << 33)

#define BTRFS_BACKREF_REV_MAX		256
#define BTRFS_BACKREF_REV_SHIFT		56
#define BTRFS_BACKREF_REV_MASK		(((u64)BTRFS_BACKREF_REV_MAX - 1) << \
					 BTRFS_BACKREF_REV_SHIFT)

#define BTRFS_OLD_BACKREF_REV		0
#define BTRFS_MIXED_BACKREF_REV		1
282

283
284
285
/*
 * every tree block (leaf or node) starts with this header.
 */
286
struct btrfs_header {
287
	/* these first four must match the super block */
Chris Mason's avatar
Chris Mason committed
288
	u8 csum[BTRFS_CSUM_SIZE];
289
	u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
290
	__le64 bytenr; /* which block this node is supposed to live in */
291
	__le64 flags;
292
293
294

	/* allowed to be different from the super from here on down */
	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
295
	__le64 generation;
296
	__le64 owner;
297
	__le32 nritems;
298
	u8 level;
299
300
} __attribute__ ((__packed__));

301
#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
302
303
				      sizeof(struct btrfs_header)) / \
				     sizeof(struct btrfs_key_ptr))
304
#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
305
#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize))
306
307
308
#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
					sizeof(struct btrfs_item) - \
					sizeof(struct btrfs_file_extent_item))
309

310
311
312
313
314
315

/*
 * this is a very generous portion of the super block, giving us
 * room to translate 14 chunks with 3 stripes each.
 */
#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
316
#define BTRFS_LABEL_SIZE 256
317

318
319
320
321
/*
 * the super block basically lists the main trees of the FS
 * it currently lacks any block count etc etc
 */
322
struct btrfs_super_block {
Chris Mason's avatar
Chris Mason committed
323
	u8 csum[BTRFS_CSUM_SIZE];
324
	/* the first 4 fields must match struct btrfs_header */
Yan Zheng's avatar
Yan Zheng committed
325
	u8 fsid[BTRFS_FSID_SIZE];    /* FS specific uuid */
326
	__le64 bytenr; /* this block number */
327
	__le64 flags;
328
329

	/* allowed to be different from the btrfs_header from here own down */
330
331
332
	__le64 magic;
	__le64 generation;
	__le64 root;
333
	__le64 chunk_root;
334
	__le64 log_root;
335
336
337

	/* this will help find the new super based on the log root */
	__le64 log_root_transid;
338
339
	__le64 total_bytes;
	__le64 bytes_used;
340
	__le64 root_dir_objectid;
341
	__le64 num_devices;
342
343
344
	__le32 sectorsize;
	__le32 nodesize;
	__le32 leafsize;
345
	__le32 stripesize;
346
	__le32 sys_chunk_array_size;
347
	__le64 chunk_root_generation;
348
349
350
	__le64 compat_flags;
	__le64 compat_ro_flags;
	__le64 incompat_flags;
351
	__le16 csum_type;
352
	u8 root_level;
353
	u8 chunk_root_level;
354
	u8 log_root_level;
355
	struct btrfs_dev_item dev_item;
356

357
	char label[BTRFS_LABEL_SIZE];
358
359
360

	/* future expansion */
	__le64 reserved[32];
361
	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
Chris Mason's avatar
Chris Mason committed
362
363
} __attribute__ ((__packed__));

364
365
366
367
/*
 * Compat flags that we support.  If any incompat flags are set other than the
 * ones specified below then we will fail to mount
 */
368
369
370
371
372
373
#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)

#define BTRFS_FEATURE_COMPAT_SUPP		0ULL
#define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
#define BTRFS_FEATURE_INCOMPAT_SUPP		\
	BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF
374

375
/*
376
 * A leaf is full of items. offset and size tell us where to find
377
378
 * the item in the leaf (relative to the start of the data area)
 */
Chris Mason's avatar
Chris Mason committed
379
struct btrfs_item {
Chris Mason's avatar
Chris Mason committed
380
	struct btrfs_disk_key key;
381
	__le32 offset;
382
	__le32 size;
383
384
} __attribute__ ((__packed__));

385
386
387
388
389
390
391
/*
 * leaves have an item area and a data area:
 * [item0, item1....itemN] [free space] [dataN...data1, data0]
 *
 * The data is separate from the items to get the keys closer together
 * during searches.
 */
392
struct btrfs_leaf {
393
	struct btrfs_header header;
394
	struct btrfs_item items[];
395
396
} __attribute__ ((__packed__));

397
398
399
400
/*
 * all non-leaf blocks are nodes, they hold only keys and pointers to
 * other blocks
 */
401
402
403
struct btrfs_key_ptr {
	struct btrfs_disk_key key;
	__le64 blockptr;
404
	__le64 generation;
405
406
} __attribute__ ((__packed__));

407
struct btrfs_node {
408
	struct btrfs_header header;
409
	struct btrfs_key_ptr ptrs[];
410
411
} __attribute__ ((__packed__));

412
/*
413
414
 * btrfs_paths remember the path taken from the root down to the leaf.
 * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
415
416
417
418
419
 * to any other levels that are present.
 *
 * The slots array records the index of the item or block pointer
 * used while walking the tree.
 */
420
struct btrfs_path {
421
	struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
422
	int slots[BTRFS_MAX_LEVEL];
423
424
	/* if there is real range locking, this locks field will change */
	int locks[BTRFS_MAX_LEVEL];
425
	int reada;
426
	/* keep some upper locks as we walk down */
427
	int lowest_level;
428
429
430
431
432

	/*
	 * set by btrfs_split_item, tells search_slot to keep all locks
	 * and to force calls to keep space in the nodes
	 */
433
434
435
436
	unsigned int search_for_split:1;
	unsigned int keep_locks:1;
	unsigned int skip_locking:1;
	unsigned int leave_spinning:1;
437
	unsigned int search_commit_root:1;
438
};
Chris Mason's avatar
Chris Mason committed
439

440
441
442
443
/*
 * items in the extent btree are used to record the objectid of the
 * owner of the block and the number of references
 */
444

445
struct btrfs_extent_item {
446
447
448
449
450
451
	__le64 refs;
	__le64 generation;
	__le64 flags;
} __attribute__ ((__packed__));

struct btrfs_extent_item_v0 {
452
	__le32 refs;
453
454
} __attribute__ ((__packed__));

455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \
					sizeof(struct btrfs_item))

#define BTRFS_EXTENT_FLAG_DATA		(1ULL << 0)
#define BTRFS_EXTENT_FLAG_TREE_BLOCK	(1ULL << 1)

/* following flags only apply to tree blocks */

/* use full backrefs for extent pointers in the block */
#define BTRFS_BLOCK_FLAG_FULL_BACKREF	(1ULL << 8)

struct btrfs_tree_block_info {
	struct btrfs_disk_key key;
	u8 level;
} __attribute__ ((__packed__));

struct btrfs_extent_data_ref {
	__le64 root;
	__le64 objectid;
	__le64 offset;
	__le32 count;
} __attribute__ ((__packed__));

struct btrfs_shared_data_ref {
	__le32 count;
} __attribute__ ((__packed__));

struct btrfs_extent_inline_ref {
	u8 type;
	u64 offset;
} __attribute__ ((__packed__));

/* old style backrefs item */
struct btrfs_extent_ref_v0 {
489
490
491
	__le64 root;
	__le64 generation;
	__le64 objectid;
492
	__le32 count;
493
494
} __attribute__ ((__packed__));

495

496
497
/* dev extents record free space on individual devices.  The owner
 * field points back to the chunk allocation mapping tree that allocated
498
 * the extent.  The chunk tree uuid field is a way to double check the owner
499
500
 */
struct btrfs_dev_extent {
501
502
503
	__le64 chunk_tree;
	__le64 chunk_objectid;
	__le64 chunk_offset;
504
	__le64 length;
505
	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
506
507
} __attribute__ ((__packed__));

508
struct btrfs_inode_ref {
509
	__le64 index;
510
511
512
513
	__le16 name_len;
	/* name goes here */
} __attribute__ ((__packed__));

514
struct btrfs_timespec {
Chris Mason's avatar
Chris Mason committed
515
	__le64 sec;
Chris Mason's avatar
Chris Mason committed
516
517
518
	__le32 nsec;
} __attribute__ ((__packed__));

Jan Engelhardt's avatar
Jan Engelhardt committed
519
enum btrfs_compression_type {
520
521
522
	BTRFS_COMPRESS_NONE = 0,
	BTRFS_COMPRESS_ZLIB = 1,
	BTRFS_COMPRESS_LAST = 2,
Jan Engelhardt's avatar
Jan Engelhardt committed
523
};
524

Chris Mason's avatar
Chris Mason committed
525
struct btrfs_inode_item {
526
	/* nfs style generation number */
Chris Mason's avatar
Chris Mason committed
527
	__le64 generation;
528
529
	/* transid that last touched this inode */
	__le64 transid;
Chris Mason's avatar
Chris Mason committed
530
	__le64 size;
531
	__le64 nbytes;
532
	__le64 block_group;
Chris Mason's avatar
Chris Mason committed
533
534
535
536
	__le32 nlink;
	__le32 uid;
	__le32 gid;
	__le32 mode;
537
	__le64 rdev;
538
	__le64 flags;
539

540
541
542
543
544
545
546
547
	/* modification sequence number for NFS */
	__le64 sequence;

	/*
	 * a little future expansion, for more than this we can
	 * just grow the inode item and version it
	 */
	__le64 reserved[4];
548
549
550
551
	struct btrfs_timespec atime;
	struct btrfs_timespec ctime;
	struct btrfs_timespec mtime;
	struct btrfs_timespec otime;
Chris Mason's avatar
Chris Mason committed
552
553
} __attribute__ ((__packed__));

554
555
556
557
struct btrfs_dir_log_item {
	__le64 end;
} __attribute__ ((__packed__));

558
struct btrfs_dir_item {
559
	struct btrfs_disk_key location;
560
	__le64 transid;
Josef Bacik's avatar
Josef Bacik committed
561
	__le16 data_len;
562
	__le16 name_len;
563
564
565
566
	u8 type;
} __attribute__ ((__packed__));

struct btrfs_root_item {
567
	struct btrfs_inode_item inode;
568
	__le64 generation;
569
	__le64 root_dirid;
570
571
572
	__le64 bytenr;
	__le64 byte_limit;
	__le64 bytes_used;
Yan Zheng's avatar
Yan Zheng committed
573
	__le64 last_snapshot;
574
	__le64 flags;
575
	__le32 refs;
576
577
	struct btrfs_disk_key drop_progress;
	u8 drop_level;
578
	u8 level;
579
} __attribute__ ((__packed__));
580

581
582
583
584
585
586
587
588
589
/*
 * this is used for both forward and backward root refs
 */
struct btrfs_root_ref {
	__le64 dirid;
	__le64 sequence;
	__le16 name_len;
} __attribute__ ((__packed__));

Yan Zheng's avatar
Yan Zheng committed
590
591
592
#define BTRFS_FILE_EXTENT_INLINE 0
#define BTRFS_FILE_EXTENT_REG 1
#define BTRFS_FILE_EXTENT_PREALLOC 2
593

594
struct btrfs_file_extent_item {
595
596
597
	/*
	 * transaction id that created this extent
	 */
598
	__le64 generation;
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
	/*
	 * max number of bytes to hold this extent in ram
	 * when we split a compressed extent we can't know how big
	 * each of the resulting pieces will be.  So, this is
	 * an upper limit on the size of the extent in ram instead of
	 * an exact limit.
	 */
	__le64 ram_bytes;

	/*
	 * 32 bits for the various ways we might encode the data,
	 * including compression and encryption.  If any of these
	 * are set to something a given disk format doesn't understand
	 * it is treated like an incompat flag for reading and writing,
	 * but not for stat.
	 */
	u8 compression;
	u8 encryption;
	__le16 other_encoding; /* spare for later use */

	/* are we inline data or a real extent? */
620
	u8 type;
621

622
623
624
625
	/*
	 * disk space consumed by the extent, checksum blocks are included
	 * in these numbers
	 */
626
627
	__le64 disk_bytenr;
	__le64 disk_num_bytes;
628
	/*
Chris Mason's avatar
Chris Mason committed
629
	 * the logical offset in file blocks (no csums)
630
631
632
633
634
635
636
	 * this extent record is for.  This allows a file extent to point
	 * into the middle of an existing extent on disk, sharing it
	 * between two snapshots (useful if some bytes in the middle of the
	 * extent have changed
	 */
	__le64 offset;
	/*
637
638
	 * the logical number of file blocks (no csums included).  This
	 * always reflects the size uncompressed and without encoding.
639
	 */
640
	__le64 num_bytes;
641

642
643
} __attribute__ ((__packed__));

Chris Mason's avatar
Chris Mason committed
644
struct btrfs_csum_item {
645
	u8 csum;
Chris Mason's avatar
Chris Mason committed
646
647
} __attribute__ ((__packed__));

648
649
650
651
/* different types of block groups (and chunks) */
#define BTRFS_BLOCK_GROUP_DATA     (1 << 0)
#define BTRFS_BLOCK_GROUP_SYSTEM   (1 << 1)
#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
652
#define BTRFS_BLOCK_GROUP_RAID0    (1 << 3)
653
#define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
654
#define BTRFS_BLOCK_GROUP_DUP	   (1 << 5)
Chris Mason's avatar
Chris Mason committed
655
#define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
Chris Mason's avatar
Chris Mason committed
656

Chris Mason's avatar
Chris Mason committed
657
658
struct btrfs_block_group_item {
	__le64 used;
659
660
	__le64 chunk_objectid;
	__le64 flags;
Chris Mason's avatar
Chris Mason committed
661
662
} __attribute__ ((__packed__));

663
664
struct btrfs_space_info {
	u64 flags;
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685

	u64 total_bytes;	/* total bytes in the space */
	u64 bytes_used;		/* total bytes used on disk */
	u64 bytes_pinned;	/* total bytes pinned, will be freed when the
				   transaction finishes */
	u64 bytes_reserved;	/* total bytes the allocator has reserved for
				   current allocations */
	u64 bytes_readonly;	/* total bytes that are read only */

	/* delalloc accounting */
	u64 bytes_delalloc;	/* number of bytes reserved for allocation,
				   this space is not necessarily reserved yet
				   by the allocator */
	u64 bytes_may_use;	/* number of bytes that may be used for
				   delalloc */

	int full;		/* indicates that we cannot allocate any more
				   chunks for this space */
	int force_alloc;	/* set if we need to force a chunk alloc for
				   this space */

686
	struct list_head list;
687
688
689
690

	/* for block groups in our same type */
	struct list_head block_groups;
	spinlock_t lock;
691
	struct rw_semaphore groups_sem;
692
693
};

694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
/*
 * free clusters are used to claim free space in relatively large chunks,
 * allowing us to do less seeky writes.  They are used for all metadata
 * allocations and data allocations in ssd mode.
 */
struct btrfs_free_cluster {
	spinlock_t lock;
	spinlock_t refill_lock;
	struct rb_root root;

	/* largest extent in this cluster */
	u64 max_size;

	/* first extent starting offset */
	u64 window_start;

	struct btrfs_block_group_cache *block_group;
	/*
	 * when a cluster is allocated from a block group, we put the
	 * cluster onto a list in the block group so that it can
	 * be freed before the block group is freed.
	 */
	struct list_head block_group_list;
717
718
};

Chris Mason's avatar
Chris Mason committed
719
720
721
struct btrfs_block_group_cache {
	struct btrfs_key key;
	struct btrfs_block_group_item item;
722
	spinlock_t lock;
723
	struct mutex cache_mutex;
724
	u64 pinned;
725
	u64 reserved;
726
727
	u64 flags;
	int cached;
728
	int ro;
729
730
731
732
733
	int dirty;

	struct btrfs_space_info *space_info;

	/* free space cache stuff */
734
	spinlock_t tree_lock;
735
736
737
738
739
740
741
742
	struct rb_root free_space_bytes;
	struct rb_root free_space_offset;

	/* block group cache stuff */
	struct rb_node cache_node;

	/* for block groups in the same raid type */
	struct list_head list;
743
744
745

	/* usage count */
	atomic_t count;
746
747
748
749
750

	/* List of struct btrfs_free_clusters for this block group.
	 * Today it will only have one thing on it, but that may change
	 */
	struct list_head cluster_list;
Chris Mason's avatar
Chris Mason committed
751
};
752

753
struct reloc_control;
754
struct btrfs_device;
755
struct btrfs_fs_devices;
756
struct btrfs_fs_info {
757
	u8 fsid[BTRFS_FSID_SIZE];
758
	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
759
760
	struct btrfs_root *extent_root;
	struct btrfs_root *tree_root;
761
762
	struct btrfs_root *chunk_root;
	struct btrfs_root *dev_root;
763
	struct btrfs_root *fs_root;
764
	struct btrfs_root *csum_root;
765
766
767

	/* the log root tree is a directory of all the other log roots */
	struct btrfs_root *log_root_tree;
768
	struct radix_tree_root fs_roots_radix;
769

770
771
772
773
	/* block group cache stuff */
	spinlock_t block_group_cache_lock;
	struct rb_root block_group_cache_tree;

774
	struct extent_io_tree pinned_extents;
775

776
777
778
	/* logical->physical extent mapping */
	struct btrfs_mapping_tree mapping_tree;

779
	u64 generation;
780
	u64 last_trans_committed;
781
782
783
784
785
786

	/*
	 * this is updated to the current trans every time a full commit
	 * is required instead of the faster short fsync log commits
	 */
	u64 last_trans_log_full_commit;
787
	u64 open_ioctl_trans;
788
	unsigned long mount_opt;
789
	u64 max_extent;
790
	u64 max_inline;
791
	u64 alloc_start;
Chris Mason's avatar
Chris Mason committed
792
	struct btrfs_transaction *running_transaction;
793
	wait_queue_head_t transaction_throttle;
794
	wait_queue_head_t transaction_wait;
795
	wait_queue_head_t async_submit_wait;
796

797
	struct btrfs_super_block super_copy;
798
	struct btrfs_super_block super_for_commit;
799
	struct block_device *__bdev;
Chris Mason's avatar
Chris Mason committed
800
	struct super_block *sb;
801
	struct inode *btree_inode;
Chris Mason's avatar
Chris Mason committed
802
	struct backing_dev_info bdi;
Chris Mason's avatar
Chris Mason committed
803
	struct mutex trans_mutex;
804
	struct mutex tree_log_mutex;
805
806
	struct mutex transaction_kthread_mutex;
	struct mutex cleaner_mutex;
807
	struct mutex chunk_mutex;
808
	struct mutex drop_mutex;
809
	struct mutex volume_mutex;
Zheng Yan's avatar
Zheng Yan committed
810
	struct mutex tree_reloc_mutex;
811

812
813
814
815
816
817
818
819
820
	/*
	 * this protects the ordered operations list only while we are
	 * processing all of the entries on it.  This way we make
	 * sure the commit code doesn't find the list temporarily empty
	 * because another function happens to be doing non-waiting preflush
	 * before jumping into the main commit.
	 */
	struct mutex ordered_operations_mutex;

Chris Mason's avatar
Chris Mason committed
821
	struct list_head trans_list;
822
	struct list_head hashers;
823
	struct list_head dead_roots;
824

825
	atomic_t nr_async_submits;
826
	atomic_t async_submit_draining;
827
	atomic_t nr_async_bios;
828
	atomic_t async_delalloc_pages;
829

830
831
832
833
834
	/*
	 * this is used by the balancing code to wait for all the pending
	 * ordered extents
	 */
	spinlock_t ordered_extent_lock;
835
836
837
838
839
840

	/*
	 * all of the data=ordered extents pending writeback
	 * these can span multiple transactions and basically include
	 * every dirty data page that isn't from nodatacow
	 */
841
	struct list_head ordered_extents;
842
843
844
845
846
847

	/*
	 * all of the inodes that have delalloc bytes.  It is possible for
	 * this list to be empty even when there is still dirty data=ordered
	 * extents waiting to finish IO.
	 */
848
	struct list_head delalloc_inodes;
849

850
851
852
853
854
855
856
	/*
	 * special rename and truncate targets that must be on disk before
	 * we're allowed to commit.  This is basically the ext3 style
	 * data=ordered list.
	 */
	struct list_head ordered_operations;

857
858
859
860
861
862
	/*
	 * there is a pool of worker threads for checksumming during writes
	 * and a pool for checksumming after reads.  This is because readers
	 * can run with FS locks held, and the writers may be waiting for
	 * those locks.  We don't want ordering in the pending list to cause
	 * deadlocks, and so the two are serviced separately.
863
864
865
	 *
	 * A third pool does submit_bio to avoid deadlocking with the other
	 * two
866
867
	 */
	struct btrfs_workers workers;
868
	struct btrfs_workers delalloc_workers;
869
	struct btrfs_workers endio_workers;
870
	struct btrfs_workers endio_meta_workers;
871
	struct btrfs_workers endio_meta_write_workers;
872
	struct btrfs_workers endio_write_workers;
873
	struct btrfs_workers submit_workers;
874
875
876
877
878
879
	/*
	 * fixup workers take dirty pages that didn't properly go through
	 * the cow mechanism and make them safe to write.  It happens
	 * for the sys_munmap function call path
	 */
	struct btrfs_workers fixup_workers;
880
881
	struct task_struct *transaction_kthread;
	struct task_struct *cleaner_kthread;
882
	int thread_pool_size;
883

884
885
	struct kobject super_kobj;
	struct completion kobj_unregister;
886
	int do_barriers;
887
	int closing;
888
	int log_root_recovering;
889

890
	u64 total_pinned;
891
892
893
894
895

	/* protected by the delalloc lock, used to keep from writing
	 * metadata until there is a nice batch
	 */
	u64 dirty_metadata_bytes;
896
897
	struct list_head dirty_cowonly_roots;

898
	struct btrfs_fs_devices *fs_devices;
899
900
901
902
903
904

	/*
	 * the space_info list is almost entirely read only.  It only changes
	 * when we add a new raid type to the FS, and that happens
	 * very rarely.  RCU is used to protect it.
	 */
905
	struct list_head space_info;
906

907
908
	struct reloc_control *reloc_ctl;

909
	spinlock_t delalloc_lock;
910
	spinlock_t new_trans_lock;
911
	u64 delalloc_bytes;
912
913
914
915
916
917

	/* data_alloc_cluster is only used in ssd mode */
	struct btrfs_free_cluster data_alloc_cluster;

	/* all metadata allocations go through this cluster */
	struct btrfs_free_cluster meta_alloc_cluster;
918

Yan Zheng's avatar
Yan Zheng committed
919
920
921
	spinlock_t ref_cache_lock;
	u64 total_ref_cache_size;

922
923
924
925
926
927
	u64 avail_data_alloc_bits;
	u64 avail_metadata_alloc_bits;
	u64 avail_system_alloc_bits;
	u64 data_alloc_profile;
	u64 metadata_alloc_profile;
	u64 system_alloc_profile;
928

929
930
931
	unsigned data_chunk_allocations;
	unsigned metadata_ratio;

932
	void *bdev_holder;
933
};
934

935
936
/*
 * in ram representation of the tree.  extent_root is used for all allocations
937
 * and for the extent tree extent_root root.
938
939
 */
struct btrfs_root {
940
	struct extent_buffer *node;
941
942
943
944

	/* the node lock is held while changing the node pointer */
	spinlock_t node_lock;

945
	struct extent_buffer *commit_root;
946
	struct btrfs_root *log_root;
Zheng Yan's avatar
Zheng Yan committed
947
	struct btrfs_root *reloc_root;
Yan Zheng's avatar
Yan Zheng committed
948

949
950
	struct btrfs_root_item root_item;
	struct btrfs_key root_key;
951
	struct btrfs_fs_info *fs_info;
952
953
	struct extent_io_tree dirty_log_pages;

954
955
	struct kobject root_kobj;
	struct completion kobj_unregister;
956
	struct mutex objectid_mutex;
Yan Zheng's avatar
Yan Zheng committed
957

958
	struct mutex log_mutex;
Yan Zheng's avatar
Yan Zheng committed
959
960
961
962
963
964
	wait_queue_head_t log_writer_wait;
	wait_queue_head_t log_commit_wait[2];
	atomic_t log_writers;
	atomic_t log_commit[2];
	unsigned long log_transid;
	unsigned long log_batch;
965

966
967
	u64 objectid;
	u64 last_trans;
968
969
970
971
972
973
974
975
976
977

	/* data allocations are done in sectorsize units */
	u32 sectorsize;

	/* node allocations are done in nodesize units */
	u32 nodesize;

	/* leaf allocations are done in leafsize units */
	u32 leafsize;

978
979
	u32 stripesize;

980
	u32 type;
Chris Mason's avatar
Chris Mason committed
981
982
	u64 highest_inode;
	u64 last_inode_alloc;
983
	int ref_cows;
984
	int track_dirty;
985
	u64 defrag_trans_start;
986
	struct btrfs_key defrag_progress;
987
	struct btrfs_key defrag_max;
988
989
	int defrag_running;
	int defrag_level;
990
	char *name;
991
	int in_sysfs;
992
993
994

	/* the dirty list is only used by non-reference counted roots */
	struct list_head dirty_list;
995

996
997
	struct list_head root_list;

998
	spinlock_t list_lock;
999
	struct list_head orphan_list;
1000

1001
1002
1003
1004
	spinlock_t inode_lock;
	/* red-black tree that keeps track of in-memory inodes */
	struct rb_root inode_tree;

1005
1006
1007
1008
1009
	/*
	 * right now this just gets used so that a root has its own devid
	 * for stat.  It may be used for more later
	 */
	struct super_block anon_super;
1010
1011
};

Chris Mason's avatar
Chris Mason committed
1012
1013
1014
1015
1016
/*
 * inode items have the data typically returned from stat and store other
 * info about object characteristics.  There is one for every file and dir in
 * the FS
 */
Chris Mason's avatar
Chris Mason committed
1017
#define BTRFS_INODE_ITEM_KEY		1
1018
1019
1020
#define BTRFS_INODE_REF_KEY		12
#define BTRFS_XATTR_ITEM_KEY		24
#define BTRFS_ORPHAN_ITEM_KEY		48
Chris Mason's avatar
Chris Mason committed
1021
/* reserve 2-15 close to the inode for later flexibility */
Chris Mason's avatar
Chris Mason committed
1022
1023
1024
1025
1026

/*
 * dir items are the name -> inode pointers in a directory.  There is one
 * for every name in a directory.
 */
1027
1028
1029
1030
#define BTRFS_DIR_LOG_ITEM_KEY  60
#define BTRFS_DIR_LOG_INDEX_KEY 72
#define BTRFS_DIR_ITEM_KEY	84
#define BTRFS_DIR_INDEX_KEY	96
Chris Mason's avatar
Chris Mason committed
1031
/*
Chris Mason's avatar
Chris Mason committed
1032
 * extent data is for file data
Chris Mason's avatar
Chris Mason committed
1033
 */
1034
#define BTRFS_EXTENT_DATA_KEY	108
1035

Chris Mason's avatar
Chris Mason committed
1036
/*
1037
1038
 * extent csums are stored in a separate tree and hold csums for
 * an entire extent on disk.
Chris Mason's avatar
Chris Mason committed
1039
 */
1040
#define BTRFS_EXTENT_CSUM_KEY	128
Chris Mason's avatar
Chris Mason committed
1041

Chris Mason's avatar
Chris Mason committed
1042
/*
Wu Fengguang's avatar
Wu Fengguang committed
1043
 * root items point to tree roots.  They are typically in the root