ctree.h 73.4 KB
Newer Older
Chris Mason's avatar
Chris Mason committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

Chris Mason's avatar
Chris Mason committed
19
20
#ifndef __BTRFS_CTREE__
#define __BTRFS_CTREE__
21

22
#include <linux/version.h>
23
24
#include <linux/mm.h>
#include <linux/highmem.h>
Chris Mason's avatar
Chris Mason committed
25
#include <linux/fs.h>
26
#include <linux/completion.h>
Chris Mason's avatar
Chris Mason committed
27
#include <linux/backing-dev.h>
28
#include <linux/wait.h>
29
#include <asm/kmap_types.h>
30
#include "extent_io.h"
31
#include "extent_map.h"
32
#include "async-thread.h"
Chris Mason's avatar
Chris Mason committed
33

34
struct btrfs_trans_handle;
Chris Mason's avatar
Chris Mason committed
35
struct btrfs_transaction;
36
37
38
extern struct kmem_cache *btrfs_trans_handle_cachep;
extern struct kmem_cache *btrfs_transaction_cachep;
extern struct kmem_cache *btrfs_bit_radix_cachep;
Chris Mason's avatar
Chris Mason committed
39
extern struct kmem_cache *btrfs_path_cachep;
40
struct btrfs_ordered_sum;
41

42
#define BTRFS_MAGIC "_BHRfS_M"
43

Josef Bacik's avatar
Josef Bacik committed
44
45
#define BTRFS_ACL_NOT_CACHED    ((void *)-1)

46
#define BTRFS_MAX_LEVEL 8
47

48
49
#define BTRFS_COMPAT_EXTENT_TREE_V0

50
51
52
53
54
55
56
/*
 * files bigger than this get some pre-flushing when they are added
 * to the ordered operations list.  That way we limit the total
 * work done by the commit
 */
#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)

57
/* holds pointers to all of the tree roots */
58
#define BTRFS_ROOT_TREE_OBJECTID 1ULL
59
60

/* stores information about which extents are in use, and reference counts */
Chris Mason's avatar
Chris Mason committed
61
#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
62
63
64
65
66

/*
 * chunk tree stores translations from logical -> physical block numbering
 * the super block points to the chunk tree
 */
67
#define BTRFS_CHUNK_TREE_OBJECTID 3ULL
68
69
70
71
72

/*
 * stores information about which areas of a given device are in use.
 * one per device.  The tree of tree roots points to the device tree
 */
73
74
75
76
77
78
79
#define BTRFS_DEV_TREE_OBJECTID 4ULL

/* one per subvolume, storing files and directories */
#define BTRFS_FS_TREE_OBJECTID 5ULL

/* directory objectid inside the root tree */
#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
80

81
82
83
/* holds checksums of all the data extents */
#define BTRFS_CSUM_TREE_OBJECTID 7ULL

84
85
86
/* orhpan objectid for tracking unlinked/truncated files */
#define BTRFS_ORPHAN_OBJECTID -5ULL

87
88
89
90
/* does write ahead logging to speed up fsyncs */
#define BTRFS_TREE_LOG_OBJECTID -6ULL
#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL

Zheng Yan's avatar
Zheng Yan committed
91
92
93
94
/* for space balancing */
#define BTRFS_TREE_RELOC_OBJECTID -8ULL
#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL

95
96
97
98
99
100
101
/*
 * extent checksums all have this objectid
 * this allows them to share the logging tree
 * for fsyncs
 */
#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL

Zheng Yan's avatar
Zheng Yan committed
102
103
104
/* dummy objectid represents multiple objectids */
#define BTRFS_MULTIPLE_OBJECTIDS -255ULL

105
/*
106
 * All files have objectids in this range.
107
 */
108
#define BTRFS_FIRST_FREE_OBJECTID 256ULL
109
#define BTRFS_LAST_FREE_OBJECTID -256ULL
110
#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
111

112
113
114
115
116
117
118

/*
 * the device items go into the chunk tree.  The key is in the form
 * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
 */
#define BTRFS_DEV_ITEMS_OBJECTID 1ULL

Chris Mason's avatar
Chris Mason committed
119
120
121
122
123
124
/*
 * we can actually store much bigger names, but lets not confuse the rest
 * of linux
 */
#define BTRFS_NAME_LEN 255

Chris Mason's avatar
Chris Mason committed
125
126
/* 32 bytes in various csum fields */
#define BTRFS_CSUM_SIZE 32
127
128
129
130
131
132

/* csum types */
#define BTRFS_CSUM_TYPE_CRC32	0

static int btrfs_csum_sizes[] = { 4, 0 };

133
/* four bytes for CRC32 */
134
#define BTRFS_EMPTY_DIR_SIZE 0
Chris Mason's avatar
Chris Mason committed
135

Chris Mason's avatar
Chris Mason committed
136
137
138
139
140
141
142
143
#define BTRFS_FT_UNKNOWN	0
#define BTRFS_FT_REG_FILE	1
#define BTRFS_FT_DIR		2
#define BTRFS_FT_CHRDEV		3
#define BTRFS_FT_BLKDEV		4
#define BTRFS_FT_FIFO		5
#define BTRFS_FT_SOCK		6
#define BTRFS_FT_SYMLINK	7
Josef Bacik's avatar
Josef Bacik committed
144
145
#define BTRFS_FT_XATTR		8
#define BTRFS_FT_MAX		9
Chris Mason's avatar
Chris Mason committed
146

147
/*
Wu Fengguang's avatar
Wu Fengguang committed
148
149
150
151
152
153
154
155
156
 * The key defines the order in the tree, and so it also defines (optimal)
 * block layout.
 *
 * objectid corresponds to the inode number.
 *
 * type tells us things about the object, and is a kind of stream selector.
 * so for a given inode, keys with type of 1 might refer to the inode data,
 * type of 2 may point to file data in the btree and type == 3 may point to
 * extents.
157
158
 *
 * offset is the starting byte offset for this key in the stream.
Chris Mason's avatar
Chris Mason committed
159
160
161
162
 *
 * btrfs_disk_key is in disk byte order.  struct btrfs_key is always
 * in cpu native order.  Otherwise they are identical and their sizes
 * should be the same (ie both packed)
163
 */
Chris Mason's avatar
Chris Mason committed
164
165
struct btrfs_disk_key {
	__le64 objectid;
166
	u8 type;
167
	__le64 offset;
Chris Mason's avatar
Chris Mason committed
168
169
170
} __attribute__ ((__packed__));

struct btrfs_key {
171
	u64 objectid;
172
	u8 type;
173
	u64 offset;
174
175
} __attribute__ ((__packed__));

176
177
178
179
struct btrfs_mapping_tree {
	struct extent_map_tree map_tree;
};

180
#define BTRFS_UUID_SIZE 16
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
struct btrfs_dev_item {
	/* the internal btrfs device id */
	__le64 devid;

	/* size of the device */
	__le64 total_bytes;

	/* bytes used */
	__le64 bytes_used;

	/* optimal io alignment for this device */
	__le32 io_align;

	/* optimal io width for this device */
	__le32 io_width;

	/* minimal io size for this device */
	__le32 sector_size;

	/* type and info about this device */
	__le64 type;

Yan Zheng's avatar
Yan Zheng committed
203
204
205
	/* expected generation for this device */
	__le64 generation;

206
207
	/*
	 * starting byte of this partition on the device,
Wu Fengguang's avatar
Wu Fengguang committed
208
	 * to allow for stripe alignment in the future
209
210
211
	 */
	__le64 start_offset;

212
213
214
215
216
217
218
219
220
	/* grouping information for allocation decisions */
	__le32 dev_group;

	/* seek speed 0-100 where 100 is fastest */
	u8 seek_speed;

	/* bandwidth 0-100 where 100 is fastest */
	u8 bandwidth;

221
	/* btrfs generated uuid for this device */
222
	u8 uuid[BTRFS_UUID_SIZE];
Yan Zheng's avatar
Yan Zheng committed
223
224
225

	/* uuid of FS who owns this device */
	u8 fsid[BTRFS_UUID_SIZE];
226
227
228
229
230
} __attribute__ ((__packed__));

struct btrfs_stripe {
	__le64 devid;
	__le64 offset;
231
	u8 dev_uuid[BTRFS_UUID_SIZE];
232
233
234
} __attribute__ ((__packed__));

struct btrfs_chunk {
235
236
237
238
	/* size of this chunk in bytes */
	__le64 length;

	/* objectid of the root referencing this chunk */
239
	__le64 owner;
240

241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
	__le64 stripe_len;
	__le64 type;

	/* optimal io alignment for this chunk */
	__le32 io_align;

	/* optimal io width for this chunk */
	__le32 io_width;

	/* minimal io size for this chunk */
	__le32 sector_size;

	/* 2^16 stripes is quite a lot, a second limit is the size of a single
	 * item in the btree
	 */
	__le16 num_stripes;
Chris Mason's avatar
Chris Mason committed
257
258
259

	/* sub stripes only matter for raid10 */
	__le16 sub_stripes;
260
261
262
263
264
265
266
267
268
269
270
	struct btrfs_stripe stripe;
	/* additional stripes go here */
} __attribute__ ((__packed__));

static inline unsigned long btrfs_chunk_item_size(int num_stripes)
{
	BUG_ON(num_stripes == 0);
	return sizeof(struct btrfs_chunk) +
		sizeof(struct btrfs_stripe) * (num_stripes - 1);
}

271
#define BTRFS_FSID_SIZE 16
272
273
274
275
276
277
278
279
280
281
282
283
#define BTRFS_HEADER_FLAG_WRITTEN	(1ULL << 0)
#define BTRFS_HEADER_FLAG_RELOC		(1ULL << 1)
#define BTRFS_SUPER_FLAG_SEEDING	(1ULL << 32)
#define BTRFS_SUPER_FLAG_METADUMP	(1ULL << 33)

#define BTRFS_BACKREF_REV_MAX		256
#define BTRFS_BACKREF_REV_SHIFT		56
#define BTRFS_BACKREF_REV_MASK		(((u64)BTRFS_BACKREF_REV_MAX - 1) << \
					 BTRFS_BACKREF_REV_SHIFT)

#define BTRFS_OLD_BACKREF_REV		0
#define BTRFS_MIXED_BACKREF_REV		1
284

285
286
287
/*
 * every tree block (leaf or node) starts with this header.
 */
288
struct btrfs_header {
289
	/* these first four must match the super block */
Chris Mason's avatar
Chris Mason committed
290
	u8 csum[BTRFS_CSUM_SIZE];
291
	u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
292
	__le64 bytenr; /* which block this node is supposed to live in */
293
	__le64 flags;
294
295
296

	/* allowed to be different from the super from here on down */
	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
297
	__le64 generation;
298
	__le64 owner;
299
	__le32 nritems;
300
	u8 level;
301
302
} __attribute__ ((__packed__));

303
#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
304
305
				      sizeof(struct btrfs_header)) / \
				     sizeof(struct btrfs_key_ptr))
306
#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
307
#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize))
308
309
310
#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
					sizeof(struct btrfs_item) - \
					sizeof(struct btrfs_file_extent_item))
311

312
313
314
315
316
317

/*
 * this is a very generous portion of the super block, giving us
 * room to translate 14 chunks with 3 stripes each.
 */
#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
318
#define BTRFS_LABEL_SIZE 256
319

320
321
322
323
/*
 * the super block basically lists the main trees of the FS
 * it currently lacks any block count etc etc
 */
324
struct btrfs_super_block {
Chris Mason's avatar
Chris Mason committed
325
	u8 csum[BTRFS_CSUM_SIZE];
326
	/* the first 4 fields must match struct btrfs_header */
Yan Zheng's avatar
Yan Zheng committed
327
	u8 fsid[BTRFS_FSID_SIZE];    /* FS specific uuid */
328
	__le64 bytenr; /* this block number */
329
	__le64 flags;
330
331

	/* allowed to be different from the btrfs_header from here own down */
332
333
334
	__le64 magic;
	__le64 generation;
	__le64 root;
335
	__le64 chunk_root;
336
	__le64 log_root;
337
338
339

	/* this will help find the new super based on the log root */
	__le64 log_root_transid;
340
341
	__le64 total_bytes;
	__le64 bytes_used;
342
	__le64 root_dir_objectid;
343
	__le64 num_devices;
344
345
346
	__le32 sectorsize;
	__le32 nodesize;
	__le32 leafsize;
347
	__le32 stripesize;
348
	__le32 sys_chunk_array_size;
349
	__le64 chunk_root_generation;
350
351
352
	__le64 compat_flags;
	__le64 compat_ro_flags;
	__le64 incompat_flags;
353
	__le16 csum_type;
354
	u8 root_level;
355
	u8 chunk_root_level;
356
	u8 log_root_level;
357
	struct btrfs_dev_item dev_item;
358

359
	char label[BTRFS_LABEL_SIZE];
360
361
362

	/* future expansion */
	__le64 reserved[32];
363
	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
Chris Mason's avatar
Chris Mason committed
364
365
} __attribute__ ((__packed__));

366
367
368
369
/*
 * Compat flags that we support.  If any incompat flags are set other than the
 * ones specified below then we will fail to mount
 */
370
371
372
373
374
375
#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)

#define BTRFS_FEATURE_COMPAT_SUPP		0ULL
#define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
#define BTRFS_FEATURE_INCOMPAT_SUPP		\
	BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF
376

377
/*
378
 * A leaf is full of items. offset and size tell us where to find
379
380
 * the item in the leaf (relative to the start of the data area)
 */
Chris Mason's avatar
Chris Mason committed
381
struct btrfs_item {
Chris Mason's avatar
Chris Mason committed
382
	struct btrfs_disk_key key;
383
	__le32 offset;
384
	__le32 size;
385
386
} __attribute__ ((__packed__));

387
388
389
390
391
392
393
/*
 * leaves have an item area and a data area:
 * [item0, item1....itemN] [free space] [dataN...data1, data0]
 *
 * The data is separate from the items to get the keys closer together
 * during searches.
 */
394
struct btrfs_leaf {
395
	struct btrfs_header header;
396
	struct btrfs_item items[];
397
398
} __attribute__ ((__packed__));

399
400
401
402
/*
 * all non-leaf blocks are nodes, they hold only keys and pointers to
 * other blocks
 */
403
404
405
struct btrfs_key_ptr {
	struct btrfs_disk_key key;
	__le64 blockptr;
406
	__le64 generation;
407
408
} __attribute__ ((__packed__));

409
struct btrfs_node {
410
	struct btrfs_header header;
411
	struct btrfs_key_ptr ptrs[];
412
413
} __attribute__ ((__packed__));

414
/*
415
416
 * btrfs_paths remember the path taken from the root down to the leaf.
 * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
417
418
419
420
421
 * to any other levels that are present.
 *
 * The slots array records the index of the item or block pointer
 * used while walking the tree.
 */
422
struct btrfs_path {
423
	struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
424
	int slots[BTRFS_MAX_LEVEL];
425
426
	/* if there is real range locking, this locks field will change */
	int locks[BTRFS_MAX_LEVEL];
427
	int reada;
428
	/* keep some upper locks as we walk down */
429
	int lowest_level;
430
431
432
433
434

	/*
	 * set by btrfs_split_item, tells search_slot to keep all locks
	 * and to force calls to keep space in the nodes
	 */
435
436
437
438
	unsigned int search_for_split:1;
	unsigned int keep_locks:1;
	unsigned int skip_locking:1;
	unsigned int leave_spinning:1;
439
	unsigned int search_commit_root:1;
440
};
Chris Mason's avatar
Chris Mason committed
441

442
443
444
445
/*
 * items in the extent btree are used to record the objectid of the
 * owner of the block and the number of references
 */
446

447
struct btrfs_extent_item {
448
449
450
451
452
453
	__le64 refs;
	__le64 generation;
	__le64 flags;
} __attribute__ ((__packed__));

struct btrfs_extent_item_v0 {
454
	__le32 refs;
455
456
} __attribute__ ((__packed__));

457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \
					sizeof(struct btrfs_item))

#define BTRFS_EXTENT_FLAG_DATA		(1ULL << 0)
#define BTRFS_EXTENT_FLAG_TREE_BLOCK	(1ULL << 1)

/* following flags only apply to tree blocks */

/* use full backrefs for extent pointers in the block */
#define BTRFS_BLOCK_FLAG_FULL_BACKREF	(1ULL << 8)

struct btrfs_tree_block_info {
	struct btrfs_disk_key key;
	u8 level;
} __attribute__ ((__packed__));

struct btrfs_extent_data_ref {
	__le64 root;
	__le64 objectid;
	__le64 offset;
	__le32 count;
} __attribute__ ((__packed__));

struct btrfs_shared_data_ref {
	__le32 count;
} __attribute__ ((__packed__));

struct btrfs_extent_inline_ref {
	u8 type;
	u64 offset;
} __attribute__ ((__packed__));

/* old style backrefs item */
struct btrfs_extent_ref_v0 {
491
492
493
	__le64 root;
	__le64 generation;
	__le64 objectid;
494
	__le32 count;
495
496
} __attribute__ ((__packed__));

497

498
499
/* dev extents record free space on individual devices.  The owner
 * field points back to the chunk allocation mapping tree that allocated
500
 * the extent.  The chunk tree uuid field is a way to double check the owner
501
502
 */
struct btrfs_dev_extent {
503
504
505
	__le64 chunk_tree;
	__le64 chunk_objectid;
	__le64 chunk_offset;
506
	__le64 length;
507
	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
508
509
} __attribute__ ((__packed__));

510
struct btrfs_inode_ref {
511
	__le64 index;
512
513
514
515
	__le16 name_len;
	/* name goes here */
} __attribute__ ((__packed__));

516
struct btrfs_timespec {
Chris Mason's avatar
Chris Mason committed
517
	__le64 sec;
Chris Mason's avatar
Chris Mason committed
518
519
520
	__le32 nsec;
} __attribute__ ((__packed__));

Jan Engelhardt's avatar
Jan Engelhardt committed
521
enum btrfs_compression_type {
522
523
524
	BTRFS_COMPRESS_NONE = 0,
	BTRFS_COMPRESS_ZLIB = 1,
	BTRFS_COMPRESS_LAST = 2,
Jan Engelhardt's avatar
Jan Engelhardt committed
525
};
526

Chris Mason's avatar
Chris Mason committed
527
struct btrfs_inode_item {
528
	/* nfs style generation number */
Chris Mason's avatar
Chris Mason committed
529
	__le64 generation;
530
531
	/* transid that last touched this inode */
	__le64 transid;
Chris Mason's avatar
Chris Mason committed
532
	__le64 size;
533
	__le64 nbytes;
534
	__le64 block_group;
Chris Mason's avatar
Chris Mason committed
535
536
537
538
	__le32 nlink;
	__le32 uid;
	__le32 gid;
	__le32 mode;
539
	__le64 rdev;
540
	__le64 flags;
541

542
543
544
545
546
547
548
549
	/* modification sequence number for NFS */
	__le64 sequence;

	/*
	 * a little future expansion, for more than this we can
	 * just grow the inode item and version it
	 */
	__le64 reserved[4];
550
551
552
553
	struct btrfs_timespec atime;
	struct btrfs_timespec ctime;
	struct btrfs_timespec mtime;
	struct btrfs_timespec otime;
Chris Mason's avatar
Chris Mason committed
554
555
} __attribute__ ((__packed__));

556
557
558
559
struct btrfs_dir_log_item {
	__le64 end;
} __attribute__ ((__packed__));

560
struct btrfs_dir_item {
561
	struct btrfs_disk_key location;
562
	__le64 transid;
Josef Bacik's avatar
Josef Bacik committed
563
	__le16 data_len;
564
	__le16 name_len;
565
566
567
568
	u8 type;
} __attribute__ ((__packed__));

struct btrfs_root_item {
569
	struct btrfs_inode_item inode;
570
	__le64 generation;
571
	__le64 root_dirid;
572
573
574
	__le64 bytenr;
	__le64 byte_limit;
	__le64 bytes_used;
Yan Zheng's avatar
Yan Zheng committed
575
	__le64 last_snapshot;
576
	__le64 flags;
577
	__le32 refs;
578
579
	struct btrfs_disk_key drop_progress;
	u8 drop_level;
580
	u8 level;
581
} __attribute__ ((__packed__));
582

583
584
585
586
587
588
589
590
591
/*
 * this is used for both forward and backward root refs
 */
struct btrfs_root_ref {
	__le64 dirid;
	__le64 sequence;
	__le16 name_len;
} __attribute__ ((__packed__));

Yan Zheng's avatar
Yan Zheng committed
592
593
594
#define BTRFS_FILE_EXTENT_INLINE 0
#define BTRFS_FILE_EXTENT_REG 1
#define BTRFS_FILE_EXTENT_PREALLOC 2
595

596
struct btrfs_file_extent_item {
597
598
599
	/*
	 * transaction id that created this extent
	 */
600
	__le64 generation;
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
	/*
	 * max number of bytes to hold this extent in ram
	 * when we split a compressed extent we can't know how big
	 * each of the resulting pieces will be.  So, this is
	 * an upper limit on the size of the extent in ram instead of
	 * an exact limit.
	 */
	__le64 ram_bytes;

	/*
	 * 32 bits for the various ways we might encode the data,
	 * including compression and encryption.  If any of these
	 * are set to something a given disk format doesn't understand
	 * it is treated like an incompat flag for reading and writing,
	 * but not for stat.
	 */
	u8 compression;
	u8 encryption;
	__le16 other_encoding; /* spare for later use */

	/* are we inline data or a real extent? */
622
	u8 type;
623

624
625
626
627
	/*
	 * disk space consumed by the extent, checksum blocks are included
	 * in these numbers
	 */
628
629
	__le64 disk_bytenr;
	__le64 disk_num_bytes;
630
	/*
Chris Mason's avatar
Chris Mason committed
631
	 * the logical offset in file blocks (no csums)
632
633
634
635
636
637
638
	 * this extent record is for.  This allows a file extent to point
	 * into the middle of an existing extent on disk, sharing it
	 * between two snapshots (useful if some bytes in the middle of the
	 * extent have changed
	 */
	__le64 offset;
	/*
639
640
	 * the logical number of file blocks (no csums included).  This
	 * always reflects the size uncompressed and without encoding.
641
	 */
642
	__le64 num_bytes;
643

644
645
} __attribute__ ((__packed__));

Chris Mason's avatar
Chris Mason committed
646
struct btrfs_csum_item {
647
	u8 csum;
Chris Mason's avatar
Chris Mason committed
648
649
} __attribute__ ((__packed__));

650
651
652
653
/* different types of block groups (and chunks) */
#define BTRFS_BLOCK_GROUP_DATA     (1 << 0)
#define BTRFS_BLOCK_GROUP_SYSTEM   (1 << 1)
#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
654
#define BTRFS_BLOCK_GROUP_RAID0    (1 << 3)
655
#define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
656
#define BTRFS_BLOCK_GROUP_DUP	   (1 << 5)
Chris Mason's avatar
Chris Mason committed
657
#define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
Chris Mason's avatar
Chris Mason committed
658

Chris Mason's avatar
Chris Mason committed
659
660
struct btrfs_block_group_item {
	__le64 used;
661
662
	__le64 chunk_objectid;
	__le64 flags;
Chris Mason's avatar
Chris Mason committed
663
664
} __attribute__ ((__packed__));

665
666
struct btrfs_space_info {
	u64 flags;
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687

	u64 total_bytes;	/* total bytes in the space */
	u64 bytes_used;		/* total bytes used on disk */
	u64 bytes_pinned;	/* total bytes pinned, will be freed when the
				   transaction finishes */
	u64 bytes_reserved;	/* total bytes the allocator has reserved for
				   current allocations */
	u64 bytes_readonly;	/* total bytes that are read only */

	/* delalloc accounting */
	u64 bytes_delalloc;	/* number of bytes reserved for allocation,
				   this space is not necessarily reserved yet
				   by the allocator */
	u64 bytes_may_use;	/* number of bytes that may be used for
				   delalloc */

	int full;		/* indicates that we cannot allocate any more
				   chunks for this space */
	int force_alloc;	/* set if we need to force a chunk alloc for
				   this space */

688
	struct list_head list;
689
690
691
692

	/* for block groups in our same type */
	struct list_head block_groups;
	spinlock_t lock;
693
	struct rw_semaphore groups_sem;
694
695
};

696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
/*
 * free clusters are used to claim free space in relatively large chunks,
 * allowing us to do less seeky writes.  They are used for all metadata
 * allocations and data allocations in ssd mode.
 */
struct btrfs_free_cluster {
	spinlock_t lock;
	spinlock_t refill_lock;
	struct rb_root root;

	/* largest extent in this cluster */
	u64 max_size;

	/* first extent starting offset */
	u64 window_start;

	struct btrfs_block_group_cache *block_group;
	/*
	 * when a cluster is allocated from a block group, we put the
	 * cluster onto a list in the block group so that it can
	 * be freed before the block group is freed.
	 */
	struct list_head block_group_list;
719
720
};

Chris Mason's avatar
Chris Mason committed
721
722
723
struct btrfs_block_group_cache {
	struct btrfs_key key;
	struct btrfs_block_group_item item;
724
	spinlock_t lock;
725
	struct mutex cache_mutex;
726
	u64 pinned;
727
	u64 reserved;
728
729
	u64 flags;
	int cached;
730
	int ro;
731
732
733
734
735
	int dirty;

	struct btrfs_space_info *space_info;

	/* free space cache stuff */
736
	spinlock_t tree_lock;
737
738
739
740
741
742
743
744
	struct rb_root free_space_bytes;
	struct rb_root free_space_offset;

	/* block group cache stuff */
	struct rb_node cache_node;

	/* for block groups in the same raid type */
	struct list_head list;
745
746
747

	/* usage count */
	atomic_t count;
748
749
750
751
752

	/* List of struct btrfs_free_clusters for this block group.
	 * Today it will only have one thing on it, but that may change
	 */
	struct list_head cluster_list;
Chris Mason's avatar
Chris Mason committed
753
};
754

755
struct reloc_control;
756
struct btrfs_device;
757
struct btrfs_fs_devices;
758
struct btrfs_fs_info {
759
	u8 fsid[BTRFS_FSID_SIZE];
760
	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
761
762
	struct btrfs_root *extent_root;
	struct btrfs_root *tree_root;
763
764
	struct btrfs_root *chunk_root;
	struct btrfs_root *dev_root;
765
	struct btrfs_root *fs_root;
766
	struct btrfs_root *csum_root;
767
768
769

	/* the log root tree is a directory of all the other log roots */
	struct btrfs_root *log_root_tree;
770
	struct radix_tree_root fs_roots_radix;
771

772
773
774
775
	/* block group cache stuff */
	spinlock_t block_group_cache_lock;
	struct rb_root block_group_cache_tree;

776
	struct extent_io_tree pinned_extents;
777

778
779
780
	/* logical->physical extent mapping */
	struct btrfs_mapping_tree mapping_tree;

781
	u64 generation;
782
	u64 last_trans_committed;
783
784
785
786
787
788

	/*
	 * this is updated to the current trans every time a full commit
	 * is required instead of the faster short fsync log commits
	 */
	u64 last_trans_log_full_commit;
789
	u64 open_ioctl_trans;
790
	unsigned long mount_opt;
791
	u64 max_extent;
792
	u64 max_inline;
793
	u64 alloc_start;
Chris Mason's avatar
Chris Mason committed
794
	struct btrfs_transaction *running_transaction;
795
	wait_queue_head_t transaction_throttle;
796
	wait_queue_head_t transaction_wait;
797
	wait_queue_head_t async_submit_wait;
798

799
	struct btrfs_super_block super_copy;
800
	struct btrfs_super_block super_for_commit;
801
	struct block_device *__bdev;
Chris Mason's avatar
Chris Mason committed
802
	struct super_block *sb;
803
	struct inode *btree_inode;
Chris Mason's avatar
Chris Mason committed
804
	struct backing_dev_info bdi;
Chris Mason's avatar
Chris Mason committed
805
	struct mutex trans_mutex;
806
	struct mutex tree_log_mutex;
807
808
	struct mutex transaction_kthread_mutex;
	struct mutex cleaner_mutex;
809
	struct mutex chunk_mutex;
810
	struct mutex drop_mutex;
811
	struct mutex volume_mutex;
Zheng Yan's avatar
Zheng Yan committed
812
	struct mutex tree_reloc_mutex;
813

814
815
816
817
818
819
820
821
822
	/*
	 * this protects the ordered operations list only while we are
	 * processing all of the entries on it.  This way we make
	 * sure the commit code doesn't find the list temporarily empty
	 * because another function happens to be doing non-waiting preflush
	 * before jumping into the main commit.
	 */
	struct mutex ordered_operations_mutex;

Chris Mason's avatar
Chris Mason committed
823
	struct list_head trans_list;
824
	struct list_head hashers;
825
	struct list_head dead_roots;
826

827
	atomic_t nr_async_submits;
828
	atomic_t async_submit_draining;
829
	atomic_t nr_async_bios;
830
	atomic_t async_delalloc_pages;
831

832
833
834
835
836
	/*
	 * this is used by the balancing code to wait for all the pending
	 * ordered extents
	 */
	spinlock_t ordered_extent_lock;
837
838
839
840
841
842

	/*
	 * all of the data=ordered extents pending writeback
	 * these can span multiple transactions and basically include
	 * every dirty data page that isn't from nodatacow
	 */
843
	struct list_head ordered_extents;
844
845
846
847
848
849

	/*
	 * all of the inodes that have delalloc bytes.  It is possible for
	 * this list to be empty even when there is still dirty data=ordered
	 * extents waiting to finish IO.
	 */
850
	struct list_head delalloc_inodes;
851

852
853
854
855
856
857
858
	/*
	 * special rename and truncate targets that must be on disk before
	 * we're allowed to commit.  This is basically the ext3 style
	 * data=ordered list.
	 */
	struct list_head ordered_operations;

859
860
861
862
863
864
	/*
	 * there is a pool of worker threads for checksumming during writes
	 * and a pool for checksumming after reads.  This is because readers
	 * can run with FS locks held, and the writers may be waiting for
	 * those locks.  We don't want ordering in the pending list to cause
	 * deadlocks, and so the two are serviced separately.
865
866
867
	 *
	 * A third pool does submit_bio to avoid deadlocking with the other
	 * two
868
869
	 */
	struct btrfs_workers workers;
870
	struct btrfs_workers delalloc_workers;
871
	struct btrfs_workers endio_workers;
872
	struct btrfs_workers endio_meta_workers;
873
	struct btrfs_workers endio_meta_write_workers;
874
	struct btrfs_workers endio_write_workers;
875
	struct btrfs_workers submit_workers;
876
877
878
879
880
881
	/*
	 * fixup workers take dirty pages that didn't properly go through
	 * the cow mechanism and make them safe to write.  It happens
	 * for the sys_munmap function call path
	 */
	struct btrfs_workers fixup_workers;
882
883
	struct task_struct *transaction_kthread;
	struct task_struct *cleaner_kthread;
884
	int thread_pool_size;
885

886
887
	struct kobject super_kobj;
	struct completion kobj_unregister;
888
	int do_barriers;
889
	int closing;
890
	int log_root_recovering;
891

892
	u64 total_pinned;
893
894
895
896
897

	/* protected by the delalloc lock, used to keep from writing
	 * metadata until there is a nice batch
	 */
	u64 dirty_metadata_bytes;
898
899
	struct list_head dirty_cowonly_roots;

900
	struct btrfs_fs_devices *fs_devices;
901
902
903
904
905
906

	/*
	 * the space_info list is almost entirely read only.  It only changes
	 * when we add a new raid type to the FS, and that happens
	 * very rarely.  RCU is used to protect it.
	 */
907
	struct list_head space_info;
908

909
910
	struct reloc_control *reloc_ctl;

911
	spinlock_t delalloc_lock;
912
	spinlock_t new_trans_lock;
913
	u64 delalloc_bytes;
914
915
916
917
918
919

	/* data_alloc_cluster is only used in ssd mode */
	struct btrfs_free_cluster data_alloc_cluster;

	/* all metadata allocations go through this cluster */
	struct btrfs_free_cluster meta_alloc_cluster;
920

Yan Zheng's avatar
Yan Zheng committed
921
922
923
	spinlock_t ref_cache_lock;
	u64 total_ref_cache_size;

924
925
926
927
928
929
	u64 avail_data_alloc_bits;
	u64 avail_metadata_alloc_bits;
	u64 avail_system_alloc_bits;
	u64 data_alloc_profile;
	u64 metadata_alloc_profile;
	u64 system_alloc_profile;
930

931
932
933
	unsigned data_chunk_allocations;
	unsigned metadata_ratio;

934
	void *bdev_holder;
935
};
936

937
938
/*
 * in ram representation of the tree.  extent_root is used for all allocations
939
 * and for the extent tree extent_root root.
940
941
 */
struct btrfs_root {
942
	struct extent_buffer *node;
943
944
945
946

	/* the node lock is held while changing the node pointer */
	spinlock_t node_lock;

947
	struct extent_buffer *commit_root;
948
	struct btrfs_root *log_root;
Zheng Yan's avatar
Zheng Yan committed
949
	struct btrfs_root *reloc_root;
Yan Zheng's avatar
Yan Zheng committed
950

951
952
	struct btrfs_root_item root_item;
	struct btrfs_key root_key;
953
	struct btrfs_fs_info *fs_info;
954
955
	struct extent_io_tree dirty_log_pages;

956
957
	struct kobject root_kobj;
	struct completion kobj_unregister;
958
	struct mutex objectid_mutex;
Yan Zheng's avatar
Yan Zheng committed
959

960
	struct mutex log_mutex;
Yan Zheng's avatar
Yan Zheng committed
961
962
963
964
965
966
	wait_queue_head_t log_writer_wait;
	wait_queue_head_t log_commit_wait[2];
	atomic_t log_writers;
	atomic_t log_commit[2];
	unsigned long log_transid;
	unsigned long log_batch;
967

968
969
	u64 objectid;
	u64 last_trans;
970
971
972
973
974
975
976
977
978
979

	/* data allocations are done in sectorsize units */
	u32 sectorsize;

	/* node allocations are done in nodesize units */
	u32 nodesize;

	/* leaf allocations are done in leafsize units */
	u32 leafsize;

980
981
	u32 stripesize;

982
	u32 type;
Chris Mason's avatar
Chris Mason committed
983
984
	u64 highest_inode;
	u64 last_inode_alloc;
985
	int ref_cows;
986
	int track_dirty;
987
	u64 defrag_trans_start;
988
	struct btrfs_key defrag_progress;
989
	struct btrfs_key defrag_max;
990
991
	int defrag_running;
	int defrag_level;
992
	char *name;
993
	int in_sysfs;
994
995
996

	/* the dirty list is only used by non-reference counted roots */
	struct list_head dirty_list;
997

998
999
	struct list_head root_list;

1000
	spinlock_t list_lock;
1001
	struct list_head orphan_list;
1002

1003
1004
1005
1006
	spinlock_t inode_lock;
	/* red-black tree that keeps track of in-memory inodes */
	struct rb_root inode_tree;

1007
1008
1009
1010
1011
	/*
	 * right now this just gets used so that a root has its own devid
	 * for stat.  It may be used for more later
	 */
	struct super_block anon_super;
1012
1013
};

Chris Mason's avatar
Chris Mason committed
1014
1015
1016
1017
1018
/*
 * inode items have the data typically returned from stat and store other
 * info about object characteristics.  There is one for every file and dir in
 * the FS
 */
Chris Mason's avatar
Chris Mason committed
1019
#define BTRFS_INODE_ITEM_KEY		1
1020
1021
1022
#define BTRFS_INODE_REF_KEY		12
#define BTRFS_XATTR_ITEM_KEY		24
#define BTRFS_ORPHAN_ITEM_KEY		48
Chris Mason's avatar
Chris Mason committed
1023
/* reserve 2-15 close to the inode for later flexibility */
Chris Mason's avatar
Chris Mason committed
1024
1025
1026
1027
1028

/*
 * dir items are the name -> inode pointers in a directory.  There is one
 * for every name in a directory.
 */
1029
1030
1031
1032
#define BTRFS_DIR_LOG_ITEM_KEY  60
#define BTRFS_DIR_LOG_INDEX_KEY 72
#define BTRFS_DIR_ITEM_KEY	84
#define BTRFS_DIR_INDEX_KEY	96
Chris Mason's avatar
Chris Mason committed
1033
/*
Chris Mason's avatar
Chris Mason committed
1034
 * extent data is for file data
Chris Mason's avatar
Chris Mason committed
1035
 */
1036
#define BTRFS_EXTENT_DATA_KEY	108
1037

Chris Mason's avatar
Chris Mason committed
1038
/*
1039
1040
 * extent csums are stored in a separate tree and hold csums for
 * an entire extent on disk.
Chris Mason's avatar
Chris Mason committed
1041
 */
1042
#define BTRFS_EXTENT_CSUM_KEY	128
Chris Mason's avatar
Chris Mason committed
1043

Chris Mason's avatar
Chris Mason committed
1044
/*