ctree.h 75.6 KB
Newer Older
Chris Mason's avatar
Chris Mason committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

Chris Mason's avatar
Chris Mason committed
19
20
#ifndef __BTRFS_CTREE__
#define __BTRFS_CTREE__
21

22
#include <linux/version.h>
23
24
#include <linux/mm.h>
#include <linux/highmem.h>
Chris Mason's avatar
Chris Mason committed
25
#include <linux/fs.h>
26
#include <linux/completion.h>
Chris Mason's avatar
Chris Mason committed
27
#include <linux/backing-dev.h>
28
#include <linux/wait.h>
29
#include <asm/kmap_types.h>
30
#include "extent_io.h"
31
#include "extent_map.h"
32
#include "async-thread.h"
Chris Mason's avatar
Chris Mason committed
33

34
/* forward declarations; the full definitions live in transaction.h */
struct btrfs_trans_handle;
struct btrfs_transaction;

/* slab caches shared filesystem-wide, created at module init */
extern struct kmem_cache *btrfs_trans_handle_cachep;
extern struct kmem_cache *btrfs_transaction_cachep;
extern struct kmem_cache *btrfs_bit_radix_cachep;
extern struct kmem_cache *btrfs_path_cachep;
struct btrfs_ordered_sum;
41

42
/* magic stored in the superblock to identify a btrfs filesystem */
#define BTRFS_MAGIC "_BHRfS_M"

/* maximum depth of any btrfs btree, root to leaf inclusive */
#define BTRFS_MAX_LEVEL 8

/* keep read support for the old (v0) extent tree item layout */
#define BTRFS_COMPAT_EXTENT_TREE_V0

/*
 * files bigger than this get some pre-flushing when they are added
 * to the ordered operations list.  That way we limit the total
 * work done by the commit
 */
#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)

55
/* holds pointers to all of the tree roots */
#define BTRFS_ROOT_TREE_OBJECTID 1ULL

/* stores information about which extents are in use, and reference counts */
#define BTRFS_EXTENT_TREE_OBJECTID 2ULL

/*
 * chunk tree stores translations from logical -> physical block numbering
 * the super block points to the chunk tree
 */
#define BTRFS_CHUNK_TREE_OBJECTID 3ULL

/*
 * stores information about which areas of a given device are in use.
 * one per device.  The tree of tree roots points to the device tree
 */
#define BTRFS_DEV_TREE_OBJECTID 4ULL

/* one per subvolume, storing files and directories */
#define BTRFS_FS_TREE_OBJECTID 5ULL

/* directory objectid inside the root tree */
#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL

/* holds checksums of all the data extents */
#define BTRFS_CSUM_TREE_OBJECTID 7ULL

/* orphan objectid for tracking unlinked/truncated files */
#define BTRFS_ORPHAN_OBJECTID -5ULL

/* does write ahead logging to speed up fsyncs */
#define BTRFS_TREE_LOG_OBJECTID -6ULL
#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL

/* for space balancing */
#define BTRFS_TREE_RELOC_OBJECTID -8ULL
#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL

/*
 * extent checksums all have this objectid
 * this allows them to share the logging tree
 * for fsyncs
 */
#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL

/* dummy objectid represents multiple objectids */
#define BTRFS_MULTIPLE_OBJECTIDS -255ULL

/*
 * All files have objectids in this range.
 */
#define BTRFS_FIRST_FREE_OBJECTID 256ULL
#define BTRFS_LAST_FREE_OBJECTID -256ULL
#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL

/*
 * the device items go into the chunk tree.  The key is in the form
 * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
 */
#define BTRFS_DEV_ITEMS_OBJECTID 1ULL

/* objectid used by the in-memory btree inode */
#define BTRFS_BTREE_INODE_OBJECTID 1

/* objectid used for an otherwise empty subvolume's directory */
#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2

Chris Mason's avatar
Chris Mason committed
121
122
123
124
125
126
/*
 * we can actually store much bigger names, but lets not confuse the rest
 * of linux
 */
#define BTRFS_NAME_LEN 255

/* 32 bytes in various csum fields */
#define BTRFS_CSUM_SIZE 32

/* csum types */
#define BTRFS_CSUM_TYPE_CRC32	0

/* on-disk csum size for each csum type, indexed by BTRFS_CSUM_TYPE_* */
static int btrfs_csum_sizes[] = { 4, 0 };

/* four bytes for CRC32 */
#define BTRFS_EMPTY_DIR_SIZE 0

/* file types stored in btrfs_dir_item::type */
#define BTRFS_FT_UNKNOWN	0
#define BTRFS_FT_REG_FILE	1
#define BTRFS_FT_DIR		2
#define BTRFS_FT_CHRDEV		3
#define BTRFS_FT_BLKDEV		4
#define BTRFS_FT_FIFO		5
#define BTRFS_FT_SOCK		6
#define BTRFS_FT_SYMLINK	7
#define BTRFS_FT_XATTR		8
#define BTRFS_FT_MAX		9
Chris Mason's avatar
Chris Mason committed
148

149
/*
 * The key defines the order in the tree, and so it also defines (optimal)
 * block layout.
 *
 * objectid corresponds to the inode number.
 *
 * type tells us things about the object, and is a kind of stream selector.
 * so for a given inode, keys with type of 1 might refer to the inode data,
 * type of 2 may point to file data in the btree and type == 3 may point to
 * extents.
 *
 * offset is the starting byte offset for this key in the stream.
 *
 * btrfs_disk_key is in disk byte order.  struct btrfs_key is always
 * in cpu native order.  Otherwise they are identical and their sizes
 * should be the same (ie both packed)
 */
struct btrfs_disk_key {
	__le64 objectid;
	u8 type;
	__le64 offset;
} __attribute__ ((__packed__));

struct btrfs_key {
	u64 objectid;
	u8 type;
	u64 offset;
} __attribute__ ((__packed__));

178
179
180
181
/* wraps the extent_map tree used for logical -> physical translation */
struct btrfs_mapping_tree {
	struct extent_map_tree map_tree;
};

#define BTRFS_UUID_SIZE 16
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
/*
 * on-disk description of a single member device; these live in the
 * chunk tree (see BTRFS_DEV_ITEMS_OBJECTID above)
 */
struct btrfs_dev_item {
	/* the internal btrfs device id */
	__le64 devid;

	/* size of the device */
	__le64 total_bytes;

	/* bytes used */
	__le64 bytes_used;

	/* optimal io alignment for this device */
	__le32 io_align;

	/* optimal io width for this device */
	__le32 io_width;

	/* minimal io size for this device */
	__le32 sector_size;

	/* type and info about this device */
	__le64 type;

	/* expected generation for this device */
	__le64 generation;

	/*
	 * starting byte of this partition on the device,
	 * to allow for stripe alignment in the future
	 */
	__le64 start_offset;

	/* grouping information for allocation decisions */
	__le32 dev_group;

	/* seek speed 0-100 where 100 is fastest */
	u8 seek_speed;

	/* bandwidth 0-100 where 100 is fastest */
	u8 bandwidth;

	/* btrfs generated uuid for this device */
	u8 uuid[BTRFS_UUID_SIZE];

	/* uuid of FS who owns this device */
	u8 fsid[BTRFS_UUID_SIZE];
} __attribute__ ((__packed__));

/* one stripe of a chunk: which device and where on it */
struct btrfs_stripe {
	__le64 devid;
	__le64 offset;
	u8 dev_uuid[BTRFS_UUID_SIZE];
} __attribute__ ((__packed__));

/*
 * on-disk map from a logical byte range to one or more device stripes;
 * the stripe array is variable length, one embedded entry plus
 * (num_stripes - 1) more following it
 */
struct btrfs_chunk {
	/* size of this chunk in bytes */
	__le64 length;

	/* objectid of the root referencing this chunk */
	__le64 owner;

	__le64 stripe_len;
	__le64 type;

	/* optimal io alignment for this chunk */
	__le32 io_align;

	/* optimal io width for this chunk */
	__le32 io_width;

	/* minimal io size for this chunk */
	__le32 sector_size;

	/* 2^16 stripes is quite a lot, a second limit is the size of a single
	 * item in the btree
	 */
	__le16 num_stripes;

	/* sub stripes only matter for raid10 */
	__le16 sub_stripes;

	struct btrfs_stripe stripe;
	/* additional stripes go here */
} __attribute__ ((__packed__));

/*
 * on-disk size of a chunk item holding num_stripes stripes; one stripe
 * is already embedded in struct btrfs_chunk, hence num_stripes - 1
 */
static inline unsigned long btrfs_chunk_item_size(int num_stripes)
{
	BUG_ON(num_stripes == 0);
	return sizeof(struct btrfs_chunk) +
		sizeof(struct btrfs_stripe) * (num_stripes - 1);
}

273
#define BTRFS_FSID_SIZE 16

/* flags stored in btrfs_header::flags / btrfs_super_block::flags */
#define BTRFS_HEADER_FLAG_WRITTEN	(1ULL << 0)
#define BTRFS_HEADER_FLAG_RELOC		(1ULL << 1)
#define BTRFS_SUPER_FLAG_SEEDING	(1ULL << 32)
#define BTRFS_SUPER_FLAG_METADUMP	(1ULL << 33)

/* backref revision is packed into the top byte of the header flags */
#define BTRFS_BACKREF_REV_MAX		256
#define BTRFS_BACKREF_REV_SHIFT		56
#define BTRFS_BACKREF_REV_MASK		(((u64)BTRFS_BACKREF_REV_MAX - 1) << \
					 BTRFS_BACKREF_REV_SHIFT)

#define BTRFS_OLD_BACKREF_REV		0
#define BTRFS_MIXED_BACKREF_REV		1

/*
 * every tree block (leaf or node) starts with this header.
 */
struct btrfs_header {
	/* these first four must match the super block */
	u8 csum[BTRFS_CSUM_SIZE];
	u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
	__le64 bytenr; /* which block this node is supposed to live in */
	__le64 flags;

	/* allowed to be different from the super from here on down */
	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
	__le64 generation;
	__le64 owner;
	__le32 nritems;
	u8 level;
} __attribute__ ((__packed__));

/* how many key pointers fit in one node block */
#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
				      sizeof(struct btrfs_header)) / \
				     sizeof(struct btrfs_key_ptr))
/* usable data bytes in a leaf of block size bs */
#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize))
/* largest inline file extent that fits in a single leaf item */
#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
					sizeof(struct btrfs_item) - \
					sizeof(struct btrfs_file_extent_item))

/*
 * this is a very generous portion of the super block, giving us
 * room to translate 14 chunks with 3 stripes each.
 */
#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
#define BTRFS_LABEL_SIZE 256
321

322
323
324
325
/*
 * the super block basically lists the main trees of the FS
 * it currently lacks any block count etc etc
 */
struct btrfs_super_block {
	u8 csum[BTRFS_CSUM_SIZE];
	/* the first 4 fields must match struct btrfs_header */
	u8 fsid[BTRFS_FSID_SIZE];    /* FS specific uuid */
	__le64 bytenr; /* this block number */
	__le64 flags;

	/* allowed to be different from the btrfs_header from here on down */
	__le64 magic;
	__le64 generation;
	__le64 root;
	__le64 chunk_root;
	__le64 log_root;

	/* this will help find the new super based on the log root */
	__le64 log_root_transid;
	__le64 total_bytes;
	__le64 bytes_used;
	__le64 root_dir_objectid;
	__le64 num_devices;
	__le32 sectorsize;
	__le32 nodesize;
	__le32 leafsize;
	__le32 stripesize;
	__le32 sys_chunk_array_size;
	__le64 chunk_root_generation;
	__le64 compat_flags;
	__le64 compat_ro_flags;
	__le64 incompat_flags;
	__le16 csum_type;
	u8 root_level;
	u8 chunk_root_level;
	u8 log_root_level;
	struct btrfs_dev_item dev_item;

	char label[BTRFS_LABEL_SIZE];

	/* future expansion */
	__le64 reserved[32];
	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
} __attribute__ ((__packed__));

/*
 * Compat flags that we support.  If any incompat flags are set other than the
 * ones specified below then we will fail to mount
 */
#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)

#define BTRFS_FEATURE_COMPAT_SUPP		0ULL
#define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
#define BTRFS_FEATURE_INCOMPAT_SUPP		\
	BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF
378

379
/*
 * A leaf is full of items. offset and size tell us where to find
 * the item in the leaf (relative to the start of the data area)
 */
struct btrfs_item {
	struct btrfs_disk_key key;
	__le32 offset;
	__le32 size;
} __attribute__ ((__packed__));

/*
 * leaves have an item area and a data area:
 * [item0, item1....itemN] [free space] [dataN...data1, data0]
 *
 * The data is separate from the items to get the keys closer together
 * during searches.
 */
struct btrfs_leaf {
	struct btrfs_header header;
	struct btrfs_item items[];
} __attribute__ ((__packed__));

/*
 * all non-leaf blocks are nodes, they hold only keys and pointers to
 * other blocks
 */
struct btrfs_key_ptr {
	struct btrfs_disk_key key;
	__le64 blockptr;
	__le64 generation;
} __attribute__ ((__packed__));

struct btrfs_node {
	struct btrfs_header header;
	struct btrfs_key_ptr ptrs[];
} __attribute__ ((__packed__));

416
/*
 * btrfs_paths remember the path taken from the root down to the leaf.
 * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
 * to any other levels that are present.
 *
 * The slots array records the index of the item or block pointer
 * used while walking the tree.
 */
struct btrfs_path {
	struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
	int slots[BTRFS_MAX_LEVEL];
	/* if there is real range locking, this locks field will change */
	int locks[BTRFS_MAX_LEVEL];
	int reada;
	/* keep some upper locks as we walk down */
	int lowest_level;

	/*
	 * set by btrfs_split_item, tells search_slot to keep all locks
	 * and to force calls to keep space in the nodes
	 */
	unsigned int search_for_split:1;
	unsigned int keep_locks:1;
	unsigned int skip_locking:1;
	unsigned int leave_spinning:1;
	unsigned int search_commit_root:1;
};
Chris Mason's avatar
Chris Mason committed
443

444
445
446
447
/*
 * items in the extent btree are used to record the objectid of the
 * owner of the block and the number of references
 */
struct btrfs_extent_item {
	__le64 refs;
	__le64 generation;
	__le64 flags;
} __attribute__ ((__packed__));

/* pre-mixed-backref (v0) layout, kept for compat reads */
struct btrfs_extent_item_v0 {
	__le32 refs;
} __attribute__ ((__packed__));

/* cap on a single extent item so one leaf can hold several */
#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \
					sizeof(struct btrfs_item))

#define BTRFS_EXTENT_FLAG_DATA		(1ULL << 0)
#define BTRFS_EXTENT_FLAG_TREE_BLOCK	(1ULL << 1)

/* following flags only apply to tree blocks */

/* use full backrefs for extent pointers in the block */
#define BTRFS_BLOCK_FLAG_FULL_BACKREF	(1ULL << 8)

struct btrfs_tree_block_info {
	struct btrfs_disk_key key;
	u8 level;
} __attribute__ ((__packed__));

struct btrfs_extent_data_ref {
	__le64 root;
	__le64 objectid;
	__le64 offset;
	__le32 count;
} __attribute__ ((__packed__));

struct btrfs_shared_data_ref {
	__le32 count;
} __attribute__ ((__packed__));

struct btrfs_extent_inline_ref {
	u8 type;
	__le64 offset;
} __attribute__ ((__packed__));

/* old style backrefs item */
struct btrfs_extent_ref_v0 {
	__le64 root;
	__le64 generation;
	__le64 objectid;
	__le32 count;
} __attribute__ ((__packed__));

499

500
501
/* dev extents record free space on individual devices.  The owner
 * field points back to the chunk allocation mapping tree that allocated
 * the extent.  The chunk tree uuid field is a way to double check the owner
 */
struct btrfs_dev_extent {
	__le64 chunk_tree;
	__le64 chunk_objectid;
	__le64 chunk_offset;
	__le64 length;
	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
} __attribute__ ((__packed__));

/* back reference from an inode to the directory entry naming it */
struct btrfs_inode_ref {
	__le64 index;
	__le16 name_len;
	/* name goes here */
} __attribute__ ((__packed__));

/* on-disk timestamp, seconds + nanoseconds */
struct btrfs_timespec {
	__le64 sec;
	__le32 nsec;
} __attribute__ ((__packed__));

enum btrfs_compression_type {
	BTRFS_COMPRESS_NONE = 0,
	BTRFS_COMPRESS_ZLIB = 1,
	BTRFS_COMPRESS_LAST = 2,
};
528

Chris Mason's avatar
Chris Mason committed
529
struct btrfs_inode_item {
	/* nfs style generation number */
	__le64 generation;
	/* transid that last touched this inode */
	__le64 transid;
	__le64 size;
	__le64 nbytes;
	__le64 block_group;
	__le32 nlink;
	__le32 uid;
	__le32 gid;
	__le32 mode;
	__le64 rdev;
	__le64 flags;

	/* modification sequence number for NFS */
	__le64 sequence;

	/*
	 * a little future expansion, for more than this we can
	 * just grow the inode item and version it
	 */
	__le64 reserved[4];
	struct btrfs_timespec atime;
	struct btrfs_timespec ctime;
	struct btrfs_timespec mtime;
	struct btrfs_timespec otime;
} __attribute__ ((__packed__));

/* marks how far a directory has been logged; end is a dir index */
struct btrfs_dir_log_item {
	__le64 end;
} __attribute__ ((__packed__));

/* directory entry; name (and xattr data) follow the fixed fields */
struct btrfs_dir_item {
	struct btrfs_disk_key location;
	__le64 transid;
	__le16 data_len;
	__le16 name_len;
	u8 type;
} __attribute__ ((__packed__));

/* on-disk description of a tree root, stored in the root tree */
struct btrfs_root_item {
	struct btrfs_inode_item inode;
	__le64 generation;
	__le64 root_dirid;
	__le64 bytenr;
	__le64 byte_limit;
	__le64 bytes_used;
	__le64 last_snapshot;
	__le64 flags;
	__le32 refs;
	struct btrfs_disk_key drop_progress;
	u8 drop_level;
	u8 level;
} __attribute__ ((__packed__));

/*
 * this is used for both forward and backward root refs
 */
struct btrfs_root_ref {
	__le64 dirid;
	__le64 sequence;
	__le16 name_len;
} __attribute__ ((__packed__));

/* values of btrfs_file_extent_item::type */
#define BTRFS_FILE_EXTENT_INLINE 0
#define BTRFS_FILE_EXTENT_REG 1
#define BTRFS_FILE_EXTENT_PREALLOC 2
597

598
struct btrfs_file_extent_item {
	/*
	 * transaction id that created this extent
	 */
	__le64 generation;
	/*
	 * max number of bytes to hold this extent in ram
	 * when we split a compressed extent we can't know how big
	 * each of the resulting pieces will be.  So, this is
	 * an upper limit on the size of the extent in ram instead of
	 * an exact limit.
	 */
	__le64 ram_bytes;

	/*
	 * 32 bits for the various ways we might encode the data,
	 * including compression and encryption.  If any of these
	 * are set to something a given disk format doesn't understand
	 * it is treated like an incompat flag for reading and writing,
	 * but not for stat.
	 */
	u8 compression;
	u8 encryption;
	__le16 other_encoding; /* spare for later use */

	/* are we inline data or a real extent? */
	u8 type;

	/*
	 * disk space consumed by the extent, checksum blocks are included
	 * in these numbers
	 */
	__le64 disk_bytenr;
	__le64 disk_num_bytes;
	/*
	 * the logical offset in file blocks (no csums)
	 * this extent record is for.  This allows a file extent to point
	 * into the middle of an existing extent on disk, sharing it
	 * between two snapshots (useful if some bytes in the middle of the
	 * extent have changed)
	 */
	__le64 offset;
	/*
	 * the logical number of file blocks (no csums included).  This
	 * always reflects the size uncompressed and without encoding.
	 */
	__le64 num_bytes;
} __attribute__ ((__packed__));

/* variable-length run of csum bytes; csum marks the first byte */
struct btrfs_csum_item {
	u8 csum;
} __attribute__ ((__packed__));

/* different types of block groups (and chunks) */
#define BTRFS_BLOCK_GROUP_DATA     (1 << 0)
#define BTRFS_BLOCK_GROUP_SYSTEM   (1 << 1)
#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
#define BTRFS_BLOCK_GROUP_RAID0    (1 << 3)
#define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
#define BTRFS_BLOCK_GROUP_DUP	   (1 << 5)
#define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)

struct btrfs_block_group_item {
	__le64 used;
	__le64 chunk_objectid;
	__le64 flags;
} __attribute__ ((__packed__));

667
668
/* in-memory accounting for one allocation type (data/metadata/system) */
struct btrfs_space_info {
	u64 flags;

	u64 total_bytes;	/* total bytes in the space */
	u64 bytes_used;		/* total bytes used on disk */
	u64 bytes_pinned;	/* total bytes pinned, will be freed when the
				   transaction finishes */
	u64 bytes_reserved;	/* total bytes the allocator has reserved for
				   current allocations */
	u64 bytes_readonly;	/* total bytes that are read only */
	u64 bytes_super;	/* total bytes reserved for the super blocks */
	u64 bytes_root;		/* the number of bytes needed to commit a
				   transaction */
	u64 bytes_may_use;	/* number of bytes that may be used for
				   delalloc/allocations */
	u64 bytes_delalloc;	/* number of bytes currently reserved for
				   delayed allocation */

	int full;		/* indicates that we cannot allocate any more
				   chunks for this space */
	int force_alloc;	/* set if we need to force a chunk alloc for
				   this space */
	int force_delalloc;	/* make people start doing filemap_flush until
				   we're under a threshold */

	struct list_head list;

	/* for block groups in our same type */
	struct list_head block_groups;
	spinlock_t lock;
	struct rw_semaphore groups_sem;
	atomic_t caching_threads;

	int allocating_chunk;
	wait_queue_head_t wait;
};

704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
/*
 * free clusters are used to claim free space in relatively large chunks,
 * allowing us to do less seeky writes.  They are used for all metadata
 * allocations and data allocations in ssd mode.
 */
struct btrfs_free_cluster {
	spinlock_t lock;
	spinlock_t refill_lock;
	struct rb_root root;

	/* largest extent in this cluster */
	u64 max_size;

	/* first extent starting offset */
	u64 window_start;

	/* if this cluster simply points at a bitmap in the block group */
	bool points_to_bitmap;

	struct btrfs_block_group_cache *block_group;
	/*
	 * when a cluster is allocated from a block group, we put the
	 * cluster onto a list in the block group so that it can
	 * be freed before the block group is freed.
	 */
	struct list_head block_group_list;
};

/* state of a block group's free-space caching (see caching_ctl below) */
enum btrfs_caching_type {
	BTRFS_CACHE_NO		= 0,
	BTRFS_CACHE_STARTED	= 1,
	BTRFS_CACHE_FINISHED	= 2,
};

/* tracks an in-progress free-space caching pass over one block group */
struct btrfs_caching_control {
	struct list_head list;
	struct mutex mutex;
	wait_queue_head_t wait;
	struct btrfs_block_group_cache *block_group;
	u64 progress;
	atomic_t count;
};

Chris Mason's avatar
Chris Mason committed
747
748
749
/* in-memory state for one on-disk block group */
struct btrfs_block_group_cache {
	struct btrfs_key key;
	struct btrfs_block_group_item item;
	struct btrfs_fs_info *fs_info;
	spinlock_t lock;
	u64 pinned;
	u64 reserved;
	u64 bytes_super;
	u64 flags;
	u64 sectorsize;
	int extents_thresh;
	int free_extents;
	int total_bitmaps;
	int ro;
	int dirty;

	/* cache tracking stuff */
	int cached;
	struct btrfs_caching_control *caching_ctl;
	u64 last_byte_to_unpin;

	struct btrfs_space_info *space_info;

	/* free space cache stuff */
	spinlock_t tree_lock;
	struct rb_root free_space_offset;
	u64 free_space;

	/* block group cache stuff */
	struct rb_node cache_node;

	/* for block groups in the same raid type */
	struct list_head list;

	/* usage count */
	atomic_t count;

	/* List of struct btrfs_free_clusters for this block group.
	 * Today it will only have one thing on it, but that may change
	 */
	struct list_head cluster_list;
};
789

790
struct reloc_control;
struct btrfs_device;
struct btrfs_fs_devices;
/* per-filesystem in-memory state, shared by all trees of one mount */
struct btrfs_fs_info {
	u8 fsid[BTRFS_FSID_SIZE];
	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
	struct btrfs_root *extent_root;
	struct btrfs_root *tree_root;
	struct btrfs_root *chunk_root;
	struct btrfs_root *dev_root;
	struct btrfs_root *fs_root;
	struct btrfs_root *csum_root;

	/* the log root tree is a directory of all the other log roots */
	struct btrfs_root *log_root_tree;

	spinlock_t fs_roots_radix_lock;
	struct radix_tree_root fs_roots_radix;

	/* block group cache stuff */
	spinlock_t block_group_cache_lock;
	struct rb_root block_group_cache_tree;

	struct extent_io_tree freed_extents[2];
	struct extent_io_tree *pinned_extents;

	/* logical->physical extent mapping */
	struct btrfs_mapping_tree mapping_tree;

	u64 generation;
	u64 last_trans_committed;

	/*
	 * this is updated to the current trans every time a full commit
	 * is required instead of the faster short fsync log commits
	 */
	u64 last_trans_log_full_commit;
	u64 open_ioctl_trans;
	unsigned long mount_opt;
	u64 max_extent;
	u64 max_inline;
	u64 alloc_start;
	struct btrfs_transaction *running_transaction;
	wait_queue_head_t transaction_throttle;
	wait_queue_head_t transaction_wait;
	wait_queue_head_t async_submit_wait;

	struct btrfs_super_block super_copy;
	struct btrfs_super_block super_for_commit;
	struct block_device *__bdev;
	struct super_block *sb;
	struct inode *btree_inode;
	struct backing_dev_info bdi;
	struct mutex trans_mutex;
	struct mutex tree_log_mutex;
	struct mutex transaction_kthread_mutex;
	struct mutex cleaner_mutex;
	struct mutex chunk_mutex;
	struct mutex volume_mutex;
	/*
	 * this protects the ordered operations list only while we are
	 * processing all of the entries on it.  This way we make
	 * sure the commit code doesn't find the list temporarily empty
	 * because another function happens to be doing non-waiting preflush
	 * before jumping into the main commit.
	 */
	struct mutex ordered_operations_mutex;
	struct rw_semaphore extent_commit_sem;

	struct rw_semaphore subvol_sem;

	struct srcu_struct subvol_srcu;

	struct list_head trans_list;
	struct list_head hashers;
	struct list_head dead_roots;
	struct list_head caching_block_groups;

	atomic_t nr_async_submits;
	atomic_t async_submit_draining;
	atomic_t nr_async_bios;
	atomic_t async_delalloc_pages;

	/*
	 * this is used by the balancing code to wait for all the pending
	 * ordered extents
	 */
	spinlock_t ordered_extent_lock;

	/*
	 * all of the data=ordered extents pending writeback
	 * these can span multiple transactions and basically include
	 * every dirty data page that isn't from nodatacow
	 */
	struct list_head ordered_extents;

	/*
	 * all of the inodes that have delalloc bytes.  It is possible for
	 * this list to be empty even when there is still dirty data=ordered
	 * extents waiting to finish IO.
	 */
	struct list_head delalloc_inodes;

	/*
	 * special rename and truncate targets that must be on disk before
	 * we're allowed to commit.  This is basically the ext3 style
	 * data=ordered list.
	 */
	struct list_head ordered_operations;

	/*
	 * there is a pool of worker threads for checksumming during writes
	 * and a pool for checksumming after reads.  This is because readers
	 * can run with FS locks held, and the writers may be waiting for
	 * those locks.  We don't want ordering in the pending list to cause
	 * deadlocks, and so the two are serviced separately.
	 *
	 * A third pool does submit_bio to avoid deadlocking with the other
	 * two
	 */
	struct btrfs_workers generic_worker;
	struct btrfs_workers workers;
	struct btrfs_workers delalloc_workers;
	struct btrfs_workers endio_workers;
	struct btrfs_workers endio_meta_workers;
	struct btrfs_workers endio_meta_write_workers;
	struct btrfs_workers endio_write_workers;
	struct btrfs_workers submit_workers;
	/*
	 * fixup workers take dirty pages that didn't properly go through
	 * the cow mechanism and make them safe to write.  It happens
	 * for the sys_munmap function call path
	 */
	struct btrfs_workers fixup_workers;
	struct task_struct *transaction_kthread;
	struct task_struct *cleaner_kthread;
	int thread_pool_size;

	struct kobject super_kobj;
	struct completion kobj_unregister;
	int do_barriers;
	int closing;
	int log_root_recovering;

	u64 total_pinned;

	/* protected by the delalloc lock, used to keep from writing
	 * metadata until there is a nice batch
	 */
	u64 dirty_metadata_bytes;
	struct list_head dirty_cowonly_roots;

	struct btrfs_fs_devices *fs_devices;

	/*
	 * the space_info list is almost entirely read only.  It only changes
	 * when we add a new raid type to the FS, and that happens
	 * very rarely.  RCU is used to protect it.
	 */
	struct list_head space_info;

	struct reloc_control *reloc_ctl;

	spinlock_t delalloc_lock;
	spinlock_t new_trans_lock;
	u64 delalloc_bytes;

	/* data_alloc_cluster is only used in ssd mode */
	struct btrfs_free_cluster data_alloc_cluster;

	/* all metadata allocations go through this cluster */
	struct btrfs_free_cluster meta_alloc_cluster;

	spinlock_t ref_cache_lock;
	u64 total_ref_cache_size;

	u64 avail_data_alloc_bits;
	u64 avail_metadata_alloc_bits;
	u64 avail_system_alloc_bits;
	u64 data_alloc_profile;
	u64 metadata_alloc_profile;
	u64 system_alloc_profile;

	unsigned data_chunk_allocations;
	unsigned metadata_ratio;

	void *bdev_holder;
};
978

979
980
/*
 * in ram representation of the tree.  extent_root is used for all allocations
 * and for the extent tree extent_root root.
 */
struct btrfs_root {
	struct extent_buffer *node;

	/* the node lock is held while changing the node pointer */
	spinlock_t node_lock;

	struct extent_buffer *commit_root;
	struct btrfs_root *log_root;
	struct btrfs_root *reloc_root;

	struct btrfs_root_item root_item;
	struct btrfs_key root_key;
	struct btrfs_fs_info *fs_info;
	struct extent_io_tree dirty_log_pages;

	struct kobject root_kobj;
	struct completion kobj_unregister;
	struct mutex objectid_mutex;

	struct mutex log_mutex;
	wait_queue_head_t log_writer_wait;
	wait_queue_head_t log_commit_wait[2];
	atomic_t log_writers;
	atomic_t log_commit[2];
	unsigned long log_transid;
	unsigned long log_batch;

	u64 objectid;
	u64 last_trans;

	/* data allocations are done in sectorsize units */
	u32 sectorsize;

	/* node allocations are done in nodesize units */
	u32 nodesize;

	/* leaf allocations are done in leafsize units */
	u32 leafsize;

	u32 stripesize;

	u32 type;

	u64 highest_objectid;
	int ref_cows;
	int track_dirty;
	int in_radix;

	u64 defrag_trans_start;
	struct btrfs_key defrag_progress;
	struct btrfs_key defrag_max;
	int defrag_running;
	int defrag_level;
	char *name;
	int in_sysfs;

	/* the dirty list is only used by non-reference counted roots */
	struct list_head dirty_list;

	struct list_head root_list;

	spinlock_t list_lock;
	struct list_head orphan_list;

	spinlock_t inode_lock;
	/* red-black tree that keeps track of in-memory inodes */
	struct rb_root inode_tree;

	/*
	 * right now this just gets used so that a root has its own devid
	 * for stat.  It may be used for more later
	 */
	struct super_block anon_super;
};

/*
 * inode items have the data typically returned from stat and store other
 * info about object characteristics.  There is one for every file and dir in
 * the FS
 */
#define BTRFS_INODE_ITEM_KEY		1
#define BTRFS_INODE_REF_KEY		12
#define BTRFS_XATTR_ITEM_KEY		24
#define BTRFS_ORPHAN_ITEM_KEY		48
/* reserve 2-15 close to the inode for later flexibility */
Chris Mason's avatar
Chris Mason committed
1067
/* reserve 2-15 close to the inode for later flexibility */
Chris Mason's avatar
Chris Mason committed
1068
1069
1070
1071
1072

/*
 * dir items are the name -> inode pointers in a directory.  There is one