file.c 64.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * file.c
 *
 * File open, close, extend, truncate
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

26
#include <linux/capability.h>
27 28 29 30 31 32
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
33
#include <linux/sched.h>
34
#include <linux/splice.h>
35
#include <linux/mount.h>
36
#include <linux/writeback.h>
Mark Fasheh's avatar
Mark Fasheh committed
37
#include <linux/falloc.h>
38
#include <linux/quotaops.h>
39
#include <linux/blkdev.h>
40 41 42 43 44 45 46 47 48 49 50 51 52

#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "aops.h"
#include "dir.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "file.h"
#include "sysfile.h"
#include "inode.h"
Herbert Poetzl's avatar
Herbert Poetzl committed
53
#include "ioctl.h"
54
#include "journal.h"
55
#include "locks.h"
56 57 58
#include "mmap.h"
#include "suballoc.h"
#include "super.h"
59
#include "xattr.h"
Tiger Yang's avatar
Tiger Yang committed
60
#include "acl.h"
61
#include "quota.h"
Tao Ma's avatar
Tao Ma committed
62
#include "refcounttree.h"
63
#include "ocfs2_trace.h"
64 65 66

#include "buffer_head_io.h"

67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
static int ocfs2_init_file_private(struct inode *inode, struct file *file)
{
	struct ocfs2_file_private *fp;

	fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
	if (!fp)
		return -ENOMEM;

	fp->fp_file = file;
	mutex_init(&fp->fp_mutex);
	ocfs2_file_lock_res_init(&fp->fp_flock, fp);
	file->private_data = fp;

	return 0;
}

/*
 * Tear down the per-open private state attached by
 * ocfs2_init_file_private().  Drops the cluster file-lock resource
 * before freeing.  Safe to call when no private state was attached.
 */
static void ocfs2_free_file_private(struct inode *inode, struct file *file)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_file_private *priv = file->private_data;

	if (!priv)
		return;

	/* Release the cluster lock before destroying the resource. */
	ocfs2_simple_drop_lockres(osb, &priv->fp_flock);
	ocfs2_lock_res_free(&priv->fp_flock);
	file->private_data = NULL;
	kfree(priv);
}

96 97 98 99 100 101
/*
 * ->open() for regular files.
 *
 * Refuses the open if another node has already wiped the inode from
 * disk (OCFS2_INODE_DELETED), bumps the local open count under
 * ip_lock, and allocates the per-open private state.
 *
 * Returns 0 on success, -ENOENT if the inode was deleted, or -ENOMEM
 * from the private-state allocation.
 */
static int ocfs2_file_open(struct inode *inode, struct file *file)
{
	int status;
	int mode = file->f_flags;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	trace_ocfs2_file_open(inode, file, file->f_path.dentry,
			      (unsigned long long)OCFS2_I(inode)->ip_blkno,
			      file->f_path.dentry->d_name.len,
			      file->f_path.dentry->d_name.name, mode);

	/* Writers may charge quota later; make sure it is initialized. */
	if (file->f_mode & FMODE_WRITE)
		dquot_initialize(inode);

	spin_lock(&oi->ip_lock);

	/* Check that the inode hasn't been wiped from disk by another
	 * node. If it hasn't then we're safe as long as we hold the
	 * spin lock until our increment of open count. */
	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
		spin_unlock(&oi->ip_lock);

		status = -ENOENT;
		goto leave;
	}

	if (mode & O_DIRECT)
		oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;

	oi->ip_open_count++;
	spin_unlock(&oi->ip_lock);

	status = ocfs2_init_file_private(inode, file);
	if (status) {
		/*
		 * We want to set open count back if we're failing the
		 * open.
		 */
		spin_lock(&oi->ip_lock);
		oi->ip_open_count--;
		spin_unlock(&oi->ip_lock);
	}

leave:
	return status;
}

/*
 * ->release() for regular files: drop the local open count (clearing
 * the O_DIRECT hint when it reaches zero) and free the per-open
 * private state.  Always returns 0.
 */
static int ocfs2_file_release(struct inode *inode, struct file *file)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	spin_lock(&oi->ip_lock);
	if (!--oi->ip_open_count)
		oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;

	/* Traced under ip_lock so the reported open count is coherent. */
	trace_ocfs2_file_release(inode, file, file->f_path.dentry,
				 oi->ip_blkno,
				 file->f_path.dentry->d_name.len,
				 file->f_path.dentry->d_name.name,
				 oi->ip_open_count);
	spin_unlock(&oi->ip_lock);

	ocfs2_free_file_private(inode, file);

	return 0;
}

163 164 165 166 167 168 169 170 171 172 173
/* Directory ->open(): directories share the regular-file private state. */
static int ocfs2_dir_open(struct inode *inode, struct file *file)
{
	int err = ocfs2_init_file_private(inode, file);

	return err;
}

/* Directory ->release(): free the shared private state; never fails. */
static int ocfs2_dir_release(struct inode *inode, struct file *file)
{
	ocfs2_free_file_private(inode, file);

	return 0;
}

174
/*
 * ->fsync() implementation (old int-datasync prototype).
 *
 * For a pure datasync with no dirty datasync state we skip the journal
 * commit and only flush the device cache (when barriers are enabled);
 * otherwise we force a jbd2 commit, which makes all journaled metadata
 * stable.  Any failure is reported to the caller as -EIO.
 */
static int ocfs2_sync_file(struct file *file, int datasync)
{
	int err = 0;
	journal_t *journal;
	struct inode *inode = file->f_mapping->host;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
			      OCFS2_I(inode)->ip_blkno,
			      file->f_path.dentry->d_name.len,
			      file->f_path.dentry->d_name.name,
			      (unsigned long long)datasync);

	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
		/*
		 * We still have to flush drive's caches to get data to the
		 * platter
		 */
		if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
			blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
		goto bail;
	}

	journal = osb->journal->j_journal;
	err = jbd2_journal_force_commit(journal);

bail:
	if (err)
		mlog_errno(err);

	/* Callers only care about success/failure; normalize to -EIO. */
	return (err < 0) ? -EIO : 0;
}

207 208 209 210 211 212 213 214 215 216 217 218 219
int ocfs2_should_update_atime(struct inode *inode,
			      struct vfsmount *vfsmnt)
{
	struct timespec now;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
		return 0;

	if ((inode->i_flags & S_NOATIME) ||
	    ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
		return 0;

220 221 222 223 224 225 226 227 228 229 230
	/*
	 * We can be called with no vfsmnt structure - NFSD will
	 * sometimes do this.
	 *
	 * Note that our action here is different than touch_atime() -
	 * if we can't tell whether this is a noatime mount, then we
	 * don't know whether to trust the value of s_atime_quantum.
	 */
	if (vfsmnt == NULL)
		return 0;

231 232 233 234
	if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
	    ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
		return 0;

235 236 237 238 239 240 241 242
	if (vfsmnt->mnt_flags & MNT_RELATIME) {
		if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
		    (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0))
			return 1;

		return 0;
	}

243 244 245 246 247 248 249 250 251 252 253 254 255
	now = CURRENT_TIME;
	if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
		return 0;
	else
		return 1;
}

/*
 * Journal an atime update for @inode into the dinode buffer @bh.
 *
 * Runs its own small transaction: get journal write access to the
 * inode block, update the in-core and on-disk atime, then dirty the
 * buffer within the handle.  Returns 0 on success or a negative error.
 */
int ocfs2_update_inode_atime(struct inode *inode,
			     struct buffer_head *bh)
{
	int ret;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	handle_t *handle;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/*
	 * Don't use ocfs2_mark_inode_dirty() here as we don't always
	 * have i_mutex to guard against concurrent changes to other
	 * inode fields.
	 */
	inode->i_atime = CURRENT_TIME;
	di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
	di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
	ocfs2_journal_dirty(handle, bh);

out_commit:
	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out:
	return ret;
}

288 289 290 291
/*
 * Set the in-core i_size (and derived block count / timestamps) and
 * push the inode to disk within the caller's transaction @handle.
 * Returns the result of ocfs2_mark_inode_dirty().
 */
static int ocfs2_set_inode_size(handle_t *handle,
				struct inode *inode,
				struct buffer_head *fe_bh,
				u64 new_i_size)
{
	int ret;

	/* Update the in-core inode first... */
	i_size_write(inode, new_i_size);
	inode->i_blocks = ocfs2_inode_sector_count(inode);
	inode->i_ctime = inode->i_mtime = CURRENT_TIME;

	/* ...then journal it out through the caller's handle. */
	ret = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
	if (ret < 0)
		mlog_errno(ret);

	return ret;
}

309 310 311
/*
 * Update i_size in its own small transaction.  Convenience wrapper
 * around ocfs2_set_inode_size() for callers that have no open handle.
 * Returns 0 on success or a negative error code.
 */
int ocfs2_simple_size_update(struct inode *inode,
			     struct buffer_head *di_bh,
			     u64 new_i_size)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	handle_t *handle;
	int ret;

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		return ret;
	}

	ret = ocfs2_set_inode_size(handle, inode, di_bh, new_i_size);
	if (ret < 0)
		mlog_errno(ret);

	ocfs2_commit_trans(osb, handle);

	return ret;
}

334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360
/*
 * CoW the single cluster containing @offset if that cluster is
 * refcounted (shared), so a subsequent partial-cluster zeroing does
 * not scribble on shared data.  Returns 0 when no CoW is needed or on
 * success, otherwise a negative error code.
 */
static int ocfs2_cow_file_pos(struct inode *inode,
			      struct buffer_head *fe_bh,
			      u64 offset)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	u32 cpos = offset >> osb->s_clustersize_bits;
	u32 phys;
	unsigned int num_clusters = 0;
	unsigned int ext_flags = 0;
	int ret;

	/*
	 * If the new offset is aligned to the range of the cluster, there is
	 * no space for ocfs2_zero_range_for_truncate to fill, so no need to
	 * CoW either.
	 */
	if (!(offset & (osb->s_clustersize - 1)))
		return 0;

	ret = ocfs2_get_clusters(inode, cpos, &phys,
				 &num_clusters, &ext_flags);
	if (ret) {
		mlog_errno(ret);
		return ret;
	}

	/* Only shared (refcounted) extents need copying before zeroing. */
	if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
		return 0;

	return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos + 1);
}

367 368 369 370 371 372
/*
 * First phase of a shrinking truncate: CoW the tail cluster if shared,
 * then in one transaction zero the partial tail range and update both
 * the in-core and on-disk i_size/timestamps.
 *
 * NOTE(review): despite the name, this does not yet orphan the inode
 * (see the TODO below), so crash recovery relies on other paths.
 *
 * Returns 0 on success or a negative error code.
 */
static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
				     struct inode *inode,
				     struct buffer_head *fe_bh,
				     u64 new_i_size)
{
	int status;
	handle_t *handle;
	struct ocfs2_dinode *di;
	u64 cluster_bytes;

	/*
	 * We need to CoW the cluster contains the offset if it is reflinked
	 * since we will call ocfs2_zero_range_for_truncate later which will
	 * write "0" from offset to the end of the cluster.
	 */
	status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
	if (status) {
		mlog_errno(status);
		return status;
	}

	/* TODO: This needs to actually orphan the inode in this
	 * transaction. */

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
		goto out;
	}

	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto out_commit;
	}

	/*
	 * Do this before setting i_size.
	 */
	cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
	status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
					       cluster_bytes);
	if (status) {
		mlog_errno(status);
		goto out_commit;
	}

	/* Zeroing succeeded; now shrink the in-core and on-disk sizes. */
	i_size_write(inode, new_i_size);
	inode->i_ctime = inode->i_mtime = CURRENT_TIME;

	di = (struct ocfs2_dinode *) fe_bh->b_data;
	di->i_size = cpu_to_le64(new_i_size);
	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);

	ocfs2_journal_dirty(handle, fe_bh);

out_commit:
	ocfs2_commit_trans(osb, handle);
out:
	return status;
}

/*
 * Truncate @inode down to @new_i_size.
 *
 * Rejects grows (-EINVAL) and no-ops equal sizes.  Under ip_alloc_sem
 * it drops local-alloc reservations, unmaps/truncates pagecache past
 * the new size, then either shrinks inline data in place or performs
 * the full zero + commit-truncate sequence.  Returns 0 on success or a
 * negative error code.
 */
static int ocfs2_truncate_file(struct inode *inode,
			       struct buffer_head *di_bh,
			       u64 new_i_size)
{
	int status = 0;
	struct ocfs2_dinode *fe = NULL;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	/* We trust di_bh because it comes from ocfs2_inode_lock(), which
	 * already validated it */
	fe = (struct ocfs2_dinode *) di_bh->b_data;

	trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
				  (unsigned long long)le64_to_cpu(fe->i_size),
				  (unsigned long long)new_i_size);

	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
			"Inode %llu, inode i_size = %lld != di "
			"i_size = %llu, i_flags = 0x%x\n",
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			i_size_read(inode),
			(unsigned long long)le64_to_cpu(fe->i_size),
			le32_to_cpu(fe->i_flags));

	if (new_i_size > le64_to_cpu(fe->i_size)) {
		trace_ocfs2_truncate_file_error(
			(unsigned long long)le64_to_cpu(fe->i_size),
			(unsigned long long)new_i_size);
		status = -EINVAL;
		mlog_errno(status);
		goto bail;
	}

	/* lets handle the simple truncate cases before doing any more
	 * cluster locking. */
	if (new_i_size == le64_to_cpu(fe->i_size))
		goto bail;

	down_write(&OCFS2_I(inode)->ip_alloc_sem);

	/* Give back any local-alloc window reservation we were holding. */
	ocfs2_resv_discard(&osb->osb_la_resmap,
			   &OCFS2_I(inode)->ip_la_data_resv);

	/*
	 * The inode lock forced other nodes to sync and drop their
	 * pages, which (correctly) happens even if we have a truncate
	 * without allocation change - ocfs2 cluster sizes can be much
	 * greater than page size, so we have to truncate them
	 * anyway.
	 */
	unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
	truncate_inode_pages(inode->i_mapping, new_i_size);

	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		/* Inline data can be shrunk without touching the btree. */
		status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
					       i_size_read(inode), 1);
		if (status)
			mlog_errno(status);

		goto bail_unlock_sem;
	}

	/* alright, we're going to need to do a full blown alloc size
	 * change. Orphan the inode so that recovery can complete the
	 * truncate if necessary. This does the task of marking
	 * i_size. */
	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
	if (status < 0) {
		mlog_errno(status);
		goto bail_unlock_sem;
	}

	status = ocfs2_commit_truncate(osb, inode, di_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail_unlock_sem;
	}

	/* TODO: orphan dir cleanup here. */
bail_unlock_sem:
	up_write(&OCFS2_I(inode)->ip_alloc_sem);

bail:
	/* If the file is now fully sparse, its refcount tree may be empty. */
	if (!status && OCFS2_I(inode)->ip_clusters == 0)
		status = ocfs2_try_remove_refcount_tree(inode, di_bh);

	return status;
}

/*
522
 * extend file allocation only here.
523 524 525 526 527 528 529 530
 * we'll update all the disk stuff, and oip->alloc_size
 *
 * expect stuff to be locked, a transaction started and enough data /
 * metadata reservations in the contexts.
 *
 * Will return -EAGAIN, and a reason if a restart is needed.
 * If passed in, *reason will always be set, even in error.
 */
531 532 533 534 535 536 537 538 539 540
/*
 * Add @clusters_to_add clusters of data allocation to @inode's extent
 * tree starting at logical cluster *@logical_offset.  Thin wrapper
 * that builds the dinode extent tree and delegates to the btree code.
 * See the comment above for locking/transaction expectations; may
 * return -EAGAIN with *@reason_ret set when a restart is required.
 */
int ocfs2_add_inode_data(struct ocfs2_super *osb,
			 struct inode *inode,
			 u32 *logical_offset,
			 u32 clusters_to_add,
			 int mark_unwritten,
			 struct buffer_head *fe_bh,
			 handle_t *handle,
			 struct ocfs2_alloc_context *data_ac,
			 struct ocfs2_alloc_context *meta_ac,
			 enum ocfs2_alloc_restarted *reason_ret)
{
	struct ocfs2_extent_tree et;

	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);

	return ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
					   clusters_to_add, mark_unwritten,
					   data_ac, meta_ac, reason_ret);
}

553 554
/*
 * Extend @inode's allocation by @clusters_to_add clusters starting at
 * logical cluster @logical_start, restarting the transaction (or the
 * whole function, when more metadata must be reserved) as the
 * allocator requests.  Quota is charged up-front for the full request
 * and the unused portion released once the real cluster count is
 * known.  Returns 0 on success or a negative error code.
 *
 * Fix: @why is now initialized to RESTART_NONE — previously it was
 * read uninitialized by trace_ocfs2_extend_allocation() on the first
 * pass through restarted_transaction.
 */
static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
				     u32 clusters_to_add, int mark_unwritten)
{
	int status = 0;
	int restart_func = 0;
	int credits;
	u32 prev_clusters;
	struct buffer_head *bh = NULL;
	struct ocfs2_dinode *fe = NULL;
	handle_t *handle = NULL;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct ocfs2_alloc_context *meta_ac = NULL;
	enum ocfs2_alloc_restarted why = RESTART_NONE;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_extent_tree et;
	int did_quota = 0;

	/*
	 * This function only exists for file systems which don't
	 * support holes.
	 */
	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));

	status = ocfs2_read_inode_block(inode, &bh);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}
	fe = (struct ocfs2_dinode *) bh->b_data;

restart_all:
	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);

	/* Reserve data and metadata allocators for this extend. */
	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
	status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
				       &data_ac, &meta_ac);
	if (status) {
		mlog_errno(status);
		goto leave;
	}

	credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
					    clusters_to_add);
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto leave;
	}

restarted_transaction:
	trace_ocfs2_extend_allocation(
		(unsigned long long)OCFS2_I(inode)->ip_blkno,
		(unsigned long long)i_size_read(inode),
		le32_to_cpu(fe->i_clusters), clusters_to_add,
		why, restart_func);

	/* Charge quota for the whole remaining request up front. */
	status = dquot_alloc_space_nodirty(inode,
			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	if (status)
		goto leave;
	did_quota = 1;

	/* reserve a write to the file entry early on - that we if we
	 * run out of credits in the allocation path, we can still
	 * update i_size. */
	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	prev_clusters = OCFS2_I(inode)->ip_clusters;

	status = ocfs2_add_inode_data(osb,
				      inode,
				      &logical_start,
				      clusters_to_add,
				      mark_unwritten,
				      bh,
				      handle,
				      data_ac,
				      meta_ac,
				      &why);
	if ((status < 0) && (status != -EAGAIN)) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto leave;
	}

	ocfs2_journal_dirty(handle, bh);

	/* Account for what was actually added this pass. */
	spin_lock(&OCFS2_I(inode)->ip_lock);
	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
	spin_unlock(&OCFS2_I(inode)->ip_lock);
	/* Release unused quota reservation */
	dquot_free_space(inode,
			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	did_quota = 0;

	if (why != RESTART_NONE && clusters_to_add) {
		if (why == RESTART_META) {
			/* Need fresh metadata reservations: restart fully. */
			restart_func = 1;
			status = 0;
		} else {
			BUG_ON(why != RESTART_TRANS);

			/* TODO: This can be more intelligent. */
			credits = ocfs2_calc_extend_credits(osb->sb,
							    &fe->id2.i_list,
							    clusters_to_add);
			status = ocfs2_extend_trans(handle, credits);
			if (status < 0) {
				/* handle still has to be committed at
				 * this point. */
				status = -ENOMEM;
				mlog_errno(status);
				goto leave;
			}
			goto restarted_transaction;
		}
	}

	trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
	     le32_to_cpu(fe->i_clusters),
	     (unsigned long long)le64_to_cpu(fe->i_size),
	     OCFS2_I(inode)->ip_clusters,
	     (unsigned long long)i_size_read(inode));

leave:
	if (status < 0 && did_quota)
		dquot_free_space(inode,
			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	if (handle) {
		ocfs2_commit_trans(osb, handle);
		handle = NULL;
	}
	if (data_ac) {
		ocfs2_free_alloc_context(data_ac);
		data_ac = NULL;
	}
	if (meta_ac) {
		ocfs2_free_alloc_context(meta_ac);
		meta_ac = NULL;
	}
	if ((!status) && restart_func) {
		restart_func = 0;
		goto restart_all;
	}
	brelse(bh);
	bh = NULL;

	return status;
}

710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742
/*
 * While a write will already be ordering the data, a truncate will not.
 * Thus, we need to explicitly order the zeroed pages.
 */
/*
 * While a write will already be ordering the data, a truncate will not.
 * Thus, we need to explicitly order the zeroed pages.
 *
 * Returns NULL when no ordering is needed, a valid handle on success,
 * or an ERR_PTR on failure.
 *
 * Fix: on ocfs2_start_trans() failure we now propagate the real error
 * via PTR_ERR(handle) instead of unconditionally reporting -ENOMEM.
 */
static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	handle_t *handle = NULL;
	int ret = 0;

	if (!ocfs2_should_order_data(inode))
		goto out;

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	/* Add the inode to the transaction's ordered-data list. */
	ret = ocfs2_jbd2_file_inode(handle, inode);
	if (ret < 0)
		mlog_errno(ret);

out:
	if (ret) {
		/* Commit a successfully-started handle before failing. */
		if (!IS_ERR(handle))
			ocfs2_commit_trans(osb, handle);
		handle = ERR_PTR(ret);
	}
	return handle;
}

743 744
/* Some parts of this taken from generic_cont_expand, which turned out
 * to be too fragile to do exactly what we need without us having to
745
 * worry about recursive locking in ->write_begin() and ->write_end(). */
746 747
/*
 * Zero the byte range [abs_from, abs_to) — which must lie within one
 * pagecache page — by driving __block_write_begin/block_commit_write
 * over each block, starting an ordered-data transaction lazily on the
 * first block that needs one.  i_size is deliberately not updated.
 * Returns 0 on success or a negative error code.
 */
static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
				 u64 abs_to)
{
	struct address_space *mapping = inode->i_mapping;
	struct page *page;
	unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
	handle_t *handle = NULL;
	int ret = 0;
	unsigned zero_from, zero_to, block_start, block_end;

	BUG_ON(abs_from >= abs_to);
	BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
	/*
	 * NOTE(review): this masks with (i_blkbits - 1), i.e. the shift
	 * count minus one, not the block-size mask ((1 << i_blkbits) - 1).
	 * Looks suspicious — confirm intent before changing, since
	 * strengthening a BUG_ON can introduce new panics.
	 */
	BUG_ON(abs_from & (inode->i_blkbits - 1));

	page = find_or_create_page(mapping, index, GFP_NOFS);
	if (!page) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/* Get the offsets within the page that we want to zero */
	zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
	zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
	if (!zero_to)
		zero_to = PAGE_CACHE_SIZE;

	trace_ocfs2_write_zero_page(
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			(unsigned long long)abs_from,
			(unsigned long long)abs_to,
			index, zero_from, zero_to);

	/* We know that zero_from is block aligned */
	for (block_start = zero_from; block_start < zero_to;
	     block_start = block_end) {
		block_end = block_start + (1 << inode->i_blkbits);

		/*
		 * block_start is block-aligned.  Bump it by one to force
		 * __block_write_begin and block_commit_write to zero the
		 * whole block.
		 */
		ret = __block_write_begin(page, block_start + 1, 0,
					  ocfs2_get_block);
		if (ret < 0) {
			mlog_errno(ret);
			goto out_unlock;
		}

		/* Start the ordered transaction only once we know we write. */
		if (!handle) {
			handle = ocfs2_zero_start_ordered_transaction(inode);
			if (IS_ERR(handle)) {
				ret = PTR_ERR(handle);
				handle = NULL;
				break;
			}
		}

		/* must not update i_size! */
		ret = block_commit_write(page, block_start + 1,
					 block_start + 1);
		if (ret < 0)
			mlog_errno(ret);
		else
			ret = 0;
	}

	if (handle)
		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);

out_unlock:
	unlock_page(page);
	page_cache_release(page);
out:
	return ret;
}

824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839
/*
 * Find the next range to zero.  We do this in terms of bytes because
 * that's what ocfs2_zero_extend() wants, and it is dealing with the
 * pagecache.  We may return multiple extents.
 *
 * zero_start and zero_end are ocfs2_zero_extend()s current idea of what
 * needs to be zeroed.  range_start and range_end return the next zeroing
 * range.  A subsequent call should pass the previous range_end as its
 * zero_start.  If range_end is 0, there's nothing to do.
 *
 * Unwritten extents are skipped over.  Refcounted extents are CoWd.
 */
static int ocfs2_zero_extend_get_range(struct inode *inode,
				       struct buffer_head *di_bh,
				       u64 zero_start, u64 zero_end,
				       u64 *range_start, u64 *range_end)
{
	int rc = 0, needs_cow = 0;
	u32 p_cpos, zero_clusters = 0;
	u32 zero_cpos =
		zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
	u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
	unsigned int num_clusters = 0;
	unsigned int ext_flags = 0;

	/* Phase 1: skip holes and unwritten extents to find the first
	 * allocated, written cluster that actually needs zeroing. */
	while (zero_cpos < last_cpos) {
		rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
					&num_clusters, &ext_flags);
		if (rc) {
			mlog_errno(rc);
			goto out;
		}

		if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
			zero_clusters = num_clusters;
			if (ext_flags & OCFS2_EXT_REFCOUNTED)
				needs_cow = 1;
			break;
		}

		zero_cpos += num_clusters;
	}
	if (!zero_clusters) {
		/* Nothing written in [zero_start, zero_end): signal done. */
		*range_end = 0;
		goto out;
	}

	/* Phase 2: extend the range over adjacent written extents,
	 * stopping at the next hole or unwritten extent. */
	while ((zero_cpos + zero_clusters) < last_cpos) {
		rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
					&p_cpos, &num_clusters,
					&ext_flags);
		if (rc) {
			mlog_errno(rc);
			goto out;
		}

		if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
			break;
		if (ext_flags & OCFS2_EXT_REFCOUNTED)
			needs_cow = 1;
		zero_clusters += num_clusters;
	}
	if ((zero_cpos + zero_clusters) > last_cpos)
		zero_clusters = last_cpos - zero_cpos;

	/* CoW any shared clusters before the caller writes zeroes. */
	if (needs_cow) {
		rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos,
					zero_clusters, UINT_MAX);
		if (rc) {
			mlog_errno(rc);
			goto out;
		}
	}

	*range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
	*range_end = ocfs2_clusters_to_bytes(inode->i_sb,
					     zero_cpos + zero_clusters);

out:
	return rc;
}

/*
 * Zero one range returned from ocfs2_zero_extend_get_range().  The caller
 * has made sure that the entire range needs zeroing.
 */
static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
				   u64 range_end)
{
	int ret = 0;
	u64 pos, next;

	trace_ocfs2_zero_extend_range(
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			(unsigned long long)range_start,
			(unsigned long long)range_end);
	BUG_ON(range_start >= range_end);

	/* Zero the range one pagecache page at a time. */
	for (pos = range_start; pos < range_end; pos = next) {
		next = (pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
		if (next > range_end)
			next = range_end;

		ret = ocfs2_write_zero_page(inode, pos, next);
		if (ret < 0) {
			mlog_errno(ret);
			break;
		}

		/*
		 * Very large extends have the potential to lock up
		 * the cpu for extended periods of time.
		 */
		cond_resched();
	}

	return ret;
}

/*
 * Zero all already-allocated, written space between the current i_size
 * and @zero_to_size, one extent range at a time.  Holes and unwritten
 * extents are skipped by ocfs2_zero_extend_get_range(); shared extents
 * are CoWed there before zeroing.  Returns 0 on success.
 */
int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
		      loff_t zero_to_size)
{
	int ret = 0;
	u64 zero_start, range_start = 0, range_end = 0;
	struct super_block *sb = inode->i_sb;

	/* Start zeroing at the first block boundary past i_size. */
	zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
	trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
				(unsigned long long)zero_start,
				(unsigned long long)i_size_read(inode));
	while (zero_start < zero_to_size) {
		ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
						  zero_to_size,
						  &range_start,
						  &range_end);
		if (ret) {
			mlog_errno(ret);
			break;
		}
		/* range_end == 0 means no more written space to zero. */
		if (!range_end)
			break;
		/* Trim the ends */
		if (range_start < zero_start)
			range_start = zero_start;
		if (range_end > zero_to_size)
			range_end = zero_to_size;

		ret = ocfs2_zero_extend_range(inode, range_start,
					      range_end);
		if (ret) {
			mlog_errno(ret);
			break;
		}
		zero_start = range_end;
	}

	return ret;
}

984 985
/*
 * Extend a non-sparse inode: allocate enough clusters to back
 * @new_i_size, then zero everything up to @zero_to so no stale data is
 * exposed.  The zeroing runs even when no clusters were added.
 * Returns 0 on success or a negative error code.
 */
int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
			  u64 new_i_size, u64 zero_to)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	u32 needed, clusters_to_add = 0;
	int ret;

	/*
	 * Only quota files call this without a bh, and they can't be
	 * refcounted.
	 */
	BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
	BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));

	needed = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
	if (needed > oi->ip_clusters)
		clusters_to_add = needed - oi->ip_clusters;

	if (clusters_to_add) {
		ret = __ocfs2_extend_allocation(inode, oi->ip_clusters,
						clusters_to_add, 0);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	/*
	 * Call this even if we don't add any clusters to the tree. We
	 * still need to zero the area between the old i_size and the
	 * new i_size.
	 */
	ret = ocfs2_zero_extend(inode, di_bh, zero_to);
	if (ret < 0)
		mlog_errno(ret);

out:
	return ret;
}

1026 1027
/*
 * Grow @inode to @new_i_size.  Small inline-data inodes are grown in
 * place; otherwise inline data is converted to extents and either the
 * sparse path (zero the tail) or the no-holes path (allocate + zero)
 * runs under ip_alloc_sem.  Finishes by journaling the new i_size.
 * Returns 0 on success or a negative error code.
 */
static int ocfs2_extend_file(struct inode *inode,
			     struct buffer_head *di_bh,
			     u64 new_i_size)
{
	int ret = 0;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	BUG_ON(!di_bh);

	/* setattr sometimes calls us like this. */
	if (new_i_size == 0)
		goto out;

	if (i_size_read(inode) == new_i_size)
		goto out;
	BUG_ON(new_i_size < i_size_read(inode));

	/*
	 * The alloc sem blocks people in read/write from reading our
	 * allocation until we're done changing it. We depend on
	 * i_mutex to block other extend/truncate calls while we're
	 * here.  We even have to hold it for sparse files because there
	 * might be some tail zeroing.
	 */
	down_write(&oi->ip_alloc_sem);

	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		/*
		 * We can optimize small extends by keeping the inodes
		 * inline data.
		 */
		if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
			up_write(&oi->ip_alloc_sem);
			goto out_update_size;
		}

		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
		if (ret) {
			up_write(&oi->ip_alloc_sem);
			mlog_errno(ret);
			goto out;
		}
	}

	/* Sparse filesystems only need tail zeroing; others must
	 * allocate the full range first. */
	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
		ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
	else
		ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
					    new_i_size);

	up_write(&oi->ip_alloc_sem);

	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

out_update_size:
	ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
	if (ret < 0)
		mlog_errno(ret);

out:
	return ret;
}

int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
{
	int status = 0, size_change;
	struct inode *inode = dentry->d_inode;
	struct super_block *sb = inode->i_sb;
	struct ocfs2_super *osb = OCFS2_SB(sb);
	struct buffer_head *bh = NULL;
1099
	handle_t *handle = NULL;
1100
	struct dquot *transfer_to[MAXQUOTAS] = { };
1101
	int qtype;
1102

1103 1104 1105 1106 1107
	trace_ocfs2_setattr(inode, dentry,
			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
			    dentry->d_name.len, dentry->d_name.name,
			    attr->ia_valid, attr->ia_mode,
			    attr->ia_uid, attr->ia_gid);
1108

1109 1110 1111 1112
	/* ensuring we don't even attempt to truncate a symlink */
	if (S_ISLNK(inode->i_mode))
		attr->ia_valid &= ~ATTR_SIZE;

1113 1114
#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
			   | ATTR_GID | ATTR_UID | ATTR_MODE)
1115
	if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
1116 1117 1118 1119 1120 1121
		return 0;

	status = inode_change_ok(inode, attr);
	if (status)
		return status;

1122 1123
	if (is_quota_modification(inode, attr))
		dquot_initialize(inode);
1124 1125 1126 1127 1128 1129 1130 1131 1132
	size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
	if (size_change) {
		status = ocfs2_rw_lock(inode, 1);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

1133
	status = ocfs2_inode_lock(inode, &bh, 1);
1134 1135 1136 1137 1138 1139 1140
	if (status < 0) {
		if (status != -ENOENT)
			mlog_errno(status);
		goto bail_unlock_rw;
	}

	if (size_change && attr->ia_size != i_size_read(inode)) {
1141 1142
		status = inode_newsize_ok(inode, attr->ia_size);
		if (status)
1143 1144
			goto bail_unlock;

1145 1146
		inode_dio_wait(inode);

Joel Becker's avatar
Joel Becker committed
1147 1148 1149 1150 1151 1152 1153
		if (i_size_read(inode) > attr->ia_size) {
			if (ocfs2_should_order_data(inode)) {
				status = ocfs2_begin_ordered_truncate(inode,
								      attr->ia_size);
				if (status)
					goto bail_unlock;
			}
1154
			status = ocfs2_truncate_file(inode, bh, attr->ia_size);
Joel Becker's avatar
Joel Becker committed
1155
		} else
1156
			status = ocfs2_extend_file(inode, bh, attr->ia_size);
1157 1158 1159 1160 1161 1162 1163 1164
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			status = -ENOSPC;
			goto bail_unlock;
		}
	}

1165 1166
	if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
	    (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {