Commit ccd979bd authored by Mark Fasheh's avatar Mark Fasheh Committed by Joel Becker
Browse files

[PATCH] OCFS2: The Second Oracle Cluster Filesystem



The OCFS2 file system module.
Signed-off-by: default avatarMark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: default avatarKurt Hackel <kurt.hackel@oracle.com>
parent 8df08c89
......@@ -36,6 +36,8 @@ ntfs.txt
- info and mount options for the NTFS filesystem (Windows NT).
proc.txt
- info on Linux's /proc filesystem.
ocfs2.txt
- info and mount options for the OCFS2 clustered filesystem.
romfs.txt
- Description of the ROMFS filesystem.
smbfs.txt
......
OCFS2 filesystem
==================
OCFS2 is a general purpose extent based shared disk cluster file
system with many similarities to ext3. It supports 64 bit inode
numbers, and has automatically extending metadata groups which may
also make it attractive for non-clustered use.
You'll want to install the ocfs2-tools package in order to at least
get "mount.ocfs2" and "ocfs2_hb_ctl".
Project web page: http://oss.oracle.com/projects/ocfs2
Tools web page: http://oss.oracle.com/projects/ocfs2-tools
OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
All code copyright 2005 Oracle except when otherwise noted.
CREDITS:
Lots of code taken from ext3 and other projects.
Authors in alphabetical order:
Joel Becker <joel.becker@oracle.com>
Zach Brown <zach.brown@oracle.com>
Mark Fasheh <mark.fasheh@oracle.com>
Kurt Hackel <kurt.hackel@oracle.com>
Sunil Mushran <sunil.mushran@oracle.com>
Manish Singh <manish.singh@oracle.com>
Caveats
=======
Features which OCFS2 does not support yet:
- sparse files
- extended attributes
- shared writeable mmap
- loopback is supported, but data written will not
be cluster coherent.
- quotas
- cluster aware flock
- Directory change notification (F_NOTIFY)
- Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
- POSIX ACLs
- readpages / writepages (not user visible)
Mount options
=============
OCFS2 supports the following mount options:
(*) == default
barrier=1 This enables/disables barriers. barrier=0 disables it,
barrier=1 enables it.
errors=remount-ro(*) Remount the filesystem read-only on an error.
errors=panic Panic and halt the machine if an error occurs.
intr (*) Allow signals to interrupt cluster operations.
nointr Do not allow signals to interrupt cluster
operations.
......@@ -1905,6 +1905,15 @@ M: ajoshi@shell.unixbox.com
L: linux-nvidia@lists.surfsouth.com
S: Maintained
ORACLE CLUSTER FILESYSTEM 2 (OCFS2)
P: Mark Fasheh
M: mark.fasheh@oracle.com
P: Kurt Hackel
M: kurt.hackel@oracle.com
L: ocfs2-devel@oss.oracle.com
W: http://oss.oracle.com/projects/ocfs2/
S: Supported
OLYMPIC NETWORK DRIVER
P: Peter De Shrijver
M: p2@ace.ulyssis.student.kuleuven.ac.be
......
EXTRA_CFLAGS += -Ifs/ocfs2
EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
obj-$(CONFIG_OCFS2_FS) += ocfs2.o
ocfs2-objs := \
alloc.o \
aops.o \
buffer_head_io.o \
dcache.o \
dir.o \
dlmglue.o \
export.o \
extent_map.o \
file.o \
heartbeat.o \
inode.o \
journal.o \
localalloc.o \
mmap.o \
namei.o \
slot_map.o \
suballoc.o \
super.o \
symlink.o \
sysfile.o \
uptodate.o \
ver.o \
vote.o
obj-$(CONFIG_OCFS2_FS) += cluster/
obj-$(CONFIG_OCFS2_FS) += dlm/
This diff is collapsed.
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* alloc.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_ALLOC_H
#define OCFS2_ALLOC_H
struct ocfs2_alloc_context;
int ocfs2_insert_extent(struct ocfs2_super *osb,
struct ocfs2_journal_handle *handle,
struct inode *inode,
struct buffer_head *fe_bh,
u64 blkno,
u32 new_clusters,
struct ocfs2_alloc_context *meta_ac);
int ocfs2_num_free_extents(struct ocfs2_super *osb,
struct inode *inode,
struct ocfs2_dinode *fe);
/* how many new metadata chunks would an allocation need at maximum? */
static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
{
/*
* Rather than do all the work of determining how much we need
* (involves a ton of reads and locks), just ask for the
* maximal limit. That's a tree depth shift. So, one block for
* level of the tree (current l_tree_depth), one block for the
* new tree_depth==0 extent_block, and one block at the new
* top-of-the tree.
*/
return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
}
int ocfs2_truncate_log_init(struct ocfs2_super *osb);
void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb);
void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
int cancel);
int ocfs2_flush_truncate_log(struct ocfs2_super *osb);
int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
int slot_num,
struct ocfs2_dinode **tl_copy);
int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
struct ocfs2_dinode *tl_copy);
struct ocfs2_truncate_context {
struct inode *tc_ext_alloc_inode;
struct buffer_head *tc_ext_alloc_bh;
int tc_ext_alloc_locked; /* is it cluster locked? */
/* these get destroyed once it's passed to ocfs2_commit_truncate. */
struct buffer_head *tc_last_eb_bh;
};
int ocfs2_prepare_truncate(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *fe_bh,
struct ocfs2_truncate_context **tc);
int ocfs2_commit_truncate(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *fe_bh,
struct ocfs2_truncate_context *tc);
#endif /* OCFS2_ALLOC_H */
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <asm/byteorder.h>
#define MLOG_MASK_PREFIX ML_FILE_IO
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "aops.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "file.h"
#include "inode.h"
#include "journal.h"
#include "super.h"
#include "symlink.h"
#include "buffer_head_io.h"
static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
int err = -EIO;
int status;
struct ocfs2_dinode *fe = NULL;
struct buffer_head *bh = NULL;
struct buffer_head *buffer_cache_bh = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
void *kaddr;
mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
(unsigned long long)iblock, bh_result, create);
BUG_ON(ocfs2_inode_is_fast_symlink(inode));
if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
(unsigned long long)iblock);
goto bail;
}
status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
OCFS2_I(inode)->ip_blkno,
&bh, OCFS2_BH_CACHED, inode);
if (status < 0) {
mlog_errno(status);
goto bail;
}
fe = (struct ocfs2_dinode *) bh->b_data;
if (!OCFS2_IS_VALID_DINODE(fe)) {
mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n",
fe->i_blkno, 7, fe->i_signature);
goto bail;
}
if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
le32_to_cpu(fe->i_clusters))) {
mlog(ML_ERROR, "block offset is outside the allocated size: "
"%llu\n", (unsigned long long)iblock);
goto bail;
}
/* We don't use the page cache to create symlink data, so if
* need be, copy it over from the buffer cache. */
if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
iblock;
buffer_cache_bh = sb_getblk(osb->sb, blkno);
if (!buffer_cache_bh) {
mlog(ML_ERROR, "couldn't getblock for symlink!\n");
goto bail;
}
/* we haven't locked out transactions, so a commit
* could've happened. Since we've got a reference on
* the bh, even if it commits while we're doing the
* copy, the data is still good. */
if (buffer_jbd(buffer_cache_bh)
&& ocfs2_inode_is_new(inode)) {
kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
if (!kaddr) {
mlog(ML_ERROR, "couldn't kmap!\n");
goto bail;
}
memcpy(kaddr + (bh_result->b_size * iblock),
buffer_cache_bh->b_data,
bh_result->b_size);
kunmap_atomic(kaddr, KM_USER0);
set_buffer_uptodate(bh_result);
}
brelse(buffer_cache_bh);
}
map_bh(bh_result, inode->i_sb,
le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
err = 0;
bail:
if (bh)
brelse(bh);
mlog_exit(err);
return err;
}
static int ocfs2_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
int err = 0;
u64 p_blkno, past_eof;
mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
(unsigned long long)iblock, bh_result, create);
if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
inode, inode->i_ino);
if (S_ISLNK(inode->i_mode)) {
/* this always does I/O for some reason. */
err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
goto bail;
}
/* this can happen if another node truncs after our extend! */
spin_lock(&OCFS2_I(inode)->ip_lock);
if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
OCFS2_I(inode)->ip_clusters))
err = -EIO;
spin_unlock(&OCFS2_I(inode)->ip_lock);
if (err)
goto bail;
err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
NULL);
if (err) {
mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
"%"MLFu64", NULL)\n", err, inode,
(unsigned long long)iblock, p_blkno);
goto bail;
}
map_bh(bh_result, inode->i_sb, p_blkno);
if (bh_result->b_blocknr == 0) {
err = -EIO;
mlog(ML_ERROR, "iblock = %llu p_blkno = %"MLFu64" "
"blkno=(%"MLFu64")\n", (unsigned long long)iblock,
p_blkno, OCFS2_I(inode)->ip_blkno);
}
past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
mlog(0, "Inode %lu, past_eof = %"MLFu64"\n", inode->i_ino, past_eof);
if (create && (iblock >= past_eof))
set_buffer_new(bh_result);
bail:
if (err < 0)
err = -EIO;
mlog_exit(err);
return err;
}
static int ocfs2_readpage(struct file *file, struct page *page)
{
struct inode *inode = page->mapping->host;
loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
int ret, unlock = 1;
mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
if (ret != 0) {
if (ret == AOP_TRUNCATED_PAGE)
unlock = 0;
mlog_errno(ret);
goto out;
}
down_read(&OCFS2_I(inode)->ip_alloc_sem);
/*
* i_size might have just been updated as we grabed the meta lock. We
* might now be discovering a truncate that hit on another node.
* block_read_full_page->get_block freaks out if it is asked to read
* beyond the end of a file, so we check here. Callers
* (generic_file_read, fault->nopage) are clever enough to check i_size
* and notice that the page they just read isn't needed.
*
* XXX sys_readahead() seems to get that wrong?
*/
if (start >= i_size_read(inode)) {
char *addr = kmap(page);
memset(addr, 0, PAGE_SIZE);
flush_dcache_page(page);
kunmap(page);
SetPageUptodate(page);
ret = 0;
goto out_alloc;
}
ret = ocfs2_data_lock_with_page(inode, 0, page);
if (ret != 0) {
if (ret == AOP_TRUNCATED_PAGE)
unlock = 0;
mlog_errno(ret);
goto out_alloc;
}
ret = block_read_full_page(page, ocfs2_get_block);
unlock = 0;
ocfs2_data_unlock(inode, 0);
out_alloc:
up_read(&OCFS2_I(inode)->ip_alloc_sem);
ocfs2_meta_unlock(inode, 0);
out:
if (unlock)
unlock_page(page);
mlog_exit(ret);
return ret;
}
/* Note: Because we don't support holes, our allocation has
* already happened (allocation writes zeros to the file data)
* so we don't have to worry about ordered writes in
* ocfs2_writepage.
*
* ->writepage is called during the process of invalidating the page cache
* during blocked lock processing. It can't block on any cluster locks
* to during block mapping. It's relying on the fact that the block
* mapping can't have disappeared under the dirty pages that it is
* being asked to write back.
*/
static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
{
int ret;
mlog_entry("(0x%p)\n", page);
ret = block_write_full_page(page, ocfs2_get_block, wbc);
mlog_exit(ret);
return ret;
}
/*
* ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
* from loopback. It must be able to perform its own locking around
* ocfs2_get_block().
*/
int ocfs2_prepare_write(struct file *file, struct page *page,
unsigned from, unsigned to)
{
struct inode *inode = page->mapping->host;
int ret;
mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
if (ret != 0) {
mlog_errno(ret);
goto out;
}
down_read(&OCFS2_I(inode)->ip_alloc_sem);
ret = block_prepare_write(page, from, to, ocfs2_get_block);
up_read(&OCFS2_I(inode)->ip_alloc_sem);
ocfs2_meta_unlock(inode, 0);
out:
mlog_exit(ret);
return ret;
}
/* Taken from ext3. We don't necessarily need the full blown
* functionality yet, but IMHO it's better to cut and paste the whole
* thing so we can avoid introducing our own bugs (and easily pick up
* their fixes when they happen) --Mark */
static int walk_page_buffers( handle_t *handle,
struct buffer_head *head,
unsigned from,
unsigned to,
int *partial,
int (*fn)( handle_t *handle,
struct buffer_head *bh))
{
struct buffer_head *bh;
unsigned block_start, block_end;
unsigned blocksize = head->b_size;
int err, ret = 0;
struct buffer_head *next;
for ( bh = head, block_start = 0;
ret == 0 && (bh != head || !block_start);
block_start = block_end, bh = next)
{
next = bh->b_this_page;
block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) {
if (partial && !buffer_uptodate(bh))
*partial = 1;
continue;
}
err = (*fn)(handle, bh);
if (!ret)
ret = err;
}
return ret;
}
struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
struct page *page,
unsigned from,
unsigned to)
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_journal_handle *handle = NULL;
int ret = 0;
handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
if (!handle) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
if (ocfs2_should_order_data(inode)) {
ret = walk_page_buffers(handle->k_handle,
page_buffers(page),
from, to, NULL,
ocfs2_journal_dirty_data);
if (ret < 0)
mlog_errno(ret);
}
out:
if (ret) {
if (handle)
ocfs2_commit_trans(handle);
handle = ERR_PTR(ret);
}
return handle;
}
static int ocfs2_commit_write(struct file *file, struct page *page,
unsigned from, unsigned to)
{
int ret, extending = 0, locklevel = 0;
loff_t new_i_size;
struct buffer_head *di_bh = NULL;
struct inode *inode = page->mapping->host;
struct ocfs2_journal_handle *handle = NULL;
mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
/* NOTE: ocfs2_file_aio_write has ensured that it's safe for
* us to sample inode->i_size here without the metadata lock:
*
* 1) We're currently holding the inode alloc lock, so no
* nodes can change it underneath us.
*
* 2) We've had to take the metadata lock at least once
* already to check for extending writes, hence insuring
* that our current copy is also up to date.
*/
new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
if (new_i_size > i_size_read(inode)) {
extending = 1;
locklevel = 1;
}
ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page);
if (ret != 0) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_data_lock_with_page(inode, 1, page);
if (ret != 0) {
mlog_errno(ret);
goto out_unlock_meta;
}
if (extending) {
handle = ocfs2_start_walk_page_trans(inode, page, from, to);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
handle = NULL;
goto out_unlock_data;
}
/* Mark our buffer early. We'd rather catch this error up here
* as opposed to after a successful commit_write which would
* require us to set back inode->i_size. */
ret = ocfs2_journal_access(handle, inode, di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);