Commit 00e3f5cc authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull Ceph updates from Sage Weil:
 "The two main changes are aio support in CephFS, and a series that
  fixes several issues in the authentication key timeout/renewal code.

  On top of that are a variety of cleanups and minor bug fixes"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client:
  libceph: remove outdated comment
  libceph: kill off ceph_x_ticket_handler::validity
  libceph: invalidate AUTH in addition to a service ticket
  libceph: fix authorizer invalidation, take 2
  libceph: clear messenger auth_retry flag if we fault
  libceph: fix ceph_msg_revoke()
  libceph: use list_for_each_entry_safe
  ceph: use i_size_{read,write} to get/set i_size
  ceph: re-send AIO write request when getting -EOLDSNAP error
  ceph: Asynchronous IO support
  ceph: Avoid to propagate the invalid page point
  ceph: fix double page_unlock() in page_mkwrite()
  rbd: delete an unnecessary check before rbd_dev_destroy()
  libceph: use list_next_entry instead of list_entry_next
  ceph: ceph_frag_contains_value can be boolean
  ceph: remove unused functions in ceph_frag.h
parents 772950ed 7e01726a
......@@ -5185,8 +5185,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
out_err:
rbd_dev_unparent(rbd_dev);
if (parent)
rbd_dev_destroy(parent);
rbd_dev_destroy(parent);
return ret;
}
......
......@@ -1108,7 +1108,7 @@ retry_locked:
return 0;
/* past end of file? */
i_size = inode->i_size; /* caller holds i_mutex */
i_size = i_size_read(inode);
if (page_off >= i_size ||
(pos_in_page == 0 && (pos+len) >= i_size &&
......@@ -1149,7 +1149,6 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
page = grab_cache_page_write_begin(mapping, index, 0);
if (!page)
return -ENOMEM;
*pagep = page;
dout("write_begin file %p inode %p page %p %d~%d\n", file,
inode, page, (int)pos, (int)len);
......@@ -1184,8 +1183,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
zero_user_segment(page, from+copied, len);
/* did file size increase? */
/* (no need for i_size_read(); we caller holds i_mutex */
if (pos+copied > inode->i_size)
if (pos+copied > i_size_read(inode))
check_cap = ceph_inode_set_size(inode, pos+copied);
if (!PageUptodate(page))
......@@ -1378,11 +1376,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = VM_FAULT_NOPAGE;
if ((off > size) ||
(page->mapping != inode->i_mapping))
(page->mapping != inode->i_mapping)) {
unlock_page(page);
goto out;
}
ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
if (ret == 0) {
if (ret >= 0) {
/* success. we'll keep the page locked. */
set_page_dirty(page);
ret = VM_FAULT_LOCKED;
......@@ -1393,8 +1393,6 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = VM_FAULT_SIGBUS;
}
out:
if (ret != VM_FAULT_LOCKED)
unlock_page(page);
if (ret == VM_FAULT_LOCKED ||
ci->i_inline_version != CEPH_INLINE_NONE) {
int dirty;
......
......@@ -106,7 +106,7 @@ static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data,
memset(&aux, 0, sizeof(aux));
aux.mtime = inode->i_mtime;
aux.size = inode->i_size;
aux.size = i_size_read(inode);
memcpy(buffer, &aux, sizeof(aux));
......@@ -117,9 +117,7 @@ static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data,
uint64_t *size)
{
const struct ceph_inode_info* ci = cookie_netfs_data;
const struct inode* inode = &ci->vfs_inode;
*size = inode->i_size;
*size = i_size_read(&ci->vfs_inode);
}
static enum fscache_checkaux ceph_fscache_inode_check_aux(
......@@ -134,7 +132,7 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux(
memset(&aux, 0, sizeof(aux));
aux.mtime = inode->i_mtime;
aux.size = inode->i_size;
aux.size = i_size_read(inode);
if (memcmp(data, &aux, sizeof(aux)) != 0)
return FSCACHE_CHECKAUX_OBSOLETE;
......
......@@ -397,8 +397,9 @@ int ceph_release(struct inode *inode, struct file *file)
}
enum {
CHECK_EOF = 1,
READ_INLINE = 2,
HAVE_RETRIED = 1,
CHECK_EOF = 2,
READ_INLINE = 3,
};
/*
......@@ -411,17 +412,15 @@ enum {
static int striped_read(struct inode *inode,
u64 off, u64 len,
struct page **pages, int num_pages,
int *checkeof, bool o_direct,
unsigned long buf_align)
int *checkeof)
{
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u64 pos, this_len, left;
int io_align, page_align;
int pages_left;
int read;
loff_t i_size;
int page_align, pages_left;
int read, ret;
struct page **page_pos;
int ret;
bool hit_stripe, was_short;
/*
......@@ -432,13 +431,9 @@ static int striped_read(struct inode *inode,
page_pos = pages;
pages_left = num_pages;
read = 0;
io_align = off & ~PAGE_MASK;
more:
if (o_direct)
page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
else
page_align = pos & ~PAGE_MASK;
page_align = pos & ~PAGE_MASK;
this_len = left;
ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
&ci->i_layout, pos, &this_len,
......@@ -452,13 +447,12 @@ more:
dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read,
ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
i_size = i_size_read(inode);
if (ret >= 0) {
int didpages;
if (was_short && (pos + ret < inode->i_size)) {
int zlen = min(this_len - ret,
inode->i_size - pos - ret);
int zoff = (o_direct ? buf_align : io_align) +
read + ret;
if (was_short && (pos + ret < i_size)) {
int zlen = min(this_len - ret, i_size - pos - ret);
int zoff = (off & ~PAGE_MASK) + read + ret;
dout(" zero gap %llu to %llu\n",
pos + ret, pos + ret + zlen);
ceph_zero_page_vector_range(zoff, zlen, pages);
......@@ -473,14 +467,14 @@ more:
pages_left -= didpages;
/* hit stripe and need continue*/
if (left && hit_stripe && pos < inode->i_size)
if (left && hit_stripe && pos < i_size)
goto more;
}
if (read > 0) {
ret = read;
/* did we bounce off eof? */
if (pos + left > inode->i_size)
if (pos + left > i_size)
*checkeof = CHECK_EOF;
}
......@@ -521,54 +515,28 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
if (ret < 0)
return ret;
if (iocb->ki_flags & IOCB_DIRECT) {
while (iov_iter_count(i)) {
size_t start;
ssize_t n;
n = dio_get_pagev_size(i);
pages = dio_get_pages_alloc(i, n, &start, &num_pages);
if (IS_ERR(pages))
return PTR_ERR(pages);
ret = striped_read(inode, off, n,
pages, num_pages, checkeof,
1, start);
ceph_put_page_vector(pages, num_pages, true);
if (ret <= 0)
break;
off += ret;
iov_iter_advance(i, ret);
if (ret < n)
num_pages = calc_pages_for(off, len);
pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages))
return PTR_ERR(pages);
ret = striped_read(inode, off, len, pages,
num_pages, checkeof);
if (ret > 0) {
int l, k = 0;
size_t left = ret;
while (left) {
size_t page_off = off & ~PAGE_MASK;
size_t copy = min_t(size_t, left,
PAGE_SIZE - page_off);
l = copy_page_to_iter(pages[k++], page_off, copy, i);
off += l;
left -= l;
if (l < copy)
break;
}
} else {
num_pages = calc_pages_for(off, len);
pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages))
return PTR_ERR(pages);
ret = striped_read(inode, off, len, pages,
num_pages, checkeof, 0, 0);
if (ret > 0) {
int l, k = 0;
size_t left = ret;
while (left) {
size_t page_off = off & ~PAGE_MASK;
size_t copy = min_t(size_t,
PAGE_SIZE - page_off, left);
l = copy_page_to_iter(pages[k++], page_off,
copy, i);
off += l;
left -= l;
if (l < copy)
break;
}
}
ceph_release_page_vector(pages, num_pages);
}
ceph_release_page_vector(pages, num_pages);
if (off > iocb->ki_pos) {
ret = off - iocb->ki_pos;
......@@ -579,6 +547,193 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
return ret;
}
/*
 * State shared by every OSD request issued on behalf of one
 * asynchronous direct read or write.  It is attached to each request
 * through r_priv and freed by ceph_aio_complete() once the last
 * pending request has finished.
 */
struct ceph_aio_request {
/* the kiocb whose ki_complete() is invoked when the AIO finishes */
struct kiocb *iocb;
/* bytes covered by the submitted requests; reported as the result when no error was recorded */
size_t total_len;
/* non-zero for a write, zero for a read */
int write;
/* first negative result seen by any request (set with cmpxchg), 0 if none */
int error;
/* OSD requests of this AIO, linked through their r_unsafe_item */
struct list_head osd_reqs;
/* number of OSD requests issued for this AIO */
unsigned num_reqs;
/* requests still outstanding; the AIO completes when this drops to zero */
atomic_t pending_reqs;
/* timestamp used when building write requests, including retries */
struct timespec mtime;
/* preallocated cap flush handed to __ceph_mark_dirty_caps() on write completion */
struct ceph_cap_flush *prealloc_cf;
};
/*
 * Deferred-work wrapper used to retry an AIO write request from
 * workqueue context (see ceph_aio_retry_work()).
 */
struct ceph_aio_work {
/* work item queued by ceph_aio_complete_req() */
struct work_struct work;
/* the OSD request that has to be re-sent */
struct ceph_osd_request *req;
};
static void ceph_aio_retry_work(struct work_struct *work);
static void ceph_aio_complete(struct inode *inode,
struct ceph_aio_request *aio_req)
{
struct ceph_inode_info *ci = ceph_inode(inode);
int ret;
if (!atomic_dec_and_test(&aio_req->pending_reqs))
return;
ret = aio_req->error;
if (!ret)
ret = aio_req->total_len;
dout("ceph_aio_complete %p rc %d\n", inode, ret);
if (ret >= 0 && aio_req->write) {
int dirty;
loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len;
if (endoff > i_size_read(inode)) {
if (ceph_inode_set_size(inode, endoff))
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
}
spin_lock(&ci->i_ceph_lock);
ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&aio_req->prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
}
ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR :
CEPH_CAP_FILE_RD));
aio_req->iocb->ki_complete(aio_req->iocb, ret, 0);
ceph_free_cap_flush(aio_req->prealloc_cf);
kfree(aio_req);
}
static void ceph_aio_complete_req(struct ceph_osd_request *req,
struct ceph_msg *msg)
{
int rc = req->r_result;
struct inode *inode = req->r_inode;
struct ceph_aio_request *aio_req = req->r_priv;
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
int num_pages = calc_pages_for((u64)osd_data->alignment,
osd_data->length);
dout("ceph_aio_complete_req %p rc %d bytes %llu\n",
inode, rc, osd_data->length);
if (rc == -EOLDSNAPC) {
struct ceph_aio_work *aio_work;
BUG_ON(!aio_req->write);
aio_work = kmalloc(sizeof(*aio_work), GFP_NOFS);
if (aio_work) {
INIT_WORK(&aio_work->work, ceph_aio_retry_work);
aio_work->req = req;
queue_work(ceph_inode_to_client(inode)->wb_wq,
&aio_work->work);
return;
}
rc = -ENOMEM;
} else if (!aio_req->write) {
if (rc == -ENOENT)
rc = 0;
if (rc >= 0 && osd_data->length > rc) {
int zoff = osd_data->alignment + rc;
int zlen = osd_data->length - rc;
/*
* If read is satisfied by single OSD request,
* it can pass EOF. Otherwise read is within
* i_size.
*/
if (aio_req->num_reqs == 1) {
loff_t i_size = i_size_read(inode);
loff_t endoff = aio_req->iocb->ki_pos + rc;
if (endoff < i_size)
zlen = min_t(size_t, zlen,
i_size - endoff);
aio_req->total_len = rc + zlen;
}
if (zlen > 0)
ceph_zero_page_vector_range(zoff, zlen,
osd_data->pages);
}
}
ceph_put_page_vector(osd_data->pages, num_pages, false);
ceph_osdc_put_request(req);
if (rc < 0)
cmpxchg(&aio_req->error, 0, rc);
ceph_aio_complete(inode, aio_req);
return;
}
static void ceph_aio_retry_work(struct work_struct *work)
{
struct ceph_aio_work *aio_work =
container_of(work, struct ceph_aio_work, work);
struct ceph_osd_request *orig_req = aio_work->req;
struct ceph_aio_request *aio_req = orig_req->r_priv;
struct inode *inode = orig_req->r_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_snap_context *snapc;
struct ceph_osd_request *req;
int ret;
spin_lock(&ci->i_ceph_lock);
if (__ceph_have_pending_cap_snap(ci)) {
struct ceph_cap_snap *capsnap =
list_last_entry(&ci->i_cap_snaps,
struct ceph_cap_snap,
ci_item);
snapc = ceph_get_snap_context(capsnap->context);
} else {
BUG_ON(!ci->i_head_snapc);
snapc = ceph_get_snap_context(ci->i_head_snapc);
}
spin_unlock(&ci->i_ceph_lock);
req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2,
false, GFP_NOFS);
if (IS_ERR(req)) {
ret = PTR_ERR(req);
req = orig_req;
goto out;
}
req->r_flags = CEPH_OSD_FLAG_ORDERSNAP |
CEPH_OSD_FLAG_ONDISK |
CEPH_OSD_FLAG_WRITE;
req->r_base_oloc = orig_req->r_base_oloc;
req->r_base_oid = orig_req->r_base_oid;
req->r_ops[0] = orig_req->r_ops[0];
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
ceph_osdc_build_request(req, req->r_ops[0].extent.offset,
snapc, CEPH_NOSNAP, &aio_req->mtime);
ceph_put_snap_context(snapc);
ceph_osdc_put_request(orig_req);
req->r_callback = ceph_aio_complete_req;
req->r_inode = inode;
req->r_priv = aio_req;
ret = ceph_osdc_start_request(req->r_osdc, req, false);
out:
if (ret < 0) {
BUG_ON(ret == -EOLDSNAPC);
req->r_result = ret;
ceph_aio_complete_req(req, NULL);
}
kfree(aio_work);
}
/*
* Write commit request unsafe callback, called to tell us when a
* request is unsafe (that is, in flight--has been handed to the
......@@ -612,16 +767,10 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
}
/*
* Synchronous write, straight from __user pointer or user pages.
*
* If write spans object boundary, just do multiple writes. (For a
* correct atomic write, we should e.g. take write locks on all
* objects, rollback on failure, etc.)
*/
static ssize_t
ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
struct ceph_snap_context *snapc)
ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
struct ceph_snap_context *snapc,
struct ceph_cap_flush **pcf)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
......@@ -630,44 +779,52 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
struct ceph_vino vino;
struct ceph_osd_request *req;
struct page **pages;
int num_pages;
int written = 0;
struct ceph_aio_request *aio_req = NULL;
int num_pages = 0;
int flags;
int check_caps = 0;
int ret;
struct timespec mtime = CURRENT_TIME;
size_t count = iov_iter_count(from);
size_t count = iov_iter_count(iter);
loff_t pos = iocb->ki_pos;
bool write = iov_iter_rw(iter) == WRITE;
if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
dout("sync_direct_write on file %p %lld~%u\n", file, pos,
(unsigned)count);
dout("sync_direct_read_write (%s) on file %p %lld~%u\n",
(write ? "write" : "read"), file, pos, (unsigned)count);
ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
if (ret < 0)
return ret;
ret = invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_CACHE_SHIFT,
(pos + count) >> PAGE_CACHE_SHIFT);
if (ret < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret);
if (write) {
ret = invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_CACHE_SHIFT,
(pos + count) >> PAGE_CACHE_SHIFT);
if (ret < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret);
flags = CEPH_OSD_FLAG_ORDERSNAP |
CEPH_OSD_FLAG_ONDISK |
CEPH_OSD_FLAG_WRITE;
flags = CEPH_OSD_FLAG_ORDERSNAP |
CEPH_OSD_FLAG_ONDISK |
CEPH_OSD_FLAG_WRITE;
} else {
flags = CEPH_OSD_FLAG_READ;
}
while (iov_iter_count(from) > 0) {
u64 len = dio_get_pagev_size(from);
size_t start;
ssize_t n;
while (iov_iter_count(iter) > 0) {
u64 size = dio_get_pagev_size(iter);
size_t start = 0;
ssize_t len;
vino = ceph_vino(inode);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
vino, pos, &len, 0,
2,/*include a 'startsync' command*/
CEPH_OSD_OP_WRITE, flags, snapc,
vino, pos, &size, 0,
/*include a 'startsync' command*/
write ? 2 : 1,
write ? CEPH_OSD_OP_WRITE :
CEPH_OSD_OP_READ,
flags, snapc,
ci->i_truncate_seq,
ci->i_truncate_size,
false);
......@@ -676,10 +833,8 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
break;
}
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
n = len;
pages = dio_get_pages_alloc(from, len, &start, &num_pages);
len = size;
pages = dio_get_pages_alloc(iter, len, &start, &num_pages);
if (IS_ERR(pages)) {
ceph_osdc_put_request(req);
ret = PTR_ERR(pages);
......@@ -687,47 +842,128 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
}
/*
* throw out any page cache pages in this range. this
* may block.
* To simplify error handling, allow AIO when IO within i_size
* or IO can be satisfied by single OSD request.
*/
truncate_inode_pages_range(inode->i_mapping, pos,
(pos+n) | (PAGE_CACHE_SIZE-1));
osd_req_op_extent_osd_data_pages(req, 0, pages, n, start,
false, false);
if (pos == iocb->ki_pos && !is_sync_kiocb(iocb) &&
(len == count || pos + count <= i_size_read(inode))) {
aio_req = kzalloc(sizeof(*aio_req), GFP_KERNEL);
if (aio_req) {
aio_req->iocb = iocb;
aio_req->write = write;
INIT_LIST_HEAD(&aio_req->osd_reqs);
if (write) {
aio_req->mtime = mtime;
swap(aio_req->prealloc_cf, *pcf);
}
}
/* ignore error */
}
if (write) {
/*
* throw out any page cache pages in this range. this
* may block.
*/
truncate_inode_pages_range(inode->i_mapping, pos,
(pos+len) | (PAGE_CACHE_SIZE - 1));
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
}
osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
false, false);
/* BUG_ON(vino.snap != CEPH_NOSNAP); */
ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
if (aio_req) {
aio_req->total_len += len;
aio_req->num_reqs++;
atomic_inc(&aio_req->pending_reqs);
req->r_callback = ceph_aio_complete_req;
req->r_inode = inode;
req->r_priv = aio_req;
list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs);
pos += len;
iov_iter_advance(iter, len);
continue;
}
ret = ceph_osdc_start_request(req->r_osdc, req, false);
if (!ret)
ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
size = i_size_read(inode);
if (!write) {
if (ret == -ENOENT)
ret = 0;
if (ret >= 0 && ret < len && pos + ret < size) {
int zlen = min_t(size_t, len - ret,
size - pos - ret);
ceph_zero_page_vector_range(start + ret, zlen,
pages);
ret += zlen;
}