All new accounts created on Gitlab now require administrator approval. If you invite any collaborators, please let Flux staff know so they can approve the accounts.

Commit 478a1469 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'dax-locking-for-4.7' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Pull DAX locking updates from Ross Zwisler:
 "Filesystem DAX locking for 4.7

   - We use a bit in an exceptional radix tree entry as a lock bit and
     use it similarly to how page lock is used for normal faults.  This
     fixes races between hole instantiation and read faults of the same
     index.

   - Filesystem DAX PMD faults are disabled, and will be re-enabled when
     PMD locking is implemented"

* tag 'dax-locking-for-4.7' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm:
  dax: Remove i_mmap_lock protection
  dax: Use radix tree entry lock to protect cow faults
  dax: New fault locking
  dax: Allow DAX code to replace exceptional entries
  dax: Define DAX lock bit for radix tree exceptional entry
  dax: Make huge page handling depend of CONFIG_BROKEN
  dax: Fix condition for filling of PMD holes
parents 315227f6 4d9a2c87
......@@ -52,6 +52,7 @@ config FS_DAX_PMD
depends on FS_DAX
depends on ZONE_DEVICE
depends on TRANSPARENT_HUGEPAGE
depends on BROKEN
endif # BLOCK
......
This diff is collapsed.
......@@ -3,17 +3,25 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/radix-tree.h>
#include <asm/pgtable.h>
/* We use lowest available exceptional entry bit for locking */
#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
get_block_t, dio_iodone_t, int flags);
int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
int dax_truncate_page(struct inode *, loff_t from, get_block_t);
int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
void dax_wake_mapping_entry_waiter(struct address_space *mapping,
pgoff_t index, bool wake_all);
#ifdef CONFIG_FS_DAX
struct page *read_dax_sector(struct block_device *bdev, sector_t n);
void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index);
int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
unsigned int offset, unsigned int length);
#else
......@@ -22,6 +30,12 @@ static inline struct page *read_dax_sector(struct block_device *bdev,
{
return ERR_PTR(-ENXIO);
}
/* Shouldn't ever be called when dax is disabled. */
static inline void dax_unlock_mapping_entry(struct address_space *mapping,
pgoff_t index)
{
BUG();
}
static inline int __dax_zero_page_range(struct block_device *bdev,
sector_t sector, unsigned int offset, unsigned int length)
{
......@@ -29,7 +43,7 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
}
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
unsigned int flags, get_block_t);
int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
......
......@@ -303,6 +303,12 @@ struct vm_fault {
* is set (which is also implied by
* VM_FAULT_ERROR).
*/
void *entry; /* ->fault handler can alternatively
* return locked DAX entry. In that
* case handler should return
* VM_FAULT_DAX_LOCKED and fill in
* entry here.
*/
/* for ->map_pages() only */
pgoff_t max_pgoff; /* map pages for offset from pgoff till
* max_pgoff inclusive */
......@@ -1076,6 +1082,7 @@ static inline void clear_page_pfmemalloc(struct page *page)
#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */
#define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */
#define VM_FAULT_DAX_LOCKED 0x1000 /* ->fault has locked DAX entry */
#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
......
......@@ -143,13 +143,15 @@ static void page_cache_tree_delete(struct address_space *mapping,
return;
/*
* Track node that only contains shadow entries.
* Track node that only contains shadow entries. DAX mappings contain
* no shadow entries and may contain other exceptional entries so skip
* those.
*
* Avoid acquiring the list_lru lock if already tracked. The
* list_empty() test is safe as node->private_list is
* protected by mapping->tree_lock.
*/
if (!workingset_node_pages(node) &&
if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
list_empty(&node->private_list)) {
node->private_data = mapping;
list_lru_add(&workingset_shadow_nodes, &node->private_list);
......@@ -580,14 +582,24 @@ static int page_cache_tree_insert(struct address_space *mapping,
if (!radix_tree_exceptional_entry(p))
return -EEXIST;
if (WARN_ON(dax_mapping(mapping)))
return -EINVAL;
if (shadowp)
*shadowp = p;
mapping->nrexceptional--;
if (node)
workingset_node_shadows_dec(node);
if (!dax_mapping(mapping)) {
if (shadowp)
*shadowp = p;
if (node)
workingset_node_shadows_dec(node);
} else {
/* DAX can replace empty locked entry with a hole */
WARN_ON_ONCE(p !=
(void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
RADIX_DAX_ENTRY_LOCK));
/* DAX accounts exceptional entries as normal pages */
if (node)
workingset_node_pages_dec(node);
/* Wakeup waiters for exceptional entry lock */
dax_wake_mapping_entry_waiter(mapping, page->index,
false);
}
}
radix_tree_replace_slot(slot, page);
mapping->nrpages++;
......
......@@ -63,6 +63,7 @@
#include <linux/dma-debug.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
......@@ -2492,8 +2493,6 @@ void unmap_mapping_range(struct address_space *mapping,
if (details.last_index < details.first_index)
details.last_index = ULONG_MAX;
/* DAX uses i_mmap_lock to serialise file truncate vs page fault */
i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
......@@ -2825,7 +2824,8 @@ oom:
*/
static int __do_fault(struct vm_area_struct *vma, unsigned long address,
pgoff_t pgoff, unsigned int flags,
struct page *cow_page, struct page **page)
struct page *cow_page, struct page **page,
void **entry)
{
struct vm_fault vmf;
int ret;
......@@ -2840,8 +2840,10 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
ret = vma->vm_ops->fault(vma, &vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
if (!vmf.page)
goto out;
if (ret & VM_FAULT_DAX_LOCKED) {
*entry = vmf.entry;
return ret;
}
if (unlikely(PageHWPoison(vmf.page))) {
if (ret & VM_FAULT_LOCKED)
......@@ -2855,7 +2857,6 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
else
VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
out:
*page = vmf.page;
return ret;
}
......@@ -3048,7 +3049,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap_unlock(pte, ptl);
}
ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
......@@ -3071,6 +3072,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
struct page *fault_page, *new_page;
void *fault_entry;
struct mem_cgroup *memcg;
spinlock_t *ptl;
pte_t *pte;
......@@ -3088,26 +3090,24 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
}
ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page,
&fault_entry);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
if (fault_page)
if (!(ret & VM_FAULT_DAX_LOCKED))
copy_user_highpage(new_page, fault_page, address, vma);
__SetPageUptodate(new_page);
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
if (unlikely(!pte_same(*pte, orig_pte))) {
pte_unmap_unlock(pte, ptl);
if (fault_page) {
if (!(ret & VM_FAULT_DAX_LOCKED)) {
unlock_page(fault_page);
put_page(fault_page);
} else {
/*
* The fault handler has no page to lock, so it holds
* i_mmap_lock for read to protect against truncate.
*/
i_mmap_unlock_read(vma->vm_file->f_mapping);
dax_unlock_mapping_entry(vma->vm_file->f_mapping,
pgoff);
}
goto uncharge_out;
}
......@@ -3115,15 +3115,11 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
pte_unmap_unlock(pte, ptl);
if (fault_page) {
if (!(ret & VM_FAULT_DAX_LOCKED)) {
unlock_page(fault_page);
put_page(fault_page);
} else {
/*
* The fault handler has no page to lock, so it holds
* i_mmap_lock for read to protect against truncate.
*/
i_mmap_unlock_read(vma->vm_file->f_mapping);
dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
}
return ret;
uncharge_out:
......@@ -3143,7 +3139,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int dirtied = 0;
int ret, tmp;
ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
......
......@@ -34,40 +34,38 @@ static void clear_exceptional_entry(struct address_space *mapping,
if (shmem_mapping(mapping))
return;
spin_lock_irq(&mapping->tree_lock);
if (dax_mapping(mapping)) {
if (radix_tree_delete_item(&mapping->page_tree, index, entry))
mapping->nrexceptional--;
} else {
/*
* Regular page slots are stabilized by the page lock even
* without the tree itself locked. These unlocked entries
* need verification under the tree lock.
*/
if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
&slot))
goto unlock;
if (*slot != entry)
goto unlock;
radix_tree_replace_slot(slot, NULL);
mapping->nrexceptional--;
if (!node)
goto unlock;
workingset_node_shadows_dec(node);
/*
* Don't track node without shadow entries.
*
* Avoid acquiring the list_lru lock if already untracked.
* The list_empty() test is safe as node->private_list is
* protected by mapping->tree_lock.
*/
if (!workingset_node_shadows(node) &&
!list_empty(&node->private_list))
list_lru_del(&workingset_shadow_nodes,
&node->private_list);
__radix_tree_delete_node(&mapping->page_tree, node);
dax_delete_mapping_entry(mapping, index);
return;
}
spin_lock_irq(&mapping->tree_lock);
/*
* Regular page slots are stabilized by the page lock even
* without the tree itself locked. These unlocked entries
* need verification under the tree lock.
*/
if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
&slot))
goto unlock;
if (*slot != entry)
goto unlock;
radix_tree_replace_slot(slot, NULL);
mapping->nrexceptional--;
if (!node)
goto unlock;
workingset_node_shadows_dec(node);
/*
* Don't track node without shadow entries.
*
* Avoid acquiring the list_lru lock if already untracked.
* The list_empty() test is safe as node->private_list is
* protected by mapping->tree_lock.
*/
if (!workingset_node_shadows(node) &&
!list_empty(&node->private_list))
list_lru_del(&workingset_shadow_nodes,
&node->private_list);
__radix_tree_delete_node(&mapping->page_tree, node);
unlock:
spin_unlock_irq(&mapping->tree_lock);
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment