Commit d080827f authored by Linus Torvalds

Merge tag 'libnvdimm-for-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Pull libnvdimm updates from Dan Williams:
 "The bulk of this has appeared in -next and independently received a
  build success notification from the kbuild robot.  The 'for-4.5/block-
  dax' topic branch was rebased over the weekend to drop the "block
  device end-of-life" rework that Al would like to see re-implemented
  with a notifier, and to address bug reports against the badblocks
  integration.

  There is pending feedback against "libnvdimm: Add a poison list and
  export badblocks" received last week.  Linda identified some localized
  fixups that we will handle incrementally.

  Summary:

   - Media error handling: The 'badblocks' implementation that
     originated in md-raid is up-levelled to a generic capability of a
     block device.  This initial implementation is limited to being
     consulted in the pmem block-i/o path.  Later, 'badblocks' will be
     consulted when creating dax mappings.

   - Raw block device dax: For virtualization and other cases that want
     large contiguous mappings of persistent memory, add the capability
     to dax-mmap a block device directly.

   - Increased /dev/mem restrictions: Add an option to treat all
     io-memory as IORESOURCE_EXCLUSIVE, i.e. disable /dev/mem access
     while a driver is actively using an address range.  This behavior
     is controlled via the new CONFIG_IO_STRICT_DEVMEM option and can be
     overridden by the existing "iomem=relaxed" kernel command line
     option.

   - Miscellaneous fixes include a 'pfn'-device huge page alignment fix,
     block device shutdown crash fix, and other small libnvdimm fixes"

* tag 'libnvdimm-for-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (32 commits)
  block: kill disk_{check|set|clear|alloc}_badblocks
  libnvdimm, pmem: nvdimm_read_bytes() badblocks support
  pmem, dax: disable dax in the presence of bad blocks
  pmem: fail io-requests to known bad blocks
  libnvdimm: convert to statically allocated badblocks
  libnvdimm: don't fail init for full badblocks list
  block, badblocks: introduce devm_init_badblocks
  block: clarify badblocks lifetime
  badblocks: rename badblocks_free to badblocks_exit
  libnvdimm, pmem: move definition of nvdimm_namespace_add_poison to nd.h
  libnvdimm: Add a poison list and export badblocks
  nfit_test: Enable DSMs for all test NFITs
  md: convert to use the generic badblocks code
  block: Add badblock management for gendisks
  badblocks: Add core badblock management code
  block: fix del_gendisk() vs blkdev_ioctl crash
  block: enable dax for raw block devices
  block: introduce bdev_file_inode()
  restrict /dev/mem to idle io memory ranges
  arch: consolidate CONFIG_STRICT_DEVM in lib/Kconfig.debug
  ...
parents cbd88cd4 8b63b6bf
......@@ -2,6 +2,7 @@ config ARM
bool
default y
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAVE_CUSTOM_GPIO_H
......
......@@ -15,20 +15,6 @@ config ARM_PTDUMP
kernel.
If in doubt, say "N"
config STRICT_DEVMEM
bool "Filter access to /dev/mem"
depends on MMU
---help---
If this option is disabled, you allow userspace (root) access to all
of memory, including kernel and userspace memory. Accidental
access to this is obviously disastrous, but specific access can
be used by people debugging the kernel.
If this option is switched on, the /dev/mem file only allows
userspace access to memory mapped peripherals.
If in doubt, say Y.
# RMK wants arm kernels compiled with frame pointers or stack unwinding.
# If you know what you are doing and are willing to live without stack
# traces, you can get a slightly smaller kernel by setting this option to
......
......@@ -3,6 +3,7 @@ config ARM64
select ACPI_CCA_REQUIRED if ACPI
select ACPI_GENERIC_GSI if ACPI
select ACPI_REDUCED_HARDWARE_ONLY if ACPI
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_GCOV_PROFILE_ALL
......
......@@ -14,20 +14,6 @@ config ARM64_PTDUMP
kernel.
If in doubt, say "N"
config STRICT_DEVMEM
bool "Filter access to /dev/mem"
depends on MMU
help
If this option is disabled, you allow userspace (root) access to all
of memory, including kernel and userspace memory. Accidental
access to this is obviously disastrous, but specific access can
be used by people debugging the kernel.
If this option is switched on, the /dev/mem file only allows
userspace access to memory mapped peripherals.
If in doubt, say Y.
config PID_IN_CONTEXTIDR
bool "Write the current PID to the CONTEXTIDR register"
help
......
......@@ -10,6 +10,7 @@ config FRV
select HAVE_DEBUG_BUGVERBOSE
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select GENERIC_CPU_DEVICES
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_WANT_IPC_PARSE_VERSION
select OLD_SIGSUSPEND3
select OLD_SIGACTION
......
......@@ -13,6 +13,7 @@ config M32R
select GENERIC_IRQ_PROBE
select GENERIC_IRQ_SHOW
select GENERIC_ATOMIC64
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_USES_GETTIMEOFFSET
select MODULES_USE_ELF_RELA
select HAVE_DEBUG_STACKOVERFLOW
......
......@@ -159,6 +159,7 @@ config PPC
select EDAC_SUPPORT
select EDAC_ATOMIC_SCRUB
select ARCH_HAS_DMA_SET_COHERENT_MASK
select ARCH_HAS_DEVMEM_IS_ALLOWED
select HAVE_ARCH_SECCOMP_FILTER
config GENERIC_CSUM
......
......@@ -335,18 +335,6 @@ config PPC_EARLY_DEBUG_CPM_ADDR
platform probing is done, all platforms selected must
share the same address.
config STRICT_DEVMEM
def_bool y
prompt "Filter access to /dev/mem"
help
This option restricts access to /dev/mem. If this option is
disabled, you allow userspace access to all memory, including
kernel and userspace memory. Accidental memory access is likely
to be disastrous.
Memory access is required for experts who want to debug the kernel.
If you are unsure, say Y.
config FAIL_IOMMU
bool "Fault-injection capability for IOMMU"
depends on FAULT_INJECTION
......
......@@ -66,6 +66,7 @@ config S390
def_bool y
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_SG_CHAIN
......
......@@ -5,18 +5,6 @@ config TRACE_IRQFLAGS_SUPPORT
source "lib/Kconfig.debug"
config STRICT_DEVMEM
def_bool y
prompt "Filter access to /dev/mem"
---help---
This option restricts access to /dev/mem. If this option is
disabled, you allow userspace access to all memory, including
kernel and userspace memory. Accidental memory access is likely
to be disastrous.
Memory access is required for experts who want to debug the kernel.
If you are unsure, say Y.
config S390_PTDUMP
bool "Export kernel pagetable layout to userspace via debugfs"
depends on DEBUG_KERNEL
......
......@@ -19,6 +19,7 @@ config TILE
select VIRT_TO_BUS
select SYS_HYPERVISOR
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select GENERIC_CLOCKEVENTS
select MODULES_USE_ELF_RELA
......@@ -116,9 +117,6 @@ config ARCH_DISCONTIGMEM_DEFAULT
config TRACE_IRQFLAGS_SUPPORT
def_bool y
config STRICT_DEVMEM
def_bool y
# SMP is required for Tilera Linux.
config SMP
def_bool y
......
config UNICORE32
def_bool y
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select HAVE_MEMBLOCK
......
......@@ -2,20 +2,6 @@ menu "Kernel hacking"
source "lib/Kconfig.debug"
config STRICT_DEVMEM
bool "Filter access to /dev/mem"
depends on MMU
---help---
If this option is disabled, you allow userspace (root) access to all
of memory, including kernel and userspace memory. Accidental
access to this is obviously disastrous, but specific access can
be used by people debugging the kernel.
If this option is switched on, the /dev/mem file only allows
userspace access to memory mapped peripherals.
If in doubt, say Y.
config EARLY_PRINTK
def_bool DEBUG_OCD
help
......
......@@ -24,6 +24,7 @@ config X86
select ARCH_DISCARD_MEMBLOCK
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_GCOV_PROFILE_ALL
......
......@@ -5,23 +5,6 @@ config TRACE_IRQFLAGS_SUPPORT
source "lib/Kconfig.debug"
config STRICT_DEVMEM
bool "Filter access to /dev/mem"
---help---
If this option is disabled, you allow userspace (root) access to all
of memory, including kernel and userspace memory. Accidental
access to this is obviously disastrous, but specific access can
be used by people debugging the kernel. Note that with PAT support
enabled, even in this case there are restrictions on /dev/mem
use due to the cache aliasing requirements.
If this option is switched on, the /dev/mem file only allows
userspace access to PCI space and the BIOS code and data regions.
This is sufficient for dosemu and X and all common users of
/dev/mem.
If in doubt, say Y.
config X86_VERBOSE_BOOTUP
bool "Enable verbose x86 bootup info messages"
default y
......
......@@ -8,7 +8,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
partitions/
badblocks.o partitions/
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
......
/*
* Bad block management
*
* - Heavily based on MD badblocks code from Neil Brown
*
* Copyright (c) 2015, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#include <linux/badblocks.h>
#include <linux/seqlock.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/slab.h>
/**
 * badblocks_check() - look for known bad sectors inside a range
 * @bb:		the badblocks structure that holds all badblock information
 * @s:		first sector of the range to examine
 * @sectors:	number of sectors in the range
 * @first_bad:	set to the start of the first bad range overlapping the query
 * @bad_sectors: set to the length of that first overlapping bad range
 *
 * Recording which blocks of a device are 'bad' lets a caller fail just
 * those blocks (or that stripe) instead of the whole device.
 *
 * Every entry of the bad-block table is 64 bits wide and holds:
 *  - the length of the bad range in sectors: 0-511 encodes lengths 1-512
 *  - the start of the bad range as a sector offset, 54 bits (allows
 *    8 exbibytes); a 'shift' can be set so that larger blocks are tracked
 *    and consequently larger devices can be covered
 *  - an 'Acknowledged' flag, 1 bit, the most significant bit
 *
 * The table is protected by a seqlock, so a lookup that races with an
 * update is simply repeated.  Lookups may be wanted from a bi_end_io
 * function, which is why the irq variants of the write lock are used.
 *
 * To learn whether any block of the queried range is bad, binary-search
 * for the last table entry that starts before the sector following the
 * range, then walk backwards over every entry that ends after @s.
 * Because the walk runs from high to low index, the values left in
 * @first_bad / @bad_sectors belong to the lowest overlapping entry.
 * Neither output is written when no overlap is found.
 *
 * Return:
 *  0: there are no known bad blocks in the range
 *  1: there are known bad blocks which are all acknowledged
 * -1: there are bad blocks which have not yet been acknowledged in metadata.
 * plus the start/length of the first bad section we overlap.
 */
int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
		    sector_t *first_bad, int *bad_sectors)
{
	u64 *table = bb->page;
	sector_t end = s + sectors;	/* first sector after the query */
	unsigned seq;
	int status;

	if (bb->shift > 0) {
		/* Widen the query to whole tracked blocks: start down, end up. */
		s >>= bb->shift;
		end = (end + ((1<<bb->shift) - 1)) >> bb->shift;
		/* kept for parity with the other entry points; not read below */
		sectors = end - s;
	}

	do {
		int first;
		int limit;

		seq = read_seqbegin(&bb->lock);
		status = 0;
		first = 0;
		limit = bb->count;

		/*
		 * Binary search for the last entry starting before 'end'.
		 * Entries below 'first' and at-or-above 'limit' are already
		 * ruled out; the window shrinks until one candidate is left.
		 */
		while (limit - first > 1) {
			int mid = (first + limit) / 2;
			sector_t start = BB_OFFSET(table[mid]);

			if (start >= end)
				/* this entry and all later ones are out */
				limit = mid;
			else
				/* still a candidate; earlier ones are not */
				first = mid;
		}

		/* limit == first only when the table is empty */
		if (limit > first) {
			int idx;

			/*
			 * Visit, from 'first' downwards, every entry that
			 * ends after 's' so that an unacknowledged one is
			 * never missed.
			 */
			for (idx = first; idx >= 0; idx--) {
				u64 entry = table[idx];

				if (BB_OFFSET(entry) + BB_LEN(entry) <= s)
					break;
				if (BB_OFFSET(entry) >= end)
					continue;

				/*
				 * Starts before the end and finishes after
				 * the start: a genuine overlap.  One
				 * unacknowledged hit makes the result -1
				 * for good.
				 */
				status = (status != -1 && BB_ACK(entry)) ? 1 : -1;
				*first_bad = BB_OFFSET(entry);
				*bad_sectors = BB_LEN(entry);
			}
		}
	} while (read_seqretry(&bb->lock, seq));

	return status;
}
EXPORT_SYMBOL_GPL(badblocks_check);
/**
 * badblocks_set() - Add a range of bad blocks to the table.
 * @bb:		the badblocks structure that holds all badblock information
 * @s:		first sector to mark as bad
 * @sectors:	number of sectors to mark as bad
 * @acknowledged: whether to mark the bad sectors as acknowledged
 *
 * This might extend the table, or might contract it if two adjacent ranges
 * can be merged.  We binary-search to find the 'insertion' point, then
 * decide how best to handle it.
 *
 * The whole update runs under the table's seqlock with interrupts saved
 * and disabled (write_seqlock_irqsave).
 *
 * Return:
 *  0: success
 *  1: failed to set badblocks (out of space, or badblocks are disabled)
 */
int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
		  int acknowledged)
{
	u64 *p;
	int lo, hi;
	int rv = 0;
	unsigned long flags;

	if (bb->shift < 0)
		/*
		 * badblocks are disabled: nothing can be recorded.  This
		 * must be reported as a failure, because a return of 0
		 * tells the caller that the range is now in the table.
		 */
		return 1;

	if (bb->shift) {
		/* round the start down, and the end up */
		sector_t next = s + sectors;

		s >>= bb->shift;
		next += (1<<bb->shift) - 1;
		next >>= bb->shift;
		sectors = next - s;
	}

	write_seqlock_irqsave(&bb->lock, flags);

	p = bb->page;
	lo = 0;
	hi = bb->count;
	/* Find the last range that starts at-or-before 's' */
	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;
		sector_t a = BB_OFFSET(p[mid]);

		if (a <= s)
			lo = mid;
		else
			hi = mid;
	}
	/* every range starts after 's': there is no predecessor to merge */
	if (hi > lo && BB_OFFSET(p[lo]) > s)
		hi = lo;

	if (hi > lo) {
		/* we found a range that might merge with the start
		 * of our new range
		 */
		sector_t a = BB_OFFSET(p[lo]);
		sector_t e = a + BB_LEN(p[lo]);
		int ack = BB_ACK(p[lo]);

		if (e >= s) {
			/* Yes, we can merge with a previous range */
			if (s == a && s + sectors >= e)
				/* new range covers old */
				ack = acknowledged;
			else
				ack = ack && acknowledged;

			if (e < s + sectors)
				e = s + sectors;
			if (e - a <= BB_MAX_LEN) {
				p[lo] = BB_MAKE(a, e-a, ack);
				s = e;
			} else {
				/* does not all fit in one range,
				 * make p[lo] maximal
				 */
				if (BB_LEN(p[lo]) != BB_MAX_LEN)
					p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
				s = a + BB_MAX_LEN;
			}
			/* whatever is left beyond the merged entry */
			sectors = e - s;
		}
	}
	if (sectors && hi < bb->count) {
		/* 'hi' points to the first range that starts after 's'.
		 * Maybe we can merge with the start of that range
		 */
		sector_t a = BB_OFFSET(p[hi]);
		sector_t e = a + BB_LEN(p[hi]);
		int ack = BB_ACK(p[hi]);

		if (a <= s + sectors) {
			/* merging is possible */
			if (e <= s + sectors) {
				/* full overlap */
				e = s + sectors;
				ack = acknowledged;
			} else
				ack = ack && acknowledged;

			a = s;
			if (e - a <= BB_MAX_LEN) {
				p[hi] = BB_MAKE(a, e-a, ack);
				s = e;
			} else {
				p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
				s = a + BB_MAX_LEN;
			}
			sectors = e - s;
			lo = hi;
			hi++;
		}
	}
	if (sectors == 0 && hi < bb->count) {
		/* we might be able to combine lo and hi */
		/* Note: 's' is at the end of 'lo' */
		sector_t a = BB_OFFSET(p[hi]);
		int lolen = BB_LEN(p[lo]);
		int hilen = BB_LEN(p[hi]);
		int newlen = lolen + hilen - (s - a);

		if (s >= a && newlen < BB_MAX_LEN) {
			/* yes, we can combine them */
			int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);

			p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
			/* close the gap left by 'hi'; entries are 8 bytes */
			memmove(p + hi, p + hi + 1,
				(bb->count - hi - 1) * 8);
			bb->count--;
		}
	}
	while (sectors) {
		/* didn't merge (it all).
		 * Need to add a range just before 'hi'
		 */
		if (bb->count >= MAX_BADBLOCKS) {
			/* No room for more */
			rv = 1;
			break;
		} else {
			int this_sectors = sectors;

			/* open a slot at 'hi'; entries are 8 bytes */
			memmove(p + hi + 1, p + hi,
				(bb->count - hi) * 8);
			bb->count++;

			/* one entry holds at most BB_MAX_LEN sectors */
			if (this_sectors > BB_MAX_LEN)
				this_sectors = BB_MAX_LEN;
			p[hi] = BB_MAKE(s, this_sectors, acknowledged);
			sectors -= this_sectors;
			s += this_sectors;
		}
	}

	bb->changed = 1;
	if (!acknowledged)
		bb->unacked_exist = 1;
	write_sequnlock_irqrestore(&bb->lock, flags);

	return rv;
}
EXPORT_SYMBOL_GPL(badblocks_set);
/**
* badblocks_clear() - Remove a range of bad blocks from the table.
* @bb: the badblocks structure that holds all badblock information
* @s: first sector to clear (mark as no longer bad)
* @sectors: number of sectors to clear
*
* This may involve extending the table if we split a region,
* but it must not fail. So if the table becomes full, we just
* drop the remove request.
*
* Return:
* 0: success
* 1: failed to clear badblocks
*/
int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
{
u64 *p;
int lo, hi;
sector_t target = s + sectors;
int rv = 0;
if (bb->shift > 0) {
/* When clearing we round the start up and the end down.
* This should not matter as the shift should align with
* the block size and no rounding should ever be needed.
* However it is better to think a block is bad when it
* isn't than to think a block is not bad when it is.
*/
s += (1<<bb->shift) - 1;
s >>= bb->shift;
target >>= bb->shift;
sectors = target - s;
}
write_seqlock_irq(&bb->lock);
p = bb->page;
lo = 0;
hi = bb->count;
/* Find the last range that starts before 'target' */
while (hi - lo > 1) {
int mid = (lo + hi) / 2;
sector_t a = BB_OFFSET(p[mid]);
if (a < target)
lo = mid;
else
hi = mid;
}
if (hi > lo) {
/* p[lo] is the last range that could overlap the
* current range. Earlier ranges could also overlap,
* but only this one can overlap the end of the range.
*/
if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
/* Partial overlap, leave the tail of this range */
int ack = BB_ACK(p[lo]);
sector_t a = BB_OFFSET(p[lo]);
sector_t end = a + BB_LEN(p[lo]);
if (a < s) {
/* we need to split this range */
if (bb->count >= MAX_BADBLOCKS) {
rv = -ENOSPC;
goto out;
}
memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
bb->count++;
p[lo] = BB_MAKE(a, s-a, ack);
lo++;
}
p[lo] = BB_MAKE(target, end - target, ack);
/* there is no longer an overlap */
hi = lo;
lo--;
}
while (lo >= 0 &&
BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
/* This range does overlap */
if (BB_OFFSET(p[lo]) < s) {
/* Keep the early parts of this range. */
int ack = BB_ACK(p[lo]);
sector_t start = BB_OFFSET(p[lo]);
p[lo] = BB_MAKE(start, s - start, ack);
/* now low doesn't overlap, so.. */
break;
}
lo--;
}
/* 'lo' is strictly before, 'hi' is strictly after,
* anything between needs to be discarded
*/