Commit f0c98ebc authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'libnvdimm-for-4.8' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Pull libnvdimm updates from Dan Williams:

 - Replace pcommit with ADR / directed-flushing.

   The pcommit instruction, which has not shipped on any product, is
   deprecated.  Instead, the requirement is that platforms implement
   either ADR, or provide one or more flush addresses per nvdimm.

   ADR (Asynchronous DRAM Refresh) flushes data in posted write buffers
   to the memory controller on a power-fail event.

   Flush addresses are defined in ACPI 6.x as an NVDIMM Firmware
   Interface Table (NFIT) sub-structure: "Flush Hint Address Structure".
   A flush hint is an mmio address that when written and fenced assures
   that all previous posted writes targeting a given dimm have been
   flushed to media.

 - On-demand ARS (address range scrub).

   Linux uses the results of the ACPI ARS commands to track bad blocks
   in pmem devices.  When latent errors are detected we re-scrub the
   media to refresh the bad block list, userspace can also request a
   re-scrub at any time.

 - Support for the Microsoft DSM (device specific method) command
   format.

 - Support for EDK2/OVMF virtual disk device memory ranges.

 - Various fixes and cleanups across the subsystem.

* tag 'libnvdimm-for-4.8' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (41 commits)
  libnvdimm-btt: Delete an unnecessary check before the function call "__nd_device_register"
  nfit: do an ARS scrub on hitting a latent media error
  nfit: move to nfit/ sub-directory
  nfit, libnvdimm: allow an ARS scrub to be triggered on demand
  libnvdimm: register nvdimm_bus devices with an nd_bus driver
  pmem: clarify a debug print in pmem_clear_poison
  x86/insn: remove pcommit
  Revert "KVM: x86: add pcommit support"
  nfit, tools/testing/nvdimm/: unify shutdown paths
  libnvdimm: move ->module to struct nvdimm_bus_descriptor
  nfit: cleanup acpi_nfit_init calling convention
  nfit: fix _FIT evaluation memory leak + use after free
  tools/testing/nvdimm: add manufacturing_{date|location} dimm properties
  tools/testing/nvdimm: add virtual ramdisk range
  acpi, nfit: treat virtual ramdisk SPA as pmem region
  pmem: kill __pmem address space
  pmem: kill wmb_pmem()
  libnvdimm, pmem: use nvdimm_flush() for namespace I/O writes
  fs/dax: remove wmb_pmem()
  libnvdimm, pmem: flush posted-write queues on shutdown
  ...
parents d94ba9e7 0606263f
......@@ -395,7 +395,7 @@ prototypes:
int (*release) (struct gendisk *, fmode_t);
int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
int (*direct_access) (struct block_device *, sector_t, void __pmem **,
int (*direct_access) (struct block_device *, sector_t, void **,
unsigned long *);
int (*media_changed) (struct gendisk *);
void (*unlock_native_capacity) (struct gendisk *);
......
......@@ -256,28 +256,18 @@ If any of these error conditions are encountered, the arena is put into a read
only state using a flag in the info block.
5. In-kernel usage
==================
5. Usage
========
Any block driver that supports byte granularity IO to the storage may register
with the BTT. It will have to provide the rw_bytes interface in its
block_device_operations struct:
The BTT can be set up on any disk (namespace) exposed by the libnvdimm subsystem
(pmem, or blk mode). The easiest way to set up such a namespace is using the
'ndctl' utility [1]:
int (*rw_bytes)(struct gendisk *, void *, size_t, off_t, int rw);
For example, the ndctl command line to setup a btt with a 4k sector size is:
It may register with the BTT after it adds its own gendisk, using btt_init:
ndctl create-namespace -f -e namespace0.0 -m sector -l 4k
struct btt *btt_init(struct gendisk *disk, unsigned long long rawsize,
u32 lbasize, u8 uuid[], int maxlane);
See ndctl create-namespace --help for more options.
note that maxlane is the maximum amount of concurrency the driver wishes to
allow the BTT to use.
The BTT 'disk' appears as a stacked block device that grabs the underlying block
device in the O_EXCL mode.
When the driver wishes to remove the backing disk, it should similarly call
btt_fini using the same struct btt* handle that was provided to it by btt_init.
void btt_fini(struct btt *btt);
[1]: https://github.com/pmem/ndctl
......@@ -143,12 +143,12 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio)
*/
static long
axon_ram_direct_access(struct block_device *device, sector_t sector,
void __pmem **kaddr, pfn_t *pfn, long size)
void **kaddr, pfn_t *pfn, long size)
{
struct axon_ram_bank *bank = device->bd_disk->private_data;
loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT;
*kaddr = (void __pmem __force *) bank->io_addr + offset;
*kaddr = (void *) bank->io_addr + offset;
*pfn = phys_to_pfn_t(bank->ph_addr + offset, PFN_DEV);
return bank->size - offset;
}
......
......@@ -225,7 +225,6 @@
#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */
#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */
#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */
#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */
#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */
......
......@@ -26,13 +26,11 @@
* @n: length of the copy in bytes
*
* Copy data to persistent memory media via non-temporal stores so that
* a subsequent arch_wmb_pmem() can flush cpu and memory controller
* write buffers to guarantee durability.
* a subsequent pmem driver flush operation will drain posted write queues.
*/
static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
size_t n)
static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n)
{
int unwritten;
int rem;
/*
* We are copying between two kernel buffers, if
......@@ -40,59 +38,36 @@ static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
* fault) we would have already reported a general protection fault
* before the WARN+BUG.
*/
unwritten = __copy_from_user_inatomic_nocache((void __force *) dst,
(void __user *) src, n);
if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n",
__func__, dst, src, unwritten))
rem = __copy_from_user_inatomic_nocache(dst, (void __user *) src, n);
if (WARN(rem, "%s: fault copying %p <- %p unwritten: %d\n",
__func__, dst, src, rem))
BUG();
}
static inline int arch_memcpy_from_pmem(void *dst, const void __pmem *src,
size_t n)
static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n)
{
if (static_cpu_has(X86_FEATURE_MCE_RECOVERY))
return memcpy_mcsafe(dst, (void __force *) src, n);
memcpy(dst, (void __force *) src, n);
return memcpy_mcsafe(dst, src, n);
memcpy(dst, src, n);
return 0;
}
/**
* arch_wmb_pmem - synchronize writes to persistent memory
*
* After a series of arch_memcpy_to_pmem() operations this drains data
* from cpu write buffers and any platform (memory controller) buffers
* to ensure that written data is durable on persistent memory media.
*/
static inline void arch_wmb_pmem(void)
{
/*
* wmb() to 'sfence' all previous writes such that they are
* architecturally visible to 'pcommit'. Note, that we've
* already arranged for pmem writes to avoid the cache via
* arch_memcpy_to_pmem().
*/
wmb();
pcommit_sfence();
}
/**
* arch_wb_cache_pmem - write back a cache range with CLWB
* @vaddr: virtual start address
* @size: number of bytes to write back
*
* Write back a cache range using the CLWB (cache line write back)
* instruction. This function requires explicit ordering with an
* arch_wmb_pmem() call.
* instruction.
*/
static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size)
static inline void arch_wb_cache_pmem(void *addr, size_t size)
{
u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
unsigned long clflush_mask = x86_clflush_size - 1;
void *vaddr = (void __force *)addr;
void *vend = vaddr + size;
void *vend = addr + size;
void *p;
for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
for (p = (void *)((unsigned long)addr & ~clflush_mask);
p < vend; p += x86_clflush_size)
clwb(p);
}
......@@ -113,16 +88,14 @@ static inline bool __iter_needs_pmem_wb(struct iov_iter *i)
* @i: iterator with source data
*
* Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'.
* This function requires explicit ordering with an arch_wmb_pmem() call.
*/
static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes,
static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes,
struct iov_iter *i)
{
void *vaddr = (void __force *)addr;
size_t len;
/* TODO: skip the write-back by always using non-temporal stores */
len = copy_from_iter_nocache(vaddr, bytes, i);
len = copy_from_iter_nocache(addr, bytes, i);
if (__iter_needs_pmem_wb(i))
arch_wb_cache_pmem(addr, bytes);
......@@ -136,28 +109,16 @@ static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes,
* @size: number of bytes to zero
*
* Write zeros into the memory range starting at 'addr' for 'size' bytes.
* This function requires explicit ordering with an arch_wmb_pmem() call.
*/
static inline void arch_clear_pmem(void __pmem *addr, size_t size)
static inline void arch_clear_pmem(void *addr, size_t size)
{
void *vaddr = (void __force *)addr;
memset(vaddr, 0, size);
memset(addr, 0, size);
arch_wb_cache_pmem(addr, size);
}
static inline void arch_invalidate_pmem(void __pmem *addr, size_t size)
static inline void arch_invalidate_pmem(void *addr, size_t size)
{
clflush_cache_range((void __force *) addr, size);
}
static inline bool __arch_has_wmb_pmem(void)
{
/*
* We require that wmb() be an 'sfence', that is only guaranteed on
* 64-bit builds
*/
return static_cpu_has(X86_FEATURE_PCOMMIT);
clflush_cache_range(addr, size);
}
#endif /* CONFIG_ARCH_HAS_PMEM_API */
#endif /* __ASM_X86_PMEM_H__ */
......@@ -253,52 +253,6 @@ static inline void clwb(volatile void *__p)
: [pax] "a" (p));
}
/**
* pcommit_sfence() - persistent commit and fence
*
* The PCOMMIT instruction ensures that data that has been flushed from the
* processor's cache hierarchy with CLWB, CLFLUSHOPT or CLFLUSH is accepted to
* memory and is durable on the DIMM. The primary use case for this is
* persistent memory.
*
* This function shows how to properly use CLWB/CLFLUSHOPT/CLFLUSH and PCOMMIT
* with appropriate fencing.
*
* Example:
* void flush_and_commit_buffer(void *vaddr, unsigned int size)
* {
* unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
* void *vend = vaddr + size;
* void *p;
*
* for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
* p < vend; p += boot_cpu_data.x86_clflush_size)
* clwb(p);
*
* // SFENCE to order CLWB/CLFLUSHOPT/CLFLUSH cache flushes
* // MFENCE via mb() also works
* wmb();
*
* // PCOMMIT and the required SFENCE for ordering
* pcommit_sfence();
* }
*
* After this function completes the data pointed to by 'vaddr' has been
* accepted to memory and will be durable if the 'vaddr' points to persistent
* memory.
*
* PCOMMIT must always be ordered by an MFENCE or SFENCE, so to help simplify
* things we include both the PCOMMIT and the required SFENCE in the
* alternatives generated by pcommit_sfence().
*/
static inline void pcommit_sfence(void)
{
alternative(ASM_NOP7,
".byte 0x66, 0x0f, 0xae, 0xf8\n\t" /* pcommit */
"sfence",
X86_FEATURE_PCOMMIT);
}
#define nop() asm volatile ("nop")
......
......@@ -72,7 +72,6 @@
#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
#define SECONDARY_EXEC_ENABLE_PML 0x00020000
#define SECONDARY_EXEC_XSAVES 0x00100000
#define SECONDARY_EXEC_PCOMMIT 0x00200000
#define SECONDARY_EXEC_TSC_SCALING 0x02000000
#define PIN_BASED_EXT_INTR_MASK 0x00000001
......
......@@ -78,7 +78,6 @@
#define EXIT_REASON_PML_FULL 62
#define EXIT_REASON_XSAVES 63
#define EXIT_REASON_XRSTORS 64
#define EXIT_REASON_PCOMMIT 65
#define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
......@@ -127,8 +126,7 @@
{ EXIT_REASON_INVVPID, "INVVPID" }, \
{ EXIT_REASON_INVPCID, "INVPCID" }, \
{ EXIT_REASON_XSAVES, "XSAVES" }, \
{ EXIT_REASON_XRSTORS, "XRSTORS" }, \
{ EXIT_REASON_PCOMMIT, "PCOMMIT" }
{ EXIT_REASON_XRSTORS, "XRSTORS" }
#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1
#define VMX_ABORT_LOAD_HOST_MSR_FAIL 4
......
......@@ -366,7 +366,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT);
F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB);
/* cpuid 0xD.1.eax */
const u32 kvm_cpuid_D_1_eax_x86_features =
......
......@@ -144,14 +144,6 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu)
return best && (best->ebx & bit(X86_FEATURE_RTM));
}
static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
best = kvm_find_cpuid_entry(vcpu, 7, 0);
return best && (best->ebx & bit(X86_FEATURE_PCOMMIT));
}
static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
......
......@@ -2707,8 +2707,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_XSAVES |
SECONDARY_EXEC_PCOMMIT;
SECONDARY_EXEC_XSAVES;
if (enable_ept) {
/* nested EPT: emulate EPT also to L1 */
......@@ -3270,7 +3269,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_SHADOW_VMCS |
SECONDARY_EXEC_XSAVES |
SECONDARY_EXEC_ENABLE_PML |
SECONDARY_EXEC_PCOMMIT |
SECONDARY_EXEC_TSC_SCALING;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
......@@ -4858,9 +4856,6 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
if (!enable_pml)
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
/* Currently, we allow L1 guest to directly run pcommit instruction. */
exec_control &= ~SECONDARY_EXEC_PCOMMIT;
return exec_control;
}
......@@ -4904,9 +4899,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
if (cpu_has_secondary_exec_ctrls())
if (cpu_has_secondary_exec_ctrls()) {
vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
vmx_secondary_exec_control(vmx));
}
if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
vmcs_write64(EOI_EXIT_BITMAP0, 0);
......@@ -7564,13 +7560,6 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
return 1;
}
static int handle_pcommit(struct kvm_vcpu *vcpu)
{
/* we never catch pcommit instruct for L1 guest. */
WARN_ON(1);
return 1;
}
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
......@@ -7621,7 +7610,6 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_XSAVES] = handle_xsaves,
[EXIT_REASON_XRSTORS] = handle_xrstors,
[EXIT_REASON_PML_FULL] = handle_pml_full,
[EXIT_REASON_PCOMMIT] = handle_pcommit,
};
static const int kvm_vmx_max_exit_handlers =
......@@ -7930,8 +7918,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
* the XSS exit bitmap in vmcs12.
*/
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
case EXIT_REASON_PCOMMIT:
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT);
default:
return true;
}
......@@ -9094,15 +9080,6 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
if (cpu_has_secondary_exec_ctrls())
vmcs_set_secondary_exec_control(secondary_exec_ctl);
if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) {
if (guest_cpuid_has_pcommit(vcpu))
vmx->nested.nested_vmx_secondary_ctls_high |=
SECONDARY_EXEC_PCOMMIT;
else
vmx->nested.nested_vmx_secondary_ctls_high &=
~SECONDARY_EXEC_PCOMMIT;
}
}
static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
......@@ -9715,8 +9692,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_RDTSCP |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_PCOMMIT);
SECONDARY_EXEC_APIC_REGISTER_VIRT);
if (nested_cpu_has(vmcs12,
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
exec_control |= vmcs12->secondary_vm_exec_control;
......
......@@ -1012,7 +1012,7 @@ GrpTable: Grp15
4: XSAVE
5: XRSTOR | lfence (11B)
6: XSAVEOPT | clwb (66) | mfence (11B)
7: clflush | clflushopt (66) | sfence (11B) | pcommit (66),(11B)
7: clflush | clflushopt (66) | sfence (11B)
EndTable
GrpTable: Grp16
......
......@@ -454,32 +454,7 @@ config ACPI_REDUCED_HARDWARE_ONLY
If you are unsure what to do, do not enable this option.
config ACPI_NFIT
tristate "ACPI NVDIMM Firmware Interface Table (NFIT)"
depends on PHYS_ADDR_T_64BIT
depends on BLK_DEV
depends on ARCH_HAS_MMIO_FLUSH
select LIBNVDIMM
help
Infrastructure to probe ACPI 6 compliant platforms for
NVDIMMs (NFIT) and register a libnvdimm device tree. In
addition to storage devices this also enables libnvdimm to pass
ACPI._DSM messages for platform/dimm configuration.
To compile this driver as a module, choose M here:
the module will be called nfit.
config ACPI_NFIT_DEBUG
bool "NFIT DSM debug"
depends on ACPI_NFIT
depends on DYNAMIC_DEBUG
default n
help
Enabling this option causes the nfit driver to dump the
input and output buffers of _DSM operations on the ACPI0012
device and its children. This can be very verbose, so leave
it disabled unless you are debugging a hardware / firmware
issue.
source "drivers/acpi/nfit/Kconfig"
source "drivers/acpi/apei/Kconfig"
source "drivers/acpi/dptf/Kconfig"
......
......@@ -69,7 +69,7 @@ obj-$(CONFIG_ACPI_PCI_SLOT) += pci_slot.o
obj-$(CONFIG_ACPI_PROCESSOR) += processor.o
obj-$(CONFIG_ACPI) += container.o
obj-$(CONFIG_ACPI_THERMAL) += thermal.o
obj-$(CONFIG_ACPI_NFIT) += nfit.o
obj-$(CONFIG_ACPI_NFIT) += nfit/
obj-$(CONFIG_ACPI) += acpi_memhotplug.o
obj-$(CONFIG_ACPI_HOTPLUG_IOAPIC) += ioapic.o
obj-$(CONFIG_ACPI_BATTERY) += battery.o
......
config ACPI_NFIT
tristate "ACPI NVDIMM Firmware Interface Table (NFIT)"
depends on PHYS_ADDR_T_64BIT
depends on BLK_DEV
depends on ARCH_HAS_MMIO_FLUSH
select LIBNVDIMM
help
Infrastructure to probe ACPI 6 compliant platforms for
NVDIMMs (NFIT) and register a libnvdimm device tree. In
addition to storage devices this also enables libnvdimm to pass
ACPI._DSM messages for platform/dimm configuration.
To compile this driver as a module, choose M here:
the module will be called nfit.
config ACPI_NFIT_DEBUG
bool "NFIT DSM debug"
depends on ACPI_NFIT
depends on DYNAMIC_DEBUG
default n
help
Enabling this option causes the nfit driver to dump the
input and output buffers of _DSM operations on the ACPI0012
device and its children. This can be very verbose, so leave
it disabled unless you are debugging a hardware / firmware
issue.
obj-$(CONFIG_ACPI_NFIT) := nfit.o
nfit-y := core.o
nfit-$(CONFIG_X86_MCE) += mce.o
/*
* NFIT - Machine Check Handler
*
* Copyright(c) 2013-2016 Intel Corporation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/notifier.h>
#include <linux/acpi.h>
#include <asm/mce.h>
#include "nfit.h"
static int nfit_handle_mce(struct notifier_block *nb, unsigned long val,
void *data)
{
struct mce *mce = (struct mce *)data;
struct acpi_nfit_desc *acpi_desc;
struct nfit_spa *nfit_spa;
/* We only care about memory errors */
if (!(mce->status & MCACOD))
return NOTIFY_DONE;
/*
* mce->addr contains the physical addr accessed that caused the
* machine check. We need to walk through the list of NFITs, and see
* if any of them matches that address, and only then start a scrub.
*/
mutex_lock(&acpi_desc_lock);
list_for_each_entry(acpi_desc, &acpi_descs, list) {
struct device *dev = acpi_desc->dev;
int found_match = 0;
mutex_lock(&acpi_desc->init_mutex);
list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
struct acpi_nfit_system_address *spa = nfit_spa->spa;
if (nfit_spa_type(spa) == NFIT_SPA_PM)
continue;
/* find the spa that covers the mce addr */
if (spa->address > mce->addr)
continue;
if ((spa->address + spa->length - 1) < mce->addr)
continue;
found_match = 1;
dev_dbg(dev, "%s: addr in SPA %d (0x%llx, 0x%llx)\n",
__func__, spa->range_index, spa->address,
spa->length);
/*
* We can break at the first match because we're going
* to rescan all the SPA ranges. There shouldn't be any
* aliasing anyway.
*/
break;
}
mutex_unlock(&acpi_desc->init_mutex);
/*
* We can ignore an -EBUSY here because if an ARS is already
* in progress, just let that be the last authoritative one
*/
if (found_match)
acpi_nfit_ars_rescan(acpi_desc);
}
mutex_unlock(&acpi_desc_lock);
return NOTIFY_DONE;
}
static struct notifier_block nfit_mce_dec = {
.notifier_call = nfit_handle_mce,
};
void nfit_mce_register(void)
{
mce_register_decode_chain(&nfit_mce_dec);
}
void nfit_mce_unregister(void)
{
mce_unregister_decode_chain(&nfit_mce_dec);
}
<
......@@ -16,6 +16,7 @@
#define __NFIT_H__
#include <linux/workqueue.h>
#include <linux/libnvdimm.h>
#include <linux/ndctl.h>
#include <linux/types.h>
#include <linux/uuid.h>
#include <linux/acpi.h>
......@@ -31,6 +32,9 @@
#define UUID_NFIT_DIMM_N_HPE1 "9002c334-acf3-4c0e-9642-a235f0d53bc6"
#define UUID_NFIT_DIMM_N_HPE2 "5008664b-b758-41a0-a03c-27c2f2d04f7e"
/* https://msdn.microsoft.com/library/windows/hardware/mt604741 */
#define UUID_NFIT_DIMM_N_MSFT "1ee68b36-d4bd-4a1a-9a16-4f8e53d46e05"
#define ACPI_NFIT_MEM_FAILED_MASK (ACPI_NFIT_MEM_SAVE_FAILED \
| ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \
| ACPI_NFIT_MEM_NOT_ARMED)
......@@ -40,6 +44,7 @@ enum nfit_uuids {
NFIT_DEV_DIMM = NVDIMM_FAMILY_INTEL,
NFIT_DEV_DIMM_N_HPE1 = NVDIMM_FAMILY_HPE1,
NFIT_DEV_DIMM_N_HPE2 = NVDIMM_FAMILY_HPE2,
NFIT_DEV_DIMM_N_MSFT = NVDIMM_FAMILY_MSFT,
NFIT_SPA_VOLATILE,
NFIT_SPA_PM,
NFIT_SPA_DCR,
......@@ -74,37 +79,37 @@ enum {
};
struct nfit_spa {
struct acpi_nfit_system_address *spa;
struct list_head list;
struct nd_region *nd_region;
unsigned int ars_done:1;
unsigned int ars_required:1;
u32 clear_err_unit;
u32 max_ars;
struct acpi_nfit_system_address spa[0];
};
struct nfit_dcr {
struct acpi_nfit_control_region *dcr;
struct list_head list;
struct acpi_nfit_control_region dcr[0];
};
struct nfit_bdw {
struct acpi_nfit_data_region *bdw;
struct list_head list;
struct acpi_nfit_data_region bdw[0];
};
struct nfit_idt {
struct acpi_nfit_interleave *idt;
struct list_head list;
struct acpi_nfit_interleave idt[0];
};
struct nfit_flush {
struct acpi_nfit_flush_address *flush;
struct list_head list;
struct acpi_nfit_flush_address flush[0];
};
struct nfit_memdev {
struct acpi_nfit_memory_map *memdev;
struct list_head list;
struct acpi_nfit_memory_map memdev[0];
};
/* assembled tables for a given dimm/memory-device */
......@@ -123,6 +128,7 @@ struct nfit_mem {
struct list_head list;
struct acpi_device *adev;
struct acpi_nfit_desc *acpi_desc;
struct resource *flush_wpq;
unsigned long dsm_mask;
int family;
};
......@@ -130,10 +136,7 @@ struct nfit_mem {
struct acpi_nfit_desc {
struct nvdimm_bus_descriptor nd_desc;
struct acpi_table_header acpi_header;
struct acpi_nfit_header *nfit;
struct mutex spa_map_mutex;
struct mutex init_mutex;
struct list_head spa_maps;
struct list_head memdevs;
struct list_head flushes;
struct list_head dimms;
......@@ -146,6 +149,9 @@ struct acpi_nfit_desc {
struct nd_cmd_ars_status *ars_status;
size_t ars_status_size;
struct work_struct work;
struct list_head list;
struct kernfs_node *scrub_count_state;
unsigned int scrub_count;
unsigned int cancel:1;
unsigned long dimm_cmd_force_en;
unsigned long bus_cmd_force_en;
......@@ -161,7 +167,7 @@ enum nd_blk_mmio_selector {
struct nd_blk_addr {
union {
void __iomem *base;
void __pmem *aperture;
void *aperture;
};
};
......@@ -180,28 +186,26 @@ struct nfit_blk {
u64 bdw_offset; /* post interleave offset */
u64 stat_offset;
u64 cmd_offset;
void __iomem *nvdimm_flush;
u32 dimm_flags;
};
enum spa_map_type {
SPA_MAP_CONTROL,
SPA_MAP_APERTURE,
};
struct nfit_spa_mapping {
struct acpi_nfit_desc *acpi_desc;
struct acpi_nfit_system_address *spa;
struct list_head list;
struct kref kref;
enum spa_map_type type;
struct nd_blk_addr addr;
};
extern struct list_head acpi_descs;
extern struct mutex acpi_desc_lock;
int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc);
static inline struct nfit_spa_mapping *to_spa_map(struct kref *kref)
#ifdef CONFIG_X86_MCE
void nfit_mce_register(void);
void nfit_mce_unregister(void);
#else
static inline void nfit_mce_register(void)
{
return container_of(kref, struct nfit_spa_mapping, kref);
}