Commit 3e6bdf47 authored by Linus Torvalds

Merge git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86

* git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86:
  x86: fix deadlock, make pgd_lock irq-safe
  virtio: fix trivial build bug
  x86: fix mttr trimming
  x86: delay CPA self-test and repeat it
  x86: fix 64-bit sections
  generic: add __FINITDATA
  x86: remove suprious ifdefs from pageattr.c
  x86: mark the .rodata section also NX
  x86: fix iret exception recovery on 64-bit
  cpuidle: dubious one-bit signed bitfield in cpuidle.h
  x86: fix sparse warnings in powernow-k8.c
  x86: fix sparse error in traps_32.c
  x86: trivial sparse/checkpatch in quirks.c
  x86 ptrace: disallow null cs/ss
  MAINTAINERS: RDC R-321x SoC maintainer
  brk randomization: introduce CONFIG_COMPAT_BRK
  brk: check the lower bound properly
  x86: remove X2 workaround
  x86: make spurious fault handler aware of large mappings
  x86: make traps on entry code be debuggable in user space, 64-bit
parents 3d4d4582 58d5d0d8
......@@ -3224,6 +3224,12 @@ M: mporter@kernel.crashing.org
L: linux-kernel@vger.kernel.org
S: Maintained
RDC R-321X SoC
P: Florian Fainelli
M: florian.fainelli@telecomint.eu
L: linux-kernel@vger.kernel.org
S: Maintained
RDC R6040 FAST ETHERNET DRIVER
P: Florian Fainelli
M: florian.fainelli@telecomint.eu
......
......@@ -220,9 +220,9 @@ config DEBUG_BOOT_PARAMS
This option will cause struct boot_params to be exported via debugfs.
config CPA_DEBUG
bool "CPA self test code"
bool "CPA self-test code"
depends on DEBUG_KERNEL
help
Do change_page_attr self tests at boot.
Do change_page_attr() self-tests every 30 seconds.
endmenu
......@@ -827,7 +827,6 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpuf
for (i = 0; i < data->acpi_data.state_count; i++) {
u32 index;
u32 hi = 0, lo = 0;
index = data->acpi_data.states[i].control & HW_PSTATE_MASK;
if (index > data->max_hw_pstate) {
......
......@@ -659,7 +659,7 @@ static __init int amd_special_default_mtrr(void)
*/
int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
{
unsigned long i, base, size, highest_addr = 0, def, dummy;
unsigned long i, base, size, highest_pfn = 0, def, dummy;
mtrr_type type;
u64 trim_start, trim_size;
......@@ -682,28 +682,27 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
mtrr_if->get(i, &base, &size, &type);
if (type != MTRR_TYPE_WRBACK)
continue;
base <<= PAGE_SHIFT;
size <<= PAGE_SHIFT;
if (highest_addr < base + size)
highest_addr = base + size;
if (highest_pfn < base + size)
highest_pfn = base + size;
}
/* kvm/qemu doesn't have mtrr set right, don't trim them all */
if (!highest_addr) {
if (!highest_pfn) {
printk(KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n");
WARN_ON(1);
return 0;
}
if ((highest_addr >> PAGE_SHIFT) < end_pfn) {
if (highest_pfn < end_pfn) {
printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
" all of memory, losing %LdMB of RAM.\n",
(((u64)end_pfn << PAGE_SHIFT) - highest_addr) >> 20);
" all of memory, losing %luMB of RAM.\n",
(end_pfn - highest_pfn) >> (20 - PAGE_SHIFT));
WARN_ON(1);
printk(KERN_INFO "update e820 for mtrr\n");
trim_start = highest_addr;
trim_start = highest_pfn;
trim_start <<= PAGE_SHIFT;
trim_size = end_pfn;
trim_size <<= PAGE_SHIFT;
trim_size -= trim_start;
......
......@@ -582,7 +582,6 @@ retint_restore_args: /* return to kernel space */
TRACE_IRQS_IRETQ
restore_args:
RESTORE_ARGS 0,8,0
iret_label:
#ifdef CONFIG_PARAVIRT
INTERRUPT_RETURN
#endif
......@@ -593,13 +592,22 @@ ENTRY(native_iret)
.quad native_iret, bad_iret
.previous
.section .fixup,"ax"
/* force a signal here? this matches i386 behaviour */
/* running with kernel gs */
bad_iret:
movq $11,%rdi /* SIGSEGV */
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI))
jmp do_exit
/*
* The iret traps when the %cs or %ss being restored is bogus.
* We've lost the original trap vector and error code.
* #GPF is the most likely one to get for an invalid selector.
* So pretend we completed the iret and took the #GPF in user mode.
*
* We are now running with the kernel GS after exception recovery.
* But error_entry expects us to have user GS to match the user %cs,
* so swap back.
*/
pushq $0
SWAPGS
jmp general_protection
.previous
/* edi: workmask, edx: work */
......@@ -911,7 +919,7 @@ error_kernelspace:
iret run with kernel gs again, so don't set the user space flag.
B stepping K8s sometimes report a truncated RIP for IRET
exceptions returning to compat mode. Check for these here too. */
leaq iret_label(%rip),%rbp
leaq native_iret(%rip),%rbp
cmpq %rbp,RIP(%rsp)
je error_swapgs
movl %ebp,%ebp /* zero extend */
......
......@@ -250,18 +250,13 @@ ENTRY(secondary_startup_64)
lretq
/* SMP bootup changes these two */
#ifndef CONFIG_HOTPLUG_CPU
.pushsection .init.data
#endif
__CPUINITDATA
.align 8
.globl initial_code
initial_code:
ENTRY(initial_code)
.quad x86_64_start_kernel
#ifndef CONFIG_HOTPLUG_CPU
.popsection
#endif
.globl init_rsp
init_rsp:
__FINITDATA
ENTRY(init_rsp)
.quad init_thread_union+THREAD_SIZE-8
bad_address:
......
......@@ -103,9 +103,26 @@ static int set_segment_reg(struct task_struct *task,
if (invalid_selector(value))
return -EIO;
if (offset != offsetof(struct user_regs_struct, gs))
/*
* For %cs and %ss we cannot permit a null selector.
* We can permit a bogus selector as long as it has USER_RPL.
* Null selectors are fine for other segment registers, but
* we will never get back to user mode with invalid %cs or %ss
* and will take the trap in iret instead. Much code relies
* on user_mode() to distinguish a user trap frame (which can
* safely use invalid selectors) from a kernel trap frame.
*/
switch (offset) {
case offsetof(struct user_regs_struct, cs):
case offsetof(struct user_regs_struct, ss):
if (unlikely(value == 0))
return -EIO;
default:
*pt_regs_access(task_pt_regs(task), offset) = value;
else {
break;
case offsetof(struct user_regs_struct, gs):
task->thread.gs = value;
if (task == current)
/*
......@@ -227,12 +244,16 @@ static int set_segment_reg(struct task_struct *task,
* Can't actually change these in 64-bit mode.
*/
case offsetof(struct user_regs_struct,cs):
if (unlikely(value == 0))
return -EIO;
#ifdef CONFIG_IA32_EMULATION
if (test_tsk_thread_flag(task, TIF_IA32))
task_pt_regs(task)->cs = value;
#endif
break;
case offsetof(struct user_regs_struct,ss):
if (unlikely(value == 0))
return -EIO;
#ifdef CONFIG_IA32_EMULATION
if (test_tsk_thread_flag(task, TIF_IA32))
task_pt_regs(task)->ss = value;
......
......@@ -380,19 +380,19 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0367,
void force_hpet_resume(void)
{
switch (force_hpet_resume_type) {
case ICH_FORCE_HPET_RESUME:
return ich_force_hpet_resume();
case OLD_ICH_FORCE_HPET_RESUME:
return old_ich_force_hpet_resume();
case VT8237_FORCE_HPET_RESUME:
return vt8237_force_hpet_resume();
case NVIDIA_FORCE_HPET_RESUME:
return nvidia_force_hpet_resume();
default:
case ICH_FORCE_HPET_RESUME:
ich_force_hpet_resume();
return;
case OLD_ICH_FORCE_HPET_RESUME:
old_ich_force_hpet_resume();
return;
case VT8237_FORCE_HPET_RESUME:
vt8237_force_hpet_resume();
return;
case NVIDIA_FORCE_HPET_RESUME:
nvidia_force_hpet_resume();
return;
default:
break;
}
}
......
......@@ -139,7 +139,6 @@ static int test_NX(void)
* Until then, don't run them to avoid too many people getting scared
* by the error message
*/
#if 0
#ifdef CONFIG_DEBUG_RODATA
/* Test 3: Check if the .rodata section is executable */
......@@ -152,6 +151,7 @@ static int test_NX(void)
}
#endif
#if 0
/* Test 4: Check if the .data section of a module is executable */
if (test_address(&test_data)) {
printk(KERN_ERR "test_nx: .data section is executable\n");
......
......@@ -1176,17 +1176,12 @@ void __init trap_init(void)
#endif
set_trap_gate(19,&simd_coprocessor_error);
/*
* Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
* Generate a build-time error if the alignment is wrong.
*/
BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15);
if (cpu_has_fxsr) {
/*
* Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
* Generates a compile-time "error: zero width for bit-field" if
* the alignment is wrong.
*/
struct fxsrAlignAssert {
int _:!(offsetof(struct task_struct,
thread.i387.fxsave) & 15);
};
printk(KERN_INFO "Enabling fast FPU save and restore... ");
set_in_cr4(X86_CR4_OSFXSR);
printk("done.\n");
......
......@@ -428,6 +428,16 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
}
#endif
/*
 * Check whether the access described by a page-fault error code is
 * permitted by a page-table entry.
 *
 * @error_code: fault error code; only the PF_WRITE and PF_INSTR bits
 *              are examined here.
 * @pte:        entry to test against.
 *
 * Returns 0 when the fault was a write and the entry is not writable,
 * or an instruction fetch and the entry is not executable; returns 1
 * otherwise.
 */
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
	/* Write check first, then exec check; either mismatch rejects. */
	if (((error_code & PF_WRITE) && !pte_write(*pte)) ||
	    ((error_code & PF_INSTR) && !pte_exec(*pte)))
		return 0;

	return 1;
}
/*
* Handle a spurious fault caused by a stale TLB entry. This allows
* us to lazily refresh the TLB when increasing the permissions of a
......@@ -457,20 +467,21 @@ static int spurious_fault(unsigned long address,
if (!pud_present(*pud))
return 0;
if (pud_large(*pud))
return spurious_fault_check(error_code, (pte_t *) pud);
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return 0;
if (pmd_large(*pmd))
return spurious_fault_check(error_code, (pte_t *) pmd);
pte = pte_offset_kernel(pmd, address);
if (!pte_present(*pte))
return 0;
if ((error_code & PF_WRITE) && !pte_write(*pte))
return 0;
if ((error_code & PF_INSTR) && !pte_exec(*pte))
return 0;
return 1;
return spurious_fault_check(error_code, pte);
}
/*
......@@ -947,11 +958,12 @@ void vmalloc_sync_all(void)
for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
if (!test_bit(pgd_index(address), insync)) {
const pgd_t *pgd_ref = pgd_offset_k(address);
unsigned long flags;
struct page *page;
if (pgd_none(*pgd_ref))
continue;
spin_lock(&pgd_lock);
spin_lock_irqsave(&pgd_lock, flags);
list_for_each_entry(page, &pgd_list, lru) {
pgd_t *pgd;
pgd = (pgd_t *)page_address(page) + pgd_index(address);
......@@ -960,7 +972,7 @@ void vmalloc_sync_all(void)
else
BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
}
spin_unlock(&pgd_lock);
spin_unlock_irqrestore(&pgd_lock, flags);
set_bit(pgd_index(address), insync);
}
if (address == start)
......
......@@ -591,10 +591,17 @@ void mark_rodata_ro(void)
if (end <= start)
return;
set_memory_ro(start, (end - start) >> PAGE_SHIFT);
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
(end - start) >> 10);
set_memory_ro(start, (end - start) >> PAGE_SHIFT);
/*
* The rodata section (but not the kernel text!) should also be
* not-executable.
*/
start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
set_memory_nx(start, (end - start) >> PAGE_SHIFT);
rodata_test();
......
......@@ -5,6 +5,7 @@
* and compares page tables forwards and afterwards.
*/
#include <linux/bootmem.h>
#include <linux/kthread.h>
#include <linux/random.h>
#include <linux/kernel.h>
#include <linux/init.h>
......@@ -14,8 +15,13 @@
#include <asm/pgtable.h>
#include <asm/kdebug.h>
/*
* Only print the results of the first pass:
*/
static __read_mostly int print = 1;
enum {
NTEST = 4000,
NTEST = 400,
#ifdef CONFIG_X86_64
LPS = (1 << PMD_SHIFT),
#elif defined(CONFIG_X86_PAE)
......@@ -31,7 +37,7 @@ struct split_state {
long min_exec, max_exec;
};
static __init int print_split(struct split_state *s)
static int print_split(struct split_state *s)
{
long i, expected, missed = 0;
int printed = 0;
......@@ -82,10 +88,13 @@ static __init int print_split(struct split_state *s)
s->max_exec = addr;
}
}
printk(KERN_INFO
"CPA mapping 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n",
s->spg, s->lpg, s->gpg, s->exec,
s->min_exec != ~0UL ? s->min_exec : 0, s->max_exec, missed);
if (print) {
printk(KERN_INFO
" 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n",
s->spg, s->lpg, s->gpg, s->exec,
s->min_exec != ~0UL ? s->min_exec : 0,
s->max_exec, missed);
}
expected = (s->gpg*GPS + s->lpg*LPS)/PAGE_SIZE + s->spg + missed;
if (expected != i) {
......@@ -96,11 +105,11 @@ static __init int print_split(struct split_state *s)
return err;
}
static unsigned long __initdata addr[NTEST];
static unsigned int __initdata len[NTEST];
static unsigned long addr[NTEST];
static unsigned int len[NTEST];
/* Change the global bit on random pages in the direct mapping */
static __init int exercise_pageattr(void)
static int pageattr_test(void)
{
struct split_state sa, sb, sc;
unsigned long *bm;
......@@ -110,7 +119,8 @@ static __init int exercise_pageattr(void)
int i, k;
int err;
printk(KERN_INFO "CPA exercising pageattr\n");
if (print)
printk(KERN_INFO "CPA self-test:\n");
bm = vmalloc((max_pfn_mapped + 7) / 8);
if (!bm) {
......@@ -186,7 +196,6 @@ static __init int exercise_pageattr(void)
failed += print_split(&sb);
printk(KERN_INFO "CPA reverting everything\n");
for (i = 0; i < NTEST; i++) {
if (!addr[i])
continue;
......@@ -214,12 +223,40 @@ static __init int exercise_pageattr(void)
failed += print_split(&sc);
if (failed) {
printk(KERN_ERR "CPA selftests NOT PASSED. Please report.\n");
printk(KERN_ERR "NOT PASSED. Please report.\n");
WARN_ON(1);
return -EINVAL;
} else {
printk(KERN_INFO "CPA selftests PASSED\n");
if (print)
printk(KERN_INFO "ok.\n");
}
return 0;
}
module_init(exercise_pageattr);
/*
 * Thread body for the periodic CPA self-test.
 *
 * Until the thread is asked to stop, sleep interruptibly with a timeout
 * of HZ*30 and then run pageattr_test().  The loop ends on the first
 * pass that reports a failure (negative return).  After every passing
 * run, 'print' is counted down until it reaches zero.
 *
 * @__unused: thread argument, not used.
 *
 * Always returns 0.
 */
static int do_pageattr_test(void *__unused)
{
	for (;;) {
		if (kthread_should_stop())
			break;

		schedule_timeout_interruptible(HZ*30);

		/* A failed pass ends the periodic testing for good. */
		if (pageattr_test() < 0)
			break;

		if (print != 0)
			print -= 1;
	}

	return 0;
}
/*
 * Create the "pageattr-test" kernel thread running do_pageattr_test()
 * and wake it up.  If thread creation fails, emit a warning instead.
 *
 * Always returns 0, whether or not the thread could be created.
 */
static int start_pageattr_test(void)
{
	struct task_struct *tester;

	tester = kthread_create(do_pageattr_test, NULL, "pageattr-test");
	if (IS_ERR(tester)) {
		WARN_ON(1);
		return 0;
	}

	/* kthread_create() result is valid here; start the thread. */
	wake_up_process(tester);
	return 0;
}
module_init(start_pageattr_test);
......@@ -167,8 +167,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address)
if (within(address, virt_to_highmap(_text), virt_to_highmap(_etext)))
pgprot_val(forbidden) |= _PAGE_NX;
#ifdef CONFIG_DEBUG_RODATA
/* The .rodata section needs to be read-only */
if (within(address, (unsigned long)__start_rodata,
(unsigned long)__end_rodata))
......@@ -179,7 +177,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address)
if (within(address, virt_to_highmap(__start_rodata),
virt_to_highmap(__end_rodata)))
pgprot_val(forbidden) |= _PAGE_RW;
#endif
prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
......@@ -260,17 +257,6 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
pgprot_t old_prot, new_prot;
int level, do_split = 1;
/*
* An Athlon 64 X2 showed hard hangs if we tried to preserve
* largepages and changed the PSE entry from RW to RO.
*
* As AMD CPUs have a long series of erratas in this area,
* (and none of the known ones seem to explain this hang),
* disable this code until the hang can be debugged:
*/
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
return 1;
spin_lock_irqsave(&pgd_lock, flags);
/*
* Check for races, another CPU might have split this page
......
......@@ -1077,7 +1077,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
current->mm->start_stack = bprm->p;
#ifdef arch_randomize_brk
if (current->flags & PF_RANDOMIZE)
if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1))
current->mm->brk = current->mm->start_brk =
arch_randomize_brk(current->mm);
#endif
......
......@@ -42,19 +42,21 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
static inline void pgd_list_add(pgd_t *pgd)
{
struct page *page = virt_to_page(pgd);
unsigned long flags;
spin_lock(&pgd_lock);
spin_lock_irqsave(&pgd_lock, flags);
list_add(&page->lru, &pgd_list);
spin_unlock(&pgd_lock);
spin_unlock_irqrestore(&pgd_lock, flags);
}
static inline void pgd_list_del(pgd_t *pgd)
{
struct page *page = virt_to_page(pgd);
unsigned long flags;
spin_lock(&pgd_lock);
spin_lock_irqsave(&pgd_lock, flags);
list_del(&page->lru);
spin_unlock(&pgd_lock);
spin_unlock_irqrestore(&pgd_lock, flags);
}
static inline pgd_t *pgd_alloc(struct mm_struct *mm)
......
......@@ -79,7 +79,7 @@ struct cpuidle_state_kobj {
};
struct cpuidle_device {
int enabled:1;
unsigned int enabled:1;
unsigned int cpu;
int last_residency;
......
......@@ -110,6 +110,7 @@
#define __FINIT .previous
#define __INITDATA .section ".init.data","aw"
#define __FINITDATA .previous
#define __DEVINIT .section ".devinit.text", "ax"
#define __DEVINITDATA .section ".devinit.data", "aw"
......
......@@ -541,6 +541,18 @@ config ELF_CORE
help
Enable support for generating core dumps. Disabling saves about 4k.
config COMPAT_BRK
bool "Disable heap randomization"
default y
help
Randomizing heap placement makes heap exploits harder, but it
also breaks ancient binaries (including anything libc5 based).
This option changes the bootup default to heap randomization
disabled, and can be overridden at runtime by setting
/proc/sys/kernel/randomize_va_space to 2.
On non-ancient distros (post-2000 ones) N is usually a safe choice.
config BASE_FULL
default y
bool "Enable full-sized data structures for core" if EMBEDDED
......
......@@ -82,7 +82,18 @@ void * high_memory;
EXPORT_SYMBOL(num_physpages);
EXPORT_SYMBOL(high_memory);
int randomize_va_space __read_mostly = 1;
/*
* Randomize the address space (stacks, mmaps, brk, etc.).
*
* ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
* as ancient (libc5 based) binaries can segfault. )
*/
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
1;
#else
2;
#endif
static int __init disable_randmaps(char *s)
{
......
......@@ -245,7 +245,7 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
down_write(&mm->mmap_sem);
if (brk < mm->end_code)