Commit 9a0b5817 authored by Gerd Hoffmann's avatar Gerd Hoffmann Committed by Linus Torvalds
Browse files

[PATCH] x86: SMP alternatives



Implement SMP alternatives, i.e.  switching at runtime between different
code versions for UP and SMP.  The code can patch both SMP->UP and UP->SMP.
The UP->SMP case is useful for CPU hotplug.

With CONFIG_CPU_HOTPLUG enabled the code switches to UP at boot time and
when the number of CPUs goes down to 1, and switches to SMP when the number
of CPUs goes up to 2.

Without CONFIG_CPU_HOTPLUG or on non-SMP-capable systems the code is
patched once at boot time (if needed) and the tables are released
afterwards.

The changes in detail:

  * The current alternatives bits are moved to a separate file,
    the SMP alternatives code is added there.

  * The patch adds some new elf sections to the kernel:
    .smp_altinstructions
	like .altinstructions, also contains a list
	of alt_instr structs.
    .smp_altinstr_replacement
	like .altinstr_replacement, but also has some space to
	save original instruction before replaving it.
    .smp_locks
	list of pointers to lock prefixes which can be nop'ed
	out on UP.
    The first two are used to replace more complex instruction
    sequences such as spinlocks and semaphores.  It would be possible
    to deal with the lock prefixes with that as well, but by handling
    them as special case the table sizes become much smaller.

 * The sections are page-aligned and padded up to page size, so they
   can be free if they are not needed.

 * Splitted the code to release init pages to a separate function and
   use it to release the elf sections if they are unused.
Signed-off-by: default avatarGerd Hoffmann <kraxel@suse.de>
Signed-off-by: default avatarChuck Ebbert <76306.1226@compuserve.com>
Signed-off-by: default avatarAndrew Morton <akpm@osdl.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@osdl.org>
parent 4d7d8c82
......@@ -7,7 +7,7 @@ extra-y := head.o init_task.o vmlinux.lds
obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \
ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \
quirks.o i8237.o topology.o
quirks.o i8237.o topology.o alternative.o
obj-y += cpu/
obj-y += timers/
......
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <asm/alternative.h>
#include <asm/sections.h>
#define DEBUG 0
#if DEBUG
# define DPRINTK(fmt, args...) printk(fmt, args)
#else
# define DPRINTK(fmt, args...)
#endif
/* Use inline assembly to define this because the nops are defined
as inline assembly strings in the include files and we cannot
get them easily into strings. */
asm("\t.data\nintelnops: "
GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
GENERIC_NOP7 GENERIC_NOP8);
asm("\t.data\nk8nops: "
K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
K8_NOP7 K8_NOP8);
asm("\t.data\nk7nops: "
K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
K7_NOP7 K7_NOP8);
extern unsigned char intelnops[], k8nops[], k7nops[];
static unsigned char *intel_nops[ASM_NOP_MAX+1] = {
NULL,
intelnops,
intelnops + 1,
intelnops + 1 + 2,
intelnops + 1 + 2 + 3,
intelnops + 1 + 2 + 3 + 4,
intelnops + 1 + 2 + 3 + 4 + 5,
intelnops + 1 + 2 + 3 + 4 + 5 + 6,
intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
NULL,
k8nops,
k8nops + 1,
k8nops + 1 + 2,
k8nops + 1 + 2 + 3,
k8nops + 1 + 2 + 3 + 4,
k8nops + 1 + 2 + 3 + 4 + 5,
k8nops + 1 + 2 + 3 + 4 + 5 + 6,
k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
static unsigned char *k7_nops[ASM_NOP_MAX+1] = {
NULL,
k7nops,
k7nops + 1,
k7nops + 1 + 2,
k7nops + 1 + 2 + 3,
k7nops + 1 + 2 + 3 + 4,
k7nops + 1 + 2 + 3 + 4 + 5,
k7nops + 1 + 2 + 3 + 4 + 5 + 6,
k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
static struct nop {
int cpuid;
unsigned char **noptable;
} noptypes[] = {
{ X86_FEATURE_K8, k8_nops },
{ X86_FEATURE_K7, k7_nops },
{ -1, NULL }
};
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[];
extern u8 *__smp_locks[], *__smp_locks_end[];
extern u8 __smp_alt_begin[], __smp_alt_end[];
static unsigned char** find_nop_table(void)
{
unsigned char **noptable = intel_nops;
int i;
for (i = 0; noptypes[i].cpuid >= 0; i++) {
if (boot_cpu_has(noptypes[i].cpuid)) {
noptable = noptypes[i].noptable;
break;
}
}
return noptable;
}
/* Replace instructions with better alternatives for this CPU type.
This runs before SMP is initialized to avoid SMP problems with
self modifying code. This implies that assymetric systems where
APs have less capabilities than the boot processor are not handled.
Tough. Make sure you disable such features by hand. */
void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
{
unsigned char **noptable = find_nop_table();
struct alt_instr *a;
int diff, i, k;
DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
for (a = start; a < end; a++) {
BUG_ON(a->replacementlen > a->instrlen);
if (!boot_cpu_has(a->cpuid))
continue;
memcpy(a->instr, a->replacement, a->replacementlen);
diff = a->instrlen - a->replacementlen;
/* Pad the rest with nops */
for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
k = diff;
if (k > ASM_NOP_MAX)
k = ASM_NOP_MAX;
memcpy(a->instr + i, noptable[k], k);
}
}
}
static void alternatives_smp_save(struct alt_instr *start, struct alt_instr *end)
{
struct alt_instr *a;
DPRINTK("%s: alt table %p-%p\n", __FUNCTION__, start, end);
for (a = start; a < end; a++) {
memcpy(a->replacement + a->replacementlen,
a->instr,
a->instrlen);
}
}
static void alternatives_smp_apply(struct alt_instr *start, struct alt_instr *end)
{
struct alt_instr *a;
for (a = start; a < end; a++) {
memcpy(a->instr,
a->replacement + a->replacementlen,
a->instrlen);
}
}
static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
{
u8 **ptr;
for (ptr = start; ptr < end; ptr++) {
if (*ptr < text)
continue;
if (*ptr > text_end)
continue;
**ptr = 0xf0; /* lock prefix */
};
}
static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
{
unsigned char **noptable = find_nop_table();
u8 **ptr;
for (ptr = start; ptr < end; ptr++) {
if (*ptr < text)
continue;
if (*ptr > text_end)
continue;
**ptr = noptable[1][0];
};
}
struct smp_alt_module {
/* what is this ??? */
struct module *mod;
char *name;
/* ptrs to lock prefixes */
u8 **locks;
u8 **locks_end;
/* .text segment, needed to avoid patching init code ;) */
u8 *text;
u8 *text_end;
struct list_head next;
};
static LIST_HEAD(smp_alt_modules);
static DEFINE_SPINLOCK(smp_alt);
static int smp_alt_once = 0;
static int __init bootonly(char *str)
{
smp_alt_once = 1;
return 1;
}
__setup("smp-alt-boot", bootonly);
void alternatives_smp_module_add(struct module *mod, char *name,
void *locks, void *locks_end,
void *text, void *text_end)
{
struct smp_alt_module *smp;
unsigned long flags;
if (smp_alt_once) {
if (boot_cpu_has(X86_FEATURE_UP))
alternatives_smp_unlock(locks, locks_end,
text, text_end);
return;
}
smp = kzalloc(sizeof(*smp), GFP_KERNEL);
if (NULL == smp)
return; /* we'll run the (safe but slow) SMP code then ... */
smp->mod = mod;
smp->name = name;
smp->locks = locks;
smp->locks_end = locks_end;
smp->text = text;
smp->text_end = text_end;
DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
__FUNCTION__, smp->locks, smp->locks_end,
smp->text, smp->text_end, smp->name);
spin_lock_irqsave(&smp_alt, flags);
list_add_tail(&smp->next, &smp_alt_modules);
if (boot_cpu_has(X86_FEATURE_UP))
alternatives_smp_unlock(smp->locks, smp->locks_end,
smp->text, smp->text_end);
spin_unlock_irqrestore(&smp_alt, flags);
}
void alternatives_smp_module_del(struct module *mod)
{
struct smp_alt_module *item;
unsigned long flags;
if (smp_alt_once)
return;
spin_lock_irqsave(&smp_alt, flags);
list_for_each_entry(item, &smp_alt_modules, next) {
if (mod != item->mod)
continue;
list_del(&item->next);
spin_unlock_irqrestore(&smp_alt, flags);
DPRINTK("%s: %s\n", __FUNCTION__, item->name);
kfree(item);
return;
}
spin_unlock_irqrestore(&smp_alt, flags);
}
void alternatives_smp_switch(int smp)
{
struct smp_alt_module *mod;
unsigned long flags;
if (smp_alt_once)
return;
BUG_ON(!smp && (num_online_cpus() > 1));
spin_lock_irqsave(&smp_alt, flags);
if (smp) {
printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
clear_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
alternatives_smp_apply(__smp_alt_instructions,
__smp_alt_instructions_end);
list_for_each_entry(mod, &smp_alt_modules, next)
alternatives_smp_lock(mod->locks, mod->locks_end,
mod->text, mod->text_end);
} else {
printk(KERN_INFO "SMP alternatives: switching to UP code\n");
set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
apply_alternatives(__smp_alt_instructions,
__smp_alt_instructions_end);
list_for_each_entry(mod, &smp_alt_modules, next)
alternatives_smp_unlock(mod->locks, mod->locks_end,
mod->text, mod->text_end);
}
spin_unlock_irqrestore(&smp_alt, flags);
}
void __init alternative_instructions(void)
{
apply_alternatives(__alt_instructions, __alt_instructions_end);
/* switch to patch-once-at-boottime-only mode and free the
* tables in case we know the number of CPUs will never ever
* change */
#ifdef CONFIG_HOTPLUG_CPU
if (num_possible_cpus() < 2)
smp_alt_once = 1;
#else
smp_alt_once = 1;
#endif
if (smp_alt_once) {
if (1 == num_possible_cpus()) {
printk(KERN_INFO "SMP alternatives: switching to UP code\n");
set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
apply_alternatives(__smp_alt_instructions,
__smp_alt_instructions_end);
alternatives_smp_unlock(__smp_locks, __smp_locks_end,
_text, _etext);
}
free_init_pages("SMP alternatives",
(unsigned long)__smp_alt_begin,
(unsigned long)__smp_alt_end);
} else {
alternatives_smp_save(__smp_alt_instructions,
__smp_alt_instructions_end);
alternatives_smp_module_add(NULL, "core kernel",
__smp_locks, __smp_locks_end,
_text, _etext);
alternatives_smp_switch(0);
}
}
......@@ -40,7 +40,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
/* Other (Linux-defined) */
"cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
NULL, NULL, NULL, NULL,
"constant_tsc", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
"constant_tsc", "up", NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
......
......@@ -104,26 +104,38 @@ int apply_relocate_add(Elf32_Shdr *sechdrs,
return -ENOEXEC;
}
extern void apply_alternatives(void *start, void *end);
int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
{
const Elf_Shdr *s;
const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
/* look for .altinstructions to patch */
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
void *seg;
if (strcmp(".altinstructions", secstrings + s->sh_name))
continue;
seg = (void *)s->sh_addr;
apply_alternatives(seg, seg + s->sh_size);
}
if (!strcmp(".text", secstrings + s->sh_name))
text = s;
if (!strcmp(".altinstructions", secstrings + s->sh_name))
alt = s;
if (!strcmp(".smp_locks", secstrings + s->sh_name))
locks= s;
}
if (alt) {
/* patch .altinstructions */
void *aseg = (void *)alt->sh_addr;
apply_alternatives(aseg, aseg + alt->sh_size);
}
if (locks && text) {
void *lseg = (void *)locks->sh_addr;
void *tseg = (void *)text->sh_addr;
alternatives_smp_module_add(me, me->name,
lseg, lseg + locks->sh_size,
tseg, tseg + text->sh_size);
}
return 0;
}
void module_arch_cleanup(struct module *mod)
{
alternatives_smp_module_del(mod);
}
......@@ -110,11 +110,11 @@ asm(
".align 4\n"
".globl __write_lock_failed\n"
"__write_lock_failed:\n\t"
LOCK "addl $" RW_LOCK_BIAS_STR ",(%eax)\n"
LOCK_PREFIX "addl $" RW_LOCK_BIAS_STR ",(%eax)\n"
"1: rep; nop\n\t"
"cmpl $" RW_LOCK_BIAS_STR ",(%eax)\n\t"
"jne 1b\n\t"
LOCK "subl $" RW_LOCK_BIAS_STR ",(%eax)\n\t"
LOCK_PREFIX "subl $" RW_LOCK_BIAS_STR ",(%eax)\n\t"
"jnz __write_lock_failed\n\t"
"ret"
);
......@@ -124,11 +124,11 @@ asm(
".align 4\n"
".globl __read_lock_failed\n"
"__read_lock_failed:\n\t"
LOCK "incl (%eax)\n"
LOCK_PREFIX "incl (%eax)\n"
"1: rep; nop\n\t"
"cmpl $1,(%eax)\n\t"
"js 1b\n\t"
LOCK "decl (%eax)\n\t"
LOCK_PREFIX "decl (%eax)\n\t"
"js __read_lock_failed\n\t"
"ret"
);
......
......@@ -1377,101 +1377,6 @@ static void __init register_memory(void)
pci_mem_start, gapstart, gapsize);
}
/* Use inline assembly to define this because the nops are defined
as inline assembly strings in the include files and we cannot
get them easily into strings. */
asm("\t.data\nintelnops: "
GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
GENERIC_NOP7 GENERIC_NOP8);
asm("\t.data\nk8nops: "
K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
K8_NOP7 K8_NOP8);
asm("\t.data\nk7nops: "
K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
K7_NOP7 K7_NOP8);
extern unsigned char intelnops[], k8nops[], k7nops[];
static unsigned char *intel_nops[ASM_NOP_MAX+1] = {
NULL,
intelnops,
intelnops + 1,
intelnops + 1 + 2,
intelnops + 1 + 2 + 3,
intelnops + 1 + 2 + 3 + 4,
intelnops + 1 + 2 + 3 + 4 + 5,
intelnops + 1 + 2 + 3 + 4 + 5 + 6,
intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
NULL,
k8nops,
k8nops + 1,
k8nops + 1 + 2,
k8nops + 1 + 2 + 3,
k8nops + 1 + 2 + 3 + 4,
k8nops + 1 + 2 + 3 + 4 + 5,
k8nops + 1 + 2 + 3 + 4 + 5 + 6,
k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
static unsigned char *k7_nops[ASM_NOP_MAX+1] = {
NULL,
k7nops,
k7nops + 1,
k7nops + 1 + 2,
k7nops + 1 + 2 + 3,
k7nops + 1 + 2 + 3 + 4,
k7nops + 1 + 2 + 3 + 4 + 5,
k7nops + 1 + 2 + 3 + 4 + 5 + 6,
k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
static struct nop {
int cpuid;
unsigned char **noptable;
} noptypes[] = {
{ X86_FEATURE_K8, k8_nops },
{ X86_FEATURE_K7, k7_nops },
{ -1, NULL }
};
/* Replace instructions with better alternatives for this CPU type.
This runs before SMP is initialized to avoid SMP problems with
self modifying code. This implies that assymetric systems where
APs have less capabilities than the boot processor are not handled.
Tough. Make sure you disable such features by hand. */
void apply_alternatives(void *start, void *end)
{
struct alt_instr *a;
int diff, i, k;
unsigned char **noptable = intel_nops;
for (i = 0; noptypes[i].cpuid >= 0; i++) {
if (boot_cpu_has(noptypes[i].cpuid)) {
noptable = noptypes[i].noptable;
break;
}
}
for (a = start; (void *)a < end; a++) {
if (!boot_cpu_has(a->cpuid))
continue;
BUG_ON(a->replacementlen > a->instrlen);
memcpy(a->instr, a->replacement, a->replacementlen);
diff = a->instrlen - a->replacementlen;
/* Pad the rest with nops */
for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
k = diff;
if (k > ASM_NOP_MAX)
k = ASM_NOP_MAX;
memcpy(a->instr + i, noptable[k], k);
}
}
}
void __init alternative_instructions(void)
{
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
apply_alternatives(__alt_instructions, __alt_instructions_end);
}
static char * __init machine_specific_memory_setup(void);
#ifdef CONFIG_MCA
......
......@@ -899,6 +899,7 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
unsigned short nmi_high = 0, nmi_low = 0;
++cpucount;
alternatives_smp_switch(1);
/*
* We can't use kernel_thread since we must avoid to
......@@ -1368,6 +1369,8 @@ void __cpu_die(unsigned int cpu)
/* They ack this in play_dead by setting CPU_DEAD */
if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
printk ("CPU %d is now offline\n", cpu);
if (1 == num_online_cpus())
alternatives_smp_switch(0);
return;
}
msleep(100);
......
......@@ -68,6 +68,26 @@ SECTIONS
*(.data.init_task)
}
/* might get freed after init */
. = ALIGN(4096);
__smp_alt_begin = .;
__smp_alt_instructions = .;
.smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) {
*(.smp_altinstructions)
}
__smp_alt_instructions_end = .;
. = ALIGN(4);
__smp_locks = .;
.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
*(.smp_locks)
}
__smp_locks_end = .;
.smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) {
*(.smp_altinstr_replacement)
}
. = ALIGN(4096);
__smp_alt_end = .;
/* will be freed after init */
. = ALIGN(4096); /* Init code and data */
__init_begin = .;
......
......@@ -720,21 +720,6 @@ static int noinline do_test_wp_bit(void)
return flag;
}
void free_initmem(void)
{
unsigned long addr;
addr = (unsigned long)(&__init_begin);
for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
ClearPageReserved(virt_to_page(addr));
init_page_count(virt_to_page(addr));
memset((void *)addr, 0xcc, PAGE_SIZE);
free_page(addr);
totalram_pages++;
}
printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", (__init_end - __init_begin) >> 10);
}
#ifdef CONFIG_DEBUG_RODATA
extern char __start_rodata, __end_rodata;
......@@ -758,17 +743,31 @@ void mark_rodata_ro(void)
}
#endif
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
unsigned long addr;
for (addr = begin; addr < end; addr += PAGE_SIZE) {
ClearPageReserved(virt_to_page(addr));
init_page_count(virt_to_page(addr));
memset((void *)addr, 0xcc, PAGE_SIZE);
free_page(addr);
totalram_pages++;