Commit 919c0d14 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'kvm-updates-2.6.26' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm

* 'kvm-updates-2.6.26' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm:
  KVM: Remove now unused structs from kvm_para.h
  x86: KVM guest: Use the paravirt clocksource structs and functions
  KVM: Make kvm host use the paravirt clocksource structs
  x86: Make xen use the paravirt clocksource structs and functions
  x86: Add structs and functions for paravirt clocksource
  KVM: VMX: Fix host msr corruption with preemption enabled
  KVM: ioapic: fix lost interrupt when changing a device's irq
  KVM: MMU: Fix oops on guest userspace access to guest pagetable
  KVM: MMU: large page update_pte issue with non-PAE 32-bit guests (resend)
  KVM: MMU: Fix rmap_write_protect() hugepage iteration bug
  KVM: close timer injection race window in __vcpu_run
  KVM: Fix race between timer migration and vcpu migration
parents de08341a 6b1ed908
......@@ -383,6 +383,7 @@ config VMI
config KVM_CLOCK
bool "KVM paravirtualized clock"
select PARAVIRT
select PARAVIRT_CLOCK
depends on !(X86_VISWS || X86_VOYAGER)
help
Turning on this option will allow you to run a paravirtualized clock
......@@ -410,6 +411,10 @@ config PARAVIRT
over full virtualization. However, when run without a hypervisor
the kernel is theoretically slower and slightly larger.
config PARAVIRT_CLOCK
bool
default n
endif
config MEMTEST_BOOTPARAM
......
......@@ -82,6 +82,7 @@ obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
obj-$(CONFIG_KVM_GUEST) += kvm.o
obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
......
......@@ -18,6 +18,7 @@
#include <linux/clocksource.h>
#include <linux/kvm_para.h>
#include <asm/pvclock.h>
#include <asm/arch_hooks.h>
#include <asm/msr.h>
#include <asm/apic.h>
......@@ -36,18 +37,9 @@ static int parse_no_kvmclock(char *arg)
early_param("no-kvmclock", parse_no_kvmclock);
/* The hypervisor will put information about time periodically here */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock);
#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field
static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
static struct pvclock_wall_clock wall_clock;
static inline u64 kvm_get_delta(u64 last_tsc)
{
int cpu = smp_processor_id();
u64 delta = native_read_tsc() - last_tsc;
return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
}
static struct kvm_wall_clock wall_clock;
static cycle_t kvm_clock_read(void);
/*
* The wallclock is the time of day when we booted. Since then, some time may
* have elapsed since the hypervisor wrote the data. So we try to account for
......@@ -55,64 +47,37 @@ static cycle_t kvm_clock_read(void);
*/
static unsigned long kvm_get_wallclock(void)
{
u32 wc_sec, wc_nsec;
u64 delta;
struct pvclock_vcpu_time_info *vcpu_time;
struct timespec ts;
int version, nsec;
int low, high;
low = (int)__pa(&wall_clock);
high = ((u64)__pa(&wall_clock) >> 32);
native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
delta = kvm_clock_read();
vcpu_time = &get_cpu_var(hv_clock);
pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
put_cpu_var(hv_clock);
native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
do {
version = wall_clock.wc_version;
rmb();
wc_sec = wall_clock.wc_sec;
wc_nsec = wall_clock.wc_nsec;
rmb();
} while ((wall_clock.wc_version != version) || (version & 1));
delta = kvm_clock_read() - delta;
delta += wc_nsec;
nsec = do_div(delta, NSEC_PER_SEC);
set_normalized_timespec(&ts, wc_sec + delta, nsec);
/*
* Of all mechanisms of time adjustment I've tested, this one
* was the champion!
*/
return ts.tv_sec + 1;
return ts.tv_sec;
}
static int kvm_set_wallclock(unsigned long now)
{
return 0;
return -1;
}
/*
* This is our read_clock function. The host puts an tsc timestamp each time
* it updates a new time. Without the tsc adjustment, we can have a situation
* in which a vcpu starts to run earlier (smaller system_time), but probes
* time later (compared to another vcpu), leading to backwards time
*/
static cycle_t kvm_clock_read(void)
{
u64 last_tsc, now;
int cpu;
struct pvclock_vcpu_time_info *src;
cycle_t ret;
preempt_disable();
cpu = smp_processor_id();
last_tsc = get_clock(cpu, tsc_timestamp);
now = get_clock(cpu, system_time);
now += kvm_get_delta(last_tsc);
preempt_enable();
return now;
src = &get_cpu_var(hv_clock);
ret = pvclock_clocksource_read(src);
put_cpu_var(hv_clock);
return ret;
}
static struct clocksource kvm_clock = {
.name = "kvm-clock",
.read = kvm_clock_read,
......@@ -123,13 +88,14 @@ static struct clocksource kvm_clock = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
static int kvm_register_clock(void)
static int kvm_register_clock(char *txt)
{
int cpu = smp_processor_id();
int low, high;
low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
cpu, high, low, txt);
return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
}
......@@ -140,12 +106,20 @@ static void kvm_setup_secondary_clock(void)
* Now that the first cpu already had this clocksource initialized,
* we shouldn't fail.
*/
WARN_ON(kvm_register_clock());
WARN_ON(kvm_register_clock("secondary cpu clock"));
/* ok, done with our trickery, call native */
setup_secondary_APIC_clock();
}
#endif
#ifdef CONFIG_SMP
void __init kvm_smp_prepare_boot_cpu(void)
{
WARN_ON(kvm_register_clock("primary cpu clock"));
native_smp_prepare_boot_cpu();
}
#endif
/*
* After the clock is registered, the host will keep writing to the
* registered memory location. If the guest happens to shutdown, this memory
......@@ -174,13 +148,16 @@ void __init kvmclock_init(void)
return;
if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
if (kvm_register_clock())
if (kvm_register_clock("boot clock"))
return;
pv_time_ops.get_wallclock = kvm_get_wallclock;
pv_time_ops.set_wallclock = kvm_set_wallclock;
pv_time_ops.sched_clock = kvm_clock_read;
#ifdef CONFIG_X86_LOCAL_APIC
pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
#endif
#ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
#endif
machine_ops.shutdown = kvm_shutdown;
#ifdef CONFIG_KEXEC
......
/* paravirtual clock -- common code used by kvm/xen
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <asm/pvclock.h>
/*
* These are perodically updated
* xen: magic shared_info page
* kvm: gpa registered via msr
* and then copied here.
*/
struct pvclock_shadow_time {
u64 tsc_timestamp; /* TSC at last update of time vals. */
u64 system_timestamp; /* Time, in nanosecs, since boot. */
u32 tsc_to_nsec_mul;
int tsc_shift;
u32 version;
};
/*
* Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
* yielding a 64-bit result.
*/
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
u64 product;
#ifdef __i386__
u32 tmp1, tmp2;
#endif
if (shift < 0)
delta >>= -shift;
else
delta <<= shift;
#ifdef __i386__
__asm__ (
"mul %5 ; "
"mov %4,%%eax ; "
"mov %%edx,%4 ; "
"mul %5 ; "
"xor %5,%5 ; "
"add %4,%%eax ; "
"adc %5,%%edx ; "
: "=A" (product), "=r" (tmp1), "=r" (tmp2)
: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif __x86_64__
__asm__ (
"mul %%rdx ; shrd $32,%%rdx,%%rax"
: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#else
#error implement me!
#endif
return product;
}
static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
{
u64 delta = native_read_tsc() - shadow->tsc_timestamp;
return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}
/*
* Reads a consistent set of time-base values from hypervisor,
* into a shadow data area.
*/
static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
struct pvclock_vcpu_time_info *src)
{
do {
dst->version = src->version;
rmb(); /* fetch version before data */
dst->tsc_timestamp = src->tsc_timestamp;
dst->system_timestamp = src->system_time;
dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
dst->tsc_shift = src->tsc_shift;
rmb(); /* test version after fetching data */
} while ((src->version & 1) || (dst->version != src->version));
return dst->version;
}
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
struct pvclock_shadow_time shadow;
unsigned version;
cycle_t ret, offset;
do {
version = pvclock_get_time_values(&shadow, src);
barrier();
offset = pvclock_get_nsec_offset(&shadow);
ret = shadow.system_timestamp + offset;
barrier();
} while (version != src->version);
return ret;
}
void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
struct pvclock_vcpu_time_info *vcpu_time,
struct timespec *ts)
{
u32 version;
u64 delta;
struct timespec now;
/* get wallclock at system boot */
do {
version = wall_clock->version;
rmb(); /* fetch version before time */
now.tv_sec = wall_clock->sec;
now.tv_nsec = wall_clock->nsec;
rmb(); /* fetch time before checking version */
} while ((wall_clock->version & 1) || (version != wall_clock->version));
delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */
delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
now.tv_nsec = do_div(delta, NSEC_PER_SEC);
now.tv_sec = delta;
set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}
......@@ -200,9 +200,12 @@ int __pit_timer_fn(struct kvm_kpit_state *ps)
atomic_inc(&pt->pending);
smp_mb__after_atomic_inc();
if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
wake_up_interruptible(&vcpu0->wq);
if (vcpu0) {
set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
if (waitqueue_active(&vcpu0->wq)) {
vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
wake_up_interruptible(&vcpu0->wq);
}
}
pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
......
......@@ -940,6 +940,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
wait_queue_head_t *q = &apic->vcpu->wq;
atomic_inc(&apic->timer.pending);
set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
if (waitqueue_active(q)) {
apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
wake_up_interruptible(q);
......
......@@ -640,6 +640,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
rmap_remove(kvm, spte);
--kvm->stat.lpages;
set_shadow_pte(spte, shadow_trap_nonpresent_pte);
spte = NULL;
write_protected = 1;
}
spte = rmap_next(kvm, rmapp, spte);
......@@ -1082,10 +1083,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
struct kvm_mmu_page *shadow;
spte |= PT_WRITABLE_MASK;
if (user_fault) {
mmu_unshadow(vcpu->kvm, gfn);
goto unshadowed;
}
shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
if (shadow ||
......@@ -1102,8 +1099,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
}
}
unshadowed:
if (pte_access & ACC_WRITE_MASK)
mark_page_dirty(vcpu->kvm, gfn);
......@@ -1580,11 +1575,13 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
u64 *spte,
const void *new)
{
if ((sp->role.level != PT_PAGE_TABLE_LEVEL)
&& !vcpu->arch.update_pte.largepage) {
++vcpu->kvm->stat.mmu_pde_zapped;
return;
}
if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
if (!vcpu->arch.update_pte.largepage ||
sp->role.glevels == PT32_ROOT_LEVEL) {
++vcpu->kvm->stat.mmu_pde_zapped;
return;
}
}
++vcpu->kvm->stat.mmu_pte_updated;
if (sp->role.glevels == PT32_ROOT_LEVEL)
......
......@@ -566,7 +566,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
load_transition_efer(vmx);
}
static void vmx_load_host_state(struct vcpu_vmx *vmx)
static void __vmx_load_host_state(struct vcpu_vmx *vmx)
{
unsigned long flags;
......@@ -596,6 +596,13 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
reload_host_efer(vmx);
}
static void vmx_load_host_state(struct vcpu_vmx *vmx)
{
preempt_disable();
__vmx_load_host_state(vmx);
preempt_enable();
}
/*
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
......@@ -654,7 +661,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
vmx_load_host_state(to_vmx(vcpu));
__vmx_load_host_state(to_vmx(vcpu));
}
static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
......@@ -884,11 +891,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
switch (msr_index) {
#ifdef CONFIG_X86_64
case MSR_EFER:
vmx_load_host_state(vmx);
ret = kvm_set_msr_common(vcpu, msr_index, data);
if (vmx->host_state.loaded) {
reload_host_efer(vmx);
load_transition_efer(vmx);
}
break;
case MSR_FS_BASE:
vmcs_writel(GUEST_FS_BASE, data);
......@@ -910,11 +914,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
guest_write_tsc(data);
break;
default:
vmx_load_host_state(vmx);
msr = find_msr_entry(vmx, msr_index);
if (msr) {
msr->data = data;
if (vmx->host_state.loaded)
load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
break;
}
ret = kvm_set_msr_common(vcpu, msr_index, data);
......
......@@ -492,8 +492,8 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
static int version;
struct kvm_wall_clock wc;
struct timespec wc_ts;
struct pvclock_wall_clock wc;
struct timespec now, sys, boot;
if (!wall_clock)
return;
......@@ -502,10 +502,19 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
wc_ts = current_kernel_time();
wc.wc_sec = wc_ts.tv_sec;
wc.wc_nsec = wc_ts.tv_nsec;
wc.wc_version = version;
/*
* The guest calculates current wall clock time by adding
* system time (updated by kvm_write_guest_time below) to the
* wall clock specified here. guest system time equals host
* system time for us, thus we must fill in host boot time here.
*/
now = current_kernel_time();
ktime_get_ts(&sys);
boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
wc.sec = boot.tv_sec;
wc.nsec = boot.tv_nsec;
wc.version = version;
kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
......@@ -513,6 +522,45 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
}
static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
{
uint32_t quotient, remainder;
/* Don't try to replace with do_div(), this one calculates
* "(dividend << 32) / divisor" */
__asm__ ( "divl %4"
: "=a" (quotient), "=d" (remainder)
: "0" (0), "1" (dividend), "r" (divisor) );
return quotient;
}
static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
{
uint64_t nsecs = 1000000000LL;
int32_t shift = 0;
uint64_t tps64;
uint32_t tps32;
tps64 = tsc_khz * 1000LL;
while (tps64 > nsecs*2) {
tps64 >>= 1;
shift--;
}
tps32 = (uint32_t)tps64;
while (tps32 <= (uint32_t)nsecs) {
tps32 <<= 1;
shift++;
}
hv_clock->tsc_shift = shift;
hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
__FUNCTION__, tsc_khz, hv_clock->tsc_shift,
hv_clock->tsc_to_system_mul);
}
static void kvm_write_guest_time(struct kvm_vcpu *v)
{
struct timespec ts;
......@@ -523,6 +571,11 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
if ((!vcpu->time_page))
return;
if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {
kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);
vcpu->hv_clock_tsc_khz = tsc_khz;
}
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
......@@ -537,14 +590,14 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
/*
* The interface expects us to write an even number signaling that the
* update is finished. Since the guest won't see the intermediate
* state, we just write "2" at the end
* state, we just increase by 2 at the end.
*/
vcpu->hv_clock.version = 2;
vcpu->hv_clock.version += 2;
shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
sizeof(vcpu->hv_clock));
sizeof(vcpu->hv_clock));
kunmap_atomic(shared_kaddr, KM_USER0);
......@@ -599,10 +652,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
/* ...but clean it before doing the actual write */
vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
vcpu->arch.hv_clock.tsc_to_system_mul =
clocksource_khz2mult(tsc_khz, 22);
vcpu->arch.hv_clock.tsc_shift = 22;
down_read(&current->mm->mmap_sem);
vcpu->arch.time_page =
gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
......@@ -2759,6 +2808,8 @@ again:
if (vcpu->requests) {
if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
__kvm_migrate_timers(vcpu);
if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
kvm_x86_ops->tlb_flush(vcpu);
if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
&vcpu->requests)) {
kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
......@@ -2772,6 +2823,7 @@ again:
}
}
clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
kvm_inject_pending_timer_irqs(vcpu);
preempt_disable();
......@@ -2781,21 +2833,13 @@ again:
local_irq_disable();
if (need_resched()) {
if (vcpu->requests || need_resched()) {
local_irq_enable();
preempt_enable();
r = 1;
goto out;
}
if (vcpu->requests)
if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) {
local_irq_enable();
preempt_enable();
r = 1;
goto out;
}
if (signal_pending(current)) {
local_irq_enable();
preempt_enable();
......@@ -2825,9 +2869,6 @@ again:
kvm_guest_enter();
if (vcpu->requests)
if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
kvm_x86_ops->tlb_flush(vcpu);