diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 1a830cbd70153b8662eddf16a87a5336c6cb0742..dfdbf640389536489f5ac05b3258361d133d76d8 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -126,6 +126,7 @@ void foo(void)
 #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
 	BLANK();
 	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
+	OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
 	OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
 
 	BLANK();
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 37b8c1d3e022b7ec0a643d3d6a4cc4545d466e4d..514f4d0d2bfa04ebbc6a9167e9f900402761bbe4 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -179,7 +179,7 @@ static void lguest_end_context_switch(struct task_struct *next)
 	paravirt_end_context_switch(next);
 }
 
-/*G:033
+/*G:032
  * After that diversion we return to our first native-instruction
  * replacements: four functions for interrupt control.
  *
@@ -199,41 +199,28 @@ static unsigned long save_fl(void)
 {
 	return lguest_data.irq_enabled;
 }
-PV_CALLEE_SAVE_REGS_THUNK(save_fl);
-
-/* restore_flags() just sets the flags back to the value given. */
-static void restore_fl(unsigned long flags)
-{
-	lguest_data.irq_enabled = flags;
-	mb();
-	/* Null hcall forces interrupt delivery now, if irq_pending is
-	 * set to X86_EFLAGS_IF (ie. an interrupt is pending, and flags
-	 * enables interrupts. */
-	if (flags & lguest_data.irq_pending)
-		kvm_hypercall0(LHCALL_SEND_INTERRUPTS);
-}
-PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
 
 /* Interrupts go off... */
 static void irq_disable(void)
 {
 	lguest_data.irq_enabled = 0;
 }
-PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
 
-/* Interrupts go on... */
-static void irq_enable(void)
-{
-	lguest_data.irq_enabled = X86_EFLAGS_IF;
-	mb();
-	/* Null hcall forces interrupt delivery now. */
-	if (lguest_data.irq_pending)
-		kvm_hypercall0(LHCALL_SEND_INTERRUPTS);
+/* Let's pause a moment.  Remember how I said these are called so often?
+ * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
+ * break some rules.  In particular, these functions are assumed to save their
+ * own registers if they need to: normal C functions assume they can trash the
+ * eax register.  To use normal C functions, we use
+ * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
+ * C function, then restores it. */
+PV_CALLEE_SAVE_REGS_THUNK(save_fl);
+PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
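+/* For the curious: the thunk the macro generates is roughly this (a sketch
+ * only; the real macro in asm/paravirt.h knows exactly which registers the
+ * calling convention requires it to preserve):
+ *
+ *	__raw_callee_save_irq_disable:
+ *		pushl %eax
+ *		call irq_disable
+ *		popl %eax
+ *		ret
+ */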
+/*:*/
 
-}
-PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
+/* These are in i386_head.S */
+extern void lg_irq_enable(void);
+extern void lg_restore_fl(unsigned long flags);
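+/* Note: lg_restore_fl gets its flags argument in %eax (that's how the
+ * callee-save calling convention passes it), not on the stack. */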
 
-/*:*/
 /*M:003 Note that we don't check for outstanding interrupts when we re-enable
  * them (or when we unmask an interrupt).  This seems to work for the moment,
  * since interrupts are rare and we'll just get the interrupt on the next timer
@@ -1041,9 +1028,9 @@ __init void lguest_init(void)
 	/* interrupt-related operations */
 	pv_irq_ops.init_IRQ = lguest_init_IRQ;
 	pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
-	pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl);
+	pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
 	pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
-	pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable);
+	pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
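+	/* Unlike save_fl and irq_disable above, lg_restore_fl and
+	 * lg_irq_enable are hand-written assembler which already obeys the
+	 * callee-save convention, so __PV_IS_CALLEE_SAVE() uses them
+	 * directly rather than via a generated thunk. */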
 	pv_irq_ops.safe_halt = lguest_safe_halt;
 
 	/* init-time operations */
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 3e0c5545d59caaff65bdc3d9146213899342fce9..a9c8cfe61cd4d48497a648538ce5b1c7d04e7cde 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -47,7 +47,63 @@ ENTRY(lguest_entry)
 
 LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
 LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
-/*:*/
+
+/*G:033 But using those wrappers is inefficient (we'll see why that doesn't
+ * matter for save_fl and irq_disable later).  If we write our routines
+ * carefully in assembler, we can avoid clobbering any registers and avoid
+ * jumping through the wrapper functions.
+ *
+ * I skipped over our first piece of assembler, but this one is worth studying
+ * in a bit more detail, so I'll describe it in easy stages.  First, the
+ * routine to enable interrupts: */
+ENTRY(lg_irq_enable)
+	/* The reverse of irq_disable, this sets lguest_data.irq_enabled to
+	 * X86_EFLAGS_IF (ie. "Interrupts enabled"). */
+	movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
+	/* But now we need to check if the Host wants to know: there might have
+	 * been interrupts waiting to be delivered, in which case it will have
+	 * set lguest_data.irq_pending to X86_EFLAGS_IF.  If that bit is set,
+	 * we jump to send_interrupts, otherwise we're done. */
+	testl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_pending
+	jnz send_interrupts
+	/* One cool thing about x86 is that you can do many things without using
+	 * a register.  In this case, the normal path hasn't needed to save or
+	 * restore any registers at all! */
+	ret
+send_interrupts:
+	/* OK, now we need a register: eax is used for the hypercall number,
+	 * which is LHCALL_SEND_INTERRUPTS.
+	 *
+	 * We used not to bother with this pending detection at all, which was
+	 * much simpler.  Sooner or later the Host would realize it had to
+	 * send us an interrupt.  But that turns out to make performance 7
+	 * times worse on a simple TCP benchmark.  So now we do this the hard
+	 * way. */
+	pushl %eax
+	movl $LHCALL_SEND_INTERRUPTS, %eax
+	/* This is a vmcall instruction (same thing that KVM uses).  Older
+	 * assemblers don't know the "vmcall" mnemonic, so we emit the
+	 * instruction bytes by hand here. */
+	.byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
+	popl %eax
+	ret
+
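+/* If you'd rather read C: lg_irq_enable is essentially the irq_enable()
+ * we deleted from boot.c, hand-coded:
+ *
+ *	lguest_data.irq_enabled = X86_EFLAGS_IF;
+ *	if (lguest_data.irq_pending)
+ *		kvm_hypercall0(LHCALL_SEND_INTERRUPTS);
+ */
+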
+/* Finally, the "popf" or "restore flags" routine.  The %eax register holds the
+ * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
+ * enabling interrupts again, if it's 0 we're leaving them off. */
+ENTRY(lg_restore_fl)
+	/* This is just "lguest_data.irq_enabled = flags;" */
+	movl %eax, lguest_data+LGUEST_DATA_irq_enabled
+	/* Now, if the %eax value has enabled interrupts and
+	 * lguest_data.irq_pending is set, we want to tell the Host so it can
+	 * deliver any outstanding interrupts.  Fortunately, both values will
+	 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
+	 * instruction will AND them together for us.  If both are set, we
+	 * jump to send_interrupts. */
+	testl lguest_data+LGUEST_DATA_irq_pending, %eax
+	jnz send_interrupts
+	/* Again, the normal path has used no extra registers.  Clever, huh? */
+	ret
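+
+/* Likewise, lg_restore_fl in C is the restore_fl() we deleted from boot.c:
+ *
+ *	lguest_data.irq_enabled = flags;
+ *	if (flags & lguest_data.irq_pending)
+ *		kvm_hypercall0(LHCALL_SEND_INTERRUPTS);
+ */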
 
 /* These demark the EIP range where host should never deliver interrupts. */
 .global lguest_noirq_start