diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S
index fc23040d5a263f4842ecddc2297faf6266f5fefa..f770805f1215df787d7840e3acbc7692cd228503 100644
--- a/arch/powerpc/kernel/misc.S
+++ b/arch/powerpc/kernel/misc.S
@@ -17,15 +17,6 @@
 
 	.text
 
-#ifdef CONFIG_PPC64
-#define IN_SYNC		twi	0,r5,0; isync
-#define EIEIO_32
-#define SYNC_64		sync
-#else /* CONFIG_PPC32 */
-#define IN_SYNC
-#define EIEIO_32	eieio
-#define SYNC_64
-#endif
 /*
  * Returns (address we are running at) - (address we were linked at)
  * for use before the text and data are mapped to KERNELBASE.
@@ -70,6 +61,7 @@ _GLOBAL(add_reloc_offset)
  * The *_ns versions don't do byte-swapping.
  */
 _GLOBAL(_insb)
+	sync
 	cmpwi	0,r5,0
 	mtctr	r5
 	subi	r4,r4,1
@@ -78,7 +70,8 @@ _GLOBAL(_insb)
 	eieio
 	stbu	r5,1(r4)
 	bdnz	00b
-	IN_SYNC
+	twi	0,r5,0
+	isync
 	blr
 
 _GLOBAL(_outsb)
@@ -86,14 +79,15 @@ _GLOBAL(_outsb)
 	mtctr	r5
 	subi	r4,r4,1
 	blelr-
+	sync
 00:	lbzu	r5,1(r4)
 	stb	r5,0(r3)
-	EIEIO_32
 	bdnz	00b
-	SYNC_64
+	sync
 	blr
 
 _GLOBAL(_insw)
+	sync
 	cmpwi	0,r5,0
 	mtctr	r5
 	subi	r4,r4,2
@@ -102,7 +96,8 @@ _GLOBAL(_insw)
 	eieio
 	sthu	r5,2(r4)
 	bdnz	00b
-	IN_SYNC
+	twi	0,r5,0
+	isync
 	blr
 
 _GLOBAL(_outsw)
@@ -110,14 +105,15 @@ _GLOBAL(_outsw)
 	mtctr	r5
 	subi	r4,r4,2
 	blelr-
+	sync
 00:	lhzu	r5,2(r4)
-	EIEIO_32
 	sthbrx	r5,0,r3
 	bdnz	00b
-	SYNC_64
+	sync
 	blr
 
 _GLOBAL(_insl)
+	sync
 	cmpwi	0,r5,0
 	mtctr	r5
 	subi	r4,r4,4
@@ -126,7 +122,8 @@ _GLOBAL(_insl)
 	eieio
 	stwu	r5,4(r4)
 	bdnz	00b
-	IN_SYNC
+	twi	0,r5,0
+	isync
 	blr
 
 _GLOBAL(_outsl)
@@ -134,17 +131,18 @@ _GLOBAL(_outsl)
 	mtctr	r5
 	subi	r4,r4,4
 	blelr-
+	sync
 00:	lwzu	r5,4(r4)
 	stwbrx	r5,0,r3
-	EIEIO_32
 	bdnz	00b
-	SYNC_64
+	sync
 	blr
 
 #ifdef CONFIG_PPC32
 _GLOBAL(__ide_mm_insw)
 #endif
 _GLOBAL(_insw_ns)
+	sync
 	cmpwi	0,r5,0
 	mtctr	r5
 	subi	r4,r4,2
@@ -153,7 +151,8 @@ _GLOBAL(_insw_ns)
 	eieio
 	sthu	r5,2(r4)
 	bdnz	00b
-	IN_SYNC
+	twi	0,r5,0
+	isync
 	blr
 
 #ifdef CONFIG_PPC32
@@ -164,17 +163,18 @@ _GLOBAL(_outsw_ns)
 	mtctr	r5
 	subi	r4,r4,2
 	blelr-
+	sync
 00:	lhzu	r5,2(r4)
 	sth	r5,0(r3)
-	EIEIO_32
 	bdnz	00b
-	SYNC_64
+	sync
 	blr
 
 #ifdef CONFIG_PPC32
 _GLOBAL(__ide_mm_insl)
 #endif
 _GLOBAL(_insl_ns)
+	sync
 	cmpwi	0,r5,0
 	mtctr	r5
 	subi	r4,r4,4
@@ -183,7 +183,8 @@ _GLOBAL(_insl_ns)
 	eieio
 	stwu	r5,4(r4)
 	bdnz	00b
-	IN_SYNC
+	twi	0,r5,0
+	isync
 	blr
 
 #ifdef CONFIG_PPC32
@@ -194,10 +195,10 @@ _GLOBAL(_outsl_ns)
 	mtctr	r5
 	subi	r4,r4,4
 	blelr-
+	sync
 00:	lwzu	r5,4(r4)
 	stw	r5,0(r3)
-	EIEIO_32
 	bdnz	00b
-	SYNC_64
+	sync
 	blr
 
diff --git a/include/asm-powerpc/eeh.h b/include/asm-powerpc/eeh.h
index 4df3e80118f43123b936f97ad31b2b8ee2853874..6a784396660bf78a00145b07e9f35c410438898e 100644
--- a/include/asm-powerpc/eeh.h
+++ b/include/asm-powerpc/eeh.h
@@ -205,6 +205,7 @@ static inline void eeh_memset_io(volatile void __iomem *addr, int c,
 	lc |= lc << 8;
 	lc |= lc << 16;
 
+	__asm__ __volatile__ ("sync" : : : "memory");
 	while(n && !EEH_CHECK_ALIGN(p, 4)) {
 		*((volatile u8 *)p) = c;
 		p++;
@@ -229,6 +230,7 @@ static inline void eeh_memcpy_fromio(void *dest, const volatile void __iomem *sr
 	void *destsave = dest;
 	unsigned long nsave = n;
 
+	__asm__ __volatile__ ("sync" : : : "memory");
 	while(n && (!EEH_CHECK_ALIGN(vsrc, 4) || !EEH_CHECK_ALIGN(dest, 4))) {
 		*((u8 *)dest) = *((volatile u8 *)vsrc);
 		__asm__ __volatile__ ("eieio" : : : "memory");
@@ -266,6 +268,7 @@ static inline void eeh_memcpy_toio(volatile void __iomem *dest, const void *src,
 {
 	void *vdest = (void __force *) dest;
 
+	__asm__ __volatile__ ("sync" : : : "memory");
 	while(n && (!EEH_CHECK_ALIGN(vdest, 4) || !EEH_CHECK_ALIGN(src, 4))) {
 		*((volatile u8 *)vdest) = *((u8 *)src);
 		src++;
diff --git a/include/asm-powerpc/io.h b/include/asm-powerpc/io.h
index 36c4c34bf56519a73a5800b6e409975fe62c6d34..212428db0d8b648df0c4911c9e76db0bd5a711ba 100644
--- a/include/asm-powerpc/io.h
+++ b/include/asm-powerpc/io.h
@@ -19,6 +19,7 @@ extern int check_legacy_ioport(unsigned long base_port);
 #include <linux/compiler.h>
 #include <asm/page.h>
 #include <asm/byteorder.h>
+#include <asm/paca.h>
 #ifdef CONFIG_PPC_ISERIES 
 #include <asm/iseries/iseries_io.h>
 #endif  
@@ -162,7 +163,11 @@ extern void _outsw_ns(volatile u16 __iomem *port, const void *buf, int ns);
 extern void _insl_ns(volatile u32 __iomem *port, void *buf, int nl);
 extern void _outsl_ns(volatile u32 __iomem *port, const void *buf, int nl);
 
-#define mmiowb()
+static inline void mmiowb(void)
+{
+	__asm__ __volatile__ ("sync" : : : "memory");
+	get_paca()->io_sync = 0;
+}
 
 /*
  * output pause versions need a delay at least for the
@@ -278,22 +283,23 @@ static inline int in_8(const volatile unsigned char __iomem *addr)
 {
 	int ret;
 
-	__asm__ __volatile__("lbz%U1%X1 %0,%1; twi 0,%0,0; isync"
+	__asm__ __volatile__("sync; lbz%U1%X1 %0,%1; twi 0,%0,0; isync"
 			     : "=r" (ret) : "m" (*addr));
 	return ret;
 }
 
 static inline void out_8(volatile unsigned char __iomem *addr, int val)
 {
-	__asm__ __volatile__("stb%U0%X0 %1,%0; sync"
+	__asm__ __volatile__("sync; stb%U0%X0 %1,%0"
 			     : "=m" (*addr) : "r" (val));
+	get_paca()->io_sync = 1;
 }
 
 static inline int in_le16(const volatile unsigned short __iomem *addr)
 {
 	int ret;
 
-	__asm__ __volatile__("lhbrx %0,0,%1; twi 0,%0,0; isync"
+	__asm__ __volatile__("sync; lhbrx %0,0,%1; twi 0,%0,0; isync"
 			     : "=r" (ret) : "r" (addr), "m" (*addr));
 	return ret;
 }
@@ -302,28 +308,30 @@ static inline int in_be16(const volatile unsigned short __iomem *addr)
 {
 	int ret;
 
-	__asm__ __volatile__("lhz%U1%X1 %0,%1; twi 0,%0,0; isync"
+	__asm__ __volatile__("sync; lhz%U1%X1 %0,%1; twi 0,%0,0; isync"
 			     : "=r" (ret) : "m" (*addr));
 	return ret;
 }
 
 static inline void out_le16(volatile unsigned short __iomem *addr, int val)
 {
-	__asm__ __volatile__("sthbrx %1,0,%2; sync"
+	__asm__ __volatile__("sync; sthbrx %1,0,%2"
 			     : "=m" (*addr) : "r" (val), "r" (addr));
+	get_paca()->io_sync = 1;
 }
 
 static inline void out_be16(volatile unsigned short __iomem *addr, int val)
 {
-	__asm__ __volatile__("sth%U0%X0 %1,%0; sync"
+	__asm__ __volatile__("sync; sth%U0%X0 %1,%0"
 			     : "=m" (*addr) : "r" (val));
+	get_paca()->io_sync = 1;
 }
 
 static inline unsigned in_le32(const volatile unsigned __iomem *addr)
 {
 	unsigned ret;
 
-	__asm__ __volatile__("lwbrx %0,0,%1; twi 0,%0,0; isync"
+	__asm__ __volatile__("sync; lwbrx %0,0,%1; twi 0,%0,0; isync"
 			     : "=r" (ret) : "r" (addr), "m" (*addr));
 	return ret;
 }
@@ -332,21 +340,23 @@ static inline unsigned in_be32(const volatile unsigned __iomem *addr)
 {
 	unsigned ret;
 
-	__asm__ __volatile__("lwz%U1%X1 %0,%1; twi 0,%0,0; isync"
+	__asm__ __volatile__("sync; lwz%U1%X1 %0,%1; twi 0,%0,0; isync"
 			     : "=r" (ret) : "m" (*addr));
 	return ret;
 }
 
 static inline void out_le32(volatile unsigned __iomem *addr, int val)
 {
-	__asm__ __volatile__("stwbrx %1,0,%2; sync" : "=m" (*addr)
+	__asm__ __volatile__("sync; stwbrx %1,0,%2" : "=m" (*addr)
 			     : "r" (val), "r" (addr));
+	get_paca()->io_sync = 1;
 }
 
 static inline void out_be32(volatile unsigned __iomem *addr, int val)
 {
-	__asm__ __volatile__("stw%U0%X0 %1,%0; sync"
+	__asm__ __volatile__("sync; stw%U0%X0 %1,%0"
 			     : "=m" (*addr) : "r" (val));
+	get_paca()->io_sync = 1;
 }
 
 static inline unsigned long in_le64(const volatile unsigned long __iomem *addr)
@@ -354,6 +364,7 @@ static inline unsigned long in_le64(const volatile unsigned long __iomem *addr)
 	unsigned long tmp, ret;
 
 	__asm__ __volatile__(
+			     "sync\n"
 			     "ld %1,0(%2)\n"
 			     "twi 0,%1,0\n"
 			     "isync\n"
@@ -372,7 +383,7 @@ static inline unsigned long in_be64(const volatile unsigned long __iomem *addr)
 {
 	unsigned long ret;
 
-	__asm__ __volatile__("ld%U1%X1 %0,%1; twi 0,%0,0; isync"
+	__asm__ __volatile__("sync; ld%U1%X1 %0,%1; twi 0,%0,0; isync"
 			     : "=r" (ret) : "m" (*addr));
 	return ret;
 }
@@ -389,14 +400,16 @@ static inline void out_le64(volatile unsigned long __iomem *addr, unsigned long
 			     "rldicl %1,%1,32,0\n"
 			     "rlwimi %0,%1,8,8,31\n"
 			     "rlwimi %0,%1,24,16,23\n"
-			     "std %0,0(%3)\n"
-			     "sync"
+			     "sync\n"
+			     "std %0,0(%3)"
 			     : "=&r" (tmp) , "=&r" (val) : "1" (val) , "b" (addr) , "m" (*addr));
+	get_paca()->io_sync = 1;
 }
 
 static inline void out_be64(volatile unsigned long __iomem *addr, unsigned long val)
 {
-	__asm__ __volatile__("std%U0%X0 %1,%0; sync" : "=m" (*addr) : "r" (val));
+	__asm__ __volatile__("sync; std%U0%X0 %1,%0" : "=m" (*addr) : "r" (val));
+	get_paca()->io_sync = 1;
 }
 
 #ifndef CONFIG_PPC_ISERIES 
diff --git a/include/asm-powerpc/paca.h b/include/asm-powerpc/paca.h
index 2d4585f062099da36e7ac6fdc8085e68faae1969..3d5d590bc4b0383dcf658125aab3cf5e039e8ce4 100644
--- a/include/asm-powerpc/paca.h
+++ b/include/asm-powerpc/paca.h
@@ -93,6 +93,7 @@ struct paca_struct {
 	u64 saved_r1;			/* r1 save for RTAS calls */
 	u64 saved_msr;			/* MSR saved here by enter_rtas */
 	u8 proc_enabled;		/* irq soft-enable flag */
+	u8 io_sync;			/* MMIO store done; spin_unlock must sync */
 
 	/* Stuff for accurate time accounting */
 	u64 user_time;			/* accumulated usermode TB ticks */
diff --git a/include/asm-powerpc/spinlock.h b/include/asm-powerpc/spinlock.h
index 895cb6d3a42a9cd1436670338f7a9fe3c0d53ae6..c31e4382a7759dba58f18f7925e675538a951e5f 100644
--- a/include/asm-powerpc/spinlock.h
+++ b/include/asm-powerpc/spinlock.h
@@ -36,6 +36,19 @@
 #define LOCK_TOKEN	1
 #endif
 
+#if defined(CONFIG_PPC64) && defined(CONFIG_SMP)
+#define CLEAR_IO_SYNC	(get_paca()->io_sync = 0)
+#define SYNC_IO		do {						\
+				if (unlikely(get_paca()->io_sync)) {	\
+					mb();				\
+					get_paca()->io_sync = 0;	\
+				}					\
+			} while (0)
+#else
+#define CLEAR_IO_SYNC
+#define SYNC_IO
+#endif
+
 /*
  * This returns the old value in the lock, so we succeeded
  * in getting the lock if the return value is 0.
@@ -61,6 +74,7 @@ static __inline__ unsigned long __spin_trylock(raw_spinlock_t *lock)
 
 static int __inline__ __raw_spin_trylock(raw_spinlock_t *lock)
 {
+	CLEAR_IO_SYNC;
 	return __spin_trylock(lock) == 0;
 }
 
@@ -91,6 +105,7 @@ extern void __rw_yield(raw_rwlock_t *lock);
 
 static void __inline__ __raw_spin_lock(raw_spinlock_t *lock)
 {
+	CLEAR_IO_SYNC;
 	while (1) {
 		if (likely(__spin_trylock(lock) == 0))
 			break;
@@ -107,6 +122,7 @@ static void __inline__ __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long
 {
 	unsigned long flags_dis;
 
+	CLEAR_IO_SYNC;
 	while (1) {
 		if (likely(__spin_trylock(lock) == 0))
 			break;
@@ -124,6 +140,7 @@ static void __inline__ __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long
 
 static __inline__ void __raw_spin_unlock(raw_spinlock_t *lock)
 {
+	SYNC_IO;
 	__asm__ __volatile__("# __raw_spin_unlock\n\t"
 				LWSYNC_ON_SMP: : :"memory");
 	lock->slock = 0;
diff --git a/include/asm-ppc/io.h b/include/asm-ppc/io.h
index 89c6f1bc3aaba0eb27511d536eb0ffe5985001e7..680555be22ecaedd2075a14a736ed89d1346ee84 100644
--- a/include/asm-ppc/io.h
+++ b/include/asm-ppc/io.h
@@ -63,7 +63,7 @@ extern inline int in_8(const volatile unsigned char __iomem *addr)
 	int ret;
 
 	__asm__ __volatile__(
-		"lbz%U1%X1 %0,%1;\n"
+		"sync; lbz%U1%X1 %0,%1;\n"
 		"twi 0,%0,0;\n"
 		"isync" : "=r" (ret) : "m" (*addr));
 	return ret;
@@ -78,7 +78,7 @@ extern inline int in_le16(const volatile unsigned short __iomem *addr)
 {
 	int ret;
 
-	__asm__ __volatile__("lhbrx %0,0,%1;\n"
+	__asm__ __volatile__("sync; lhbrx %0,0,%1;\n"
 			     "twi 0,%0,0;\n"
 			     "isync" : "=r" (ret) :
 			      "r" (addr), "m" (*addr));
@@ -89,7 +89,7 @@ extern inline int in_be16(const volatile unsigned short __iomem *addr)
 {
 	int ret;
 
-	__asm__ __volatile__("lhz%U1%X1 %0,%1;\n"
+	__asm__ __volatile__("sync; lhz%U1%X1 %0,%1;\n"
 			     "twi 0,%0,0;\n"
 			     "isync" : "=r" (ret) : "m" (*addr));
 	return ret;
@@ -97,20 +97,20 @@ extern inline int in_be16(const volatile unsigned short __iomem *addr)
 
 extern inline void out_le16(volatile unsigned short __iomem *addr, int val)
 {
-	__asm__ __volatile__("sthbrx %1,0,%2; eieio" : "=m" (*addr) :
+	__asm__ __volatile__("sync; sthbrx %1,0,%2" : "=m" (*addr) :
 			      "r" (val), "r" (addr));
 }
 
 extern inline void out_be16(volatile unsigned short __iomem *addr, int val)
 {
-	__asm__ __volatile__("sth%U0%X0 %1,%0; eieio" : "=m" (*addr) : "r" (val));
+	__asm__ __volatile__("sync; sth%U0%X0 %1,%0" : "=m" (*addr) : "r" (val));
 }
 
 extern inline unsigned in_le32(const volatile unsigned __iomem *addr)
 {
 	unsigned ret;
 
-	__asm__ __volatile__("lwbrx %0,0,%1;\n"
+	__asm__ __volatile__("sync; lwbrx %0,0,%1;\n"
 			     "twi 0,%0,0;\n"
 			     "isync" : "=r" (ret) :
 			     "r" (addr), "m" (*addr));
@@ -121,7 +121,7 @@ extern inline unsigned in_be32(const volatile unsigned __iomem *addr)
 {
 	unsigned ret;
 
-	__asm__ __volatile__("lwz%U1%X1 %0,%1;\n"
+	__asm__ __volatile__("sync; lwz%U1%X1 %0,%1;\n"
 			     "twi 0,%0,0;\n"
 			     "isync" : "=r" (ret) : "m" (*addr));
 	return ret;
@@ -129,13 +129,13 @@ extern inline unsigned in_be32(const volatile unsigned __iomem *addr)
 
 extern inline void out_le32(volatile unsigned __iomem *addr, int val)
 {
-	__asm__ __volatile__("stwbrx %1,0,%2; eieio" : "=m" (*addr) :
+	__asm__ __volatile__("sync; stwbrx %1,0,%2" : "=m" (*addr) :
 			     "r" (val), "r" (addr));
 }
 
 extern inline void out_be32(volatile unsigned __iomem *addr, int val)
 {
-	__asm__ __volatile__("stw%U0%X0 %1,%0; eieio" : "=m" (*addr) : "r" (val));
+	__asm__ __volatile__("sync; stw%U0%X0 %1,%0" : "=m" (*addr) : "r" (val));
 }
 #if defined (CONFIG_8260_PCI9)
 #define readb(addr) in_8((volatile u8 *)(addr))
@@ -259,6 +259,7 @@ extern __inline__ unsigned int name(unsigned int port)	\
 {							\
 	unsigned int x;					\
 	__asm__ __volatile__(				\
+		"sync\n"				\
 		"0:"	op "	%0,0,%1\n"		\
 		"1:	twi	0,%0,0\n"		\
 		"2:	isync\n"			\
@@ -284,6 +285,7 @@ extern __inline__ unsigned int name(unsigned int port)	\
 extern __inline__ void name(unsigned int val, unsigned int port) \
 {							\
 	__asm__ __volatile__(				\
+		"sync\n"				\
 		"0:" op " %0,0,%1\n"			\
 		"1:	sync\n"				\
 		"2:\n"					\