diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index f4e25c648fbbb977ed53d36489a8086bb474ecd9..ca7acb0c79f0cc79ab989680a853d83403e2a990 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -603,6 +603,15 @@ config NODES_SPAN_OTHER_NODES
 	def_bool y
 	depends on NEED_MULTIPLE_NODES
 
+config PPC_64K_PAGES
+	bool "64k page size"
+	help
+	  This option changes the kernel logical page size to 64k. On machines
+	  without processor support for 64k pages, the kernel will simulate
+	  them by loading each individual 4k page on demand transparently,
+	  while on hardware with such support, it will be used to map
+	  normal application pages.
+
 config SCHED_SMT
 	bool "SMT (Hyperthreading) scheduler support"
 	depends on PPC64 && SMP
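This option drives a split between the kernel's logical page size and the
hardware page size. A rough sketch of the constants involved (illustrative;
the actual definitions live in the page.h changes of this series):

	#ifdef CONFIG_PPC_64K_PAGES
	#define PAGE_SHIFT	16	/* 64k logical pages */
	#else
	#define PAGE_SHIFT	12	/* 4k logical pages */
	#endif
	#define PAGE_SIZE	(1UL << PAGE_SHIFT)

	/* The hardware may still only do 4k HPTEs: */
	#define HW_PAGE_SHIFT	12
	#define HW_PAGE_SIZE	(1UL << HW_PAGE_SHIFT)

Several hunks below (lparmap.c, setup_64.c) switch from PAGE_* to the
HW_PAGE_* variants exactly where the hardware page size is what matters.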
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index bc5a3689cc05192c8a8129e72d735e1d50eb7558..b75757251994ec925617658e0eb7d3e3415b3f99 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -125,6 +125,9 @@ int main(void)
 	DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache));
 	DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
 	DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
+#ifdef CONFIG_PPC_64K_PAGES
+	DEFINE(PACAPGDIR, offsetof(struct paca_struct, pgdir));
+#endif
 #ifdef CONFIG_HUGETLB_PAGE
 	DEFINE(PACALOWHTLBAREAS, offsetof(struct paca_struct, context.low_htlb_areas));
 	DEFINE(PACAHIGHHTLBAREAS, offsetof(struct paca_struct, context.high_htlb_areas));
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index b91345fa0805fd36b5874c23d2b072f328522f25..33c63bcf69f8e1670736f0dca9452c91a1774e03 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -240,7 +240,7 @@ struct cpu_spec	cpu_specs[] = {
 		.oprofile_model		= &op_model_power4,
 #endif
 	},
-	{	/* Power5 */
+	{	/* Power5 GR */
 		.pvr_mask		= 0xffff0000,
 		.pvr_value		= 0x003a0000,
 		.cpu_name		= "POWER5 (gr)",
@@ -255,7 +255,7 @@ struct cpu_spec	cpu_specs[] = {
 		.oprofile_model		= &op_model_power4,
 #endif
 	},
-	{	/* Power5 */
+	{	/* Power5 GS */
 		.pvr_mask		= 0xffff0000,
 		.pvr_value		= 0x003b0000,
 		.cpu_name		= "POWER5 (gs)",
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 45d81976987fa2189adc1f6d071da6e1da9716df..16ab40daa73852b1f8665888edc882cd69d6997b 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -195,11 +195,11 @@ exception_marker:
 #define EX_R12		24
 #define EX_R13		32
 #define EX_SRR0		40
-#define EX_R3		40	/* SLB miss saves R3, but not SRR0 */
 #define EX_DAR		48
-#define EX_LR		48	/* SLB miss saves LR, but not DAR */
 #define EX_DSISR	56
 #define EX_CCR		60
+#define EX_R3		64
+#define EX_LR		72
 
 #define EXCEPTION_PROLOG_PSERIES(area, label)				\
 	mfspr	r13,SPRN_SPRG3;		/* get paca address into r13 */	\
@@ -419,17 +419,22 @@ data_access_slb_pSeries:
 	mtspr	SPRN_SPRG1,r13
 	RUNLATCH_ON(r13)
 	mfspr	r13,SPRN_SPRG3		/* get paca address into r13 */
+	std	r3,PACA_EXSLB+EX_R3(r13)
+	mfspr	r3,SPRN_DAR
 	std	r9,PACA_EXSLB+EX_R9(r13)	/* save r9 - r12 */
+	mfcr	r9
+#ifdef __DISABLED__
+	/* Keep that around for when we re-implement dynamic VSIDs */
+	cmpdi	r3,0
+	bge	slb_miss_user_pseries
+#endif /* __DISABLED__ */
 	std	r10,PACA_EXSLB+EX_R10(r13)
 	std	r11,PACA_EXSLB+EX_R11(r13)
 	std	r12,PACA_EXSLB+EX_R12(r13)
-	std	r3,PACA_EXSLB+EX_R3(r13)
-	mfspr	r9,SPRN_SPRG1
-	std	r9,PACA_EXSLB+EX_R13(r13)
-	mfcr	r9
+	mfspr	r10,SPRN_SPRG1
+	std	r10,PACA_EXSLB+EX_R13(r13)
 	mfspr	r12,SPRN_SRR1		/* and SRR1 */
-	mfspr	r3,SPRN_DAR
-	b	.do_slb_miss		/* Rel. branch works in real mode */
+	b	.slb_miss_realmode	/* Rel. branch works in real mode */
 
 	STD_EXCEPTION_PSERIES(0x400, instruction_access)
 
@@ -440,17 +445,22 @@ instruction_access_slb_pSeries:
 	mtspr	SPRN_SPRG1,r13
 	RUNLATCH_ON(r13)
 	mfspr	r13,SPRN_SPRG3		/* get paca address into r13 */
+	std	r3,PACA_EXSLB+EX_R3(r13)
+	mfspr	r3,SPRN_SRR0		/* SRR0 is faulting address */
 	std	r9,PACA_EXSLB+EX_R9(r13)	/* save r9 - r12 */
+	mfcr	r9
+#ifdef __DISABLED__
+	/* Keep that around for when we re-implement dynamic VSIDs */
+	cmpdi	r3,0
+	bge	slb_miss_user_pseries
+#endif /* __DISABLED__ */
 	std	r10,PACA_EXSLB+EX_R10(r13)
 	std	r11,PACA_EXSLB+EX_R11(r13)
 	std	r12,PACA_EXSLB+EX_R12(r13)
-	std	r3,PACA_EXSLB+EX_R3(r13)
-	mfspr	r9,SPRN_SPRG1
-	std	r9,PACA_EXSLB+EX_R13(r13)
-	mfcr	r9
+	mfspr	r10,SPRN_SPRG1
+	std	r10,PACA_EXSLB+EX_R13(r13)
 	mfspr	r12,SPRN_SRR1		/* and SRR1 */
-	mfspr	r3,SPRN_SRR0			/* SRR0 is faulting address */
-	b	.do_slb_miss		/* Rel. branch works in real mode */
+	b	.slb_miss_realmode	/* Rel. branch works in real mode */
 
 	STD_EXCEPTION_PSERIES(0x500, hardware_interrupt)
 	STD_EXCEPTION_PSERIES(0x600, alignment)
@@ -508,6 +518,38 @@ _GLOBAL(do_stab_bolted_pSeries)
 	mfspr	r12,SPRN_SPRG2
 	EXCEPTION_PROLOG_PSERIES(PACA_EXSLB, .do_stab_bolted)
 
+/*
+ * We have some room here; we use it to put
+ * the pSeries SLB miss user trampoline code so it's reasonably
+ * far away from slb_miss_user_common to avoid problems with rfid
+ *
+ * This is used when the SLB miss handler has to go virtual,
+ * which doesn't happen for now but will once we re-implement
+ * dynamic VSIDs for shared page tables
+ */
+#ifdef __DISABLED__
+slb_miss_user_pseries:
+	std	r10,PACA_EXGEN+EX_R10(r13)
+	std	r11,PACA_EXGEN+EX_R11(r13)
+	std	r12,PACA_EXGEN+EX_R12(r13)
+	mfspr	r10,SPRG1
+	ld	r11,PACA_EXSLB+EX_R9(r13)
+	ld	r12,PACA_EXSLB+EX_R3(r13)
+	std	r10,PACA_EXGEN+EX_R13(r13)
+	std	r11,PACA_EXGEN+EX_R9(r13)
+	std	r12,PACA_EXGEN+EX_R3(r13)
+	clrrdi	r12,r13,32
+	mfmsr	r10
+	mfspr	r11,SRR0			/* save SRR0 */
+	ori	r12,r12,slb_miss_user_common@l	/* virt addr of handler */
+	ori	r10,r10,MSR_IR|MSR_DR|MSR_RI
+	mtspr	SRR0,r12
+	mfspr	r12,SRR1			/* and SRR1 */
+	mtspr	SRR1,r10
+	rfid
+	b	.				/* prevent spec. execution */
+#endif /* __DISABLED__ */
+
 /*
  * Vectors for the FWNMI option.  Share common code.
  */
@@ -559,22 +601,59 @@ END_FTR_SECTION_IFCLR(CPU_FTR_SLB)
 	.globl	data_access_slb_iSeries
 data_access_slb_iSeries:
 	mtspr	SPRN_SPRG1,r13		/* save r13 */
-	EXCEPTION_PROLOG_ISERIES_1(PACA_EXSLB)
+	mfspr	r13,SPRN_SPRG3		/* get paca address into r13 */
 	std	r3,PACA_EXSLB+EX_R3(r13)
-	ld	r12,PACALPPACA+LPPACASRR1(r13)
 	mfspr	r3,SPRN_DAR
-	b	.do_slb_miss
+	std	r9,PACA_EXSLB+EX_R9(r13)
+	mfcr	r9
+#ifdef __DISABLED__
+	cmpdi	r3,0
+	bge	slb_miss_user_iseries
+#endif
+	std	r10,PACA_EXSLB+EX_R10(r13)
+	std	r11,PACA_EXSLB+EX_R11(r13)
+	std	r12,PACA_EXSLB+EX_R12(r13)
+	mfspr	r10,SPRN_SPRG1
+	std	r10,PACA_EXSLB+EX_R13(r13)
+	ld	r12,PACALPPACA+LPPACASRR1(r13)
+	b	.slb_miss_realmode
 
 	STD_EXCEPTION_ISERIES(0x400, instruction_access, PACA_EXGEN)
 
 	.globl	instruction_access_slb_iSeries
 instruction_access_slb_iSeries:
 	mtspr	SPRN_SPRG1,r13		/* save r13 */
-	EXCEPTION_PROLOG_ISERIES_1(PACA_EXSLB)
+	mfspr	r13,SPRN_SPRG3		/* get paca address into r13 */
 	std	r3,PACA_EXSLB+EX_R3(r13)
-	ld	r12,PACALPPACA+LPPACASRR1(r13)
-	ld	r3,PACALPPACA+LPPACASRR0(r13)
-	b	.do_slb_miss
+	ld	r3,PACALPPACA+LPPACASRR0(r13)	/* get SRR0 value */
+	std	r9,PACA_EXSLB+EX_R9(r13)
+	mfcr	r9
+#ifdef __DISABLED__
+	cmpdi	r3,0
+	bge	.slb_miss_user_iseries
+#endif
+	std	r10,PACA_EXSLB+EX_R10(r13)
+	std	r11,PACA_EXSLB+EX_R11(r13)
+	std	r12,PACA_EXSLB+EX_R12(r13)
+	mfspr	r10,SPRN_SPRG1
+	std	r10,PACA_EXSLB+EX_R13(r13)
+	ld	r12,PACALPPACA+LPPACASRR1(r13)
+	b	.slb_miss_realmode
+
+#ifdef __DISABLED__
+slb_miss_user_iseries:
+	std	r10,PACA_EXGEN+EX_R10(r13)
+	std	r11,PACA_EXGEN+EX_R11(r13)
+	std	r12,PACA_EXGEN+EX_R12(r13)
+	mfspr	r10,SPRG1
+	ld	r11,PACA_EXSLB+EX_R9(r13)
+	ld	r12,PACA_EXSLB+EX_R3(r13)
+	std	r10,PACA_EXGEN+EX_R13(r13)
+	std	r11,PACA_EXGEN+EX_R9(r13)
+	std	r12,PACA_EXGEN+EX_R3(r13)
+	EXCEPTION_PROLOG_ISERIES_2
+	b	slb_miss_user_common
+#endif
 
 	MASKABLE_EXCEPTION_ISERIES(0x500, hardware_interrupt)
 	STD_EXCEPTION_ISERIES(0x600, alignment, PACA_EXGEN)
@@ -809,6 +888,126 @@ instruction_access_common:
 	li	r5,0x400
 	b	.do_hash_page		/* Try to handle as hpte fault */
 
+/*
+ * Here is the common SLB miss user handler that is used when going to
+ * virtual mode for SLB misses. It is currently not used.
+ */
+#ifdef __DISABLED__
+	.align	7
+	.globl	slb_miss_user_common
+slb_miss_user_common:
+	mflr	r10
+	std	r3,PACA_EXGEN+EX_DAR(r13)
+	stw	r9,PACA_EXGEN+EX_CCR(r13)
+	std	r10,PACA_EXGEN+EX_LR(r13)
+	std	r11,PACA_EXGEN+EX_SRR0(r13)
+	bl	.slb_allocate_user
+
+	ld	r10,PACA_EXGEN+EX_LR(r13)
+	ld	r3,PACA_EXGEN+EX_R3(r13)
+	lwz	r9,PACA_EXGEN+EX_CCR(r13)
+	ld	r11,PACA_EXGEN+EX_SRR0(r13)
+	mtlr	r10
+	beq-	slb_miss_fault
+
+	andi.	r10,r12,MSR_RI		/* check for unrecoverable exception */
+	beq-	unrecov_user_slb
+	mfmsr	r10
+
+.machine push
+.machine "power4"
+	mtcrf	0x80,r9
+.machine pop
+
+	clrrdi	r10,r10,2		/* clear RI before setting SRR0/1 */
+	mtmsrd	r10,1
+
+	mtspr	SRR0,r11
+	mtspr	SRR1,r12
+
+	ld	r9,PACA_EXGEN+EX_R9(r13)
+	ld	r10,PACA_EXGEN+EX_R10(r13)
+	ld	r11,PACA_EXGEN+EX_R11(r13)
+	ld	r12,PACA_EXGEN+EX_R12(r13)
+	ld	r13,PACA_EXGEN+EX_R13(r13)
+	rfid
+	b	.
+
+slb_miss_fault:
+	EXCEPTION_PROLOG_COMMON(0x380, PACA_EXGEN)
+	ld	r4,PACA_EXGEN+EX_DAR(r13)
+	li	r5,0
+	std	r4,_DAR(r1)
+	std	r5,_DSISR(r1)
+	b	.handle_page_fault
+
+unrecov_user_slb:
+	EXCEPTION_PROLOG_COMMON(0x4200, PACA_EXGEN)
+	DISABLE_INTS
+	bl	.save_nvgprs
+1:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.unrecoverable_exception
+	b	1b
+
+#endif /* __DISABLED__ */
+
+
+/*
+ * r13 points to the PACA, r9 contains the saved CR,
+ * r12 contains the saved SRR1, SRR0 is still ready for return
+ * r3 has the faulting address
+ * r9 - r13 are saved in paca->exslb.
+ * r3 is saved in paca->exslb as well (EX_R3)
+ * We assume we aren't going to take any exceptions during this procedure.
+ */
+_GLOBAL(slb_miss_realmode)
+	mflr	r10
+
+	stw	r9,PACA_EXSLB+EX_CCR(r13)	/* save CR in exc. frame */
+	std	r10,PACA_EXSLB+EX_LR(r13)	/* save LR */
+
+	bl	.slb_allocate_realmode
+
+	/* All done -- return from exception. */
+
+	ld	r10,PACA_EXSLB+EX_LR(r13)
+	ld	r3,PACA_EXSLB+EX_R3(r13)
+	lwz	r9,PACA_EXSLB+EX_CCR(r13)	/* get saved CR */
+#ifdef CONFIG_PPC_ISERIES
+	ld	r11,PACALPPACA+LPPACASRR0(r13)	/* get SRR0 value */
+#endif /* CONFIG_PPC_ISERIES */
+
+	mtlr	r10
+
+	andi.	r10,r12,MSR_RI	/* check for unrecoverable exception */
+	beq-	unrecov_slb
+
+.machine	push
+.machine	"power4"
+	mtcrf	0x80,r9
+	mtcrf	0x01,r9		/* slb_allocate uses cr0 and cr7 */
+.machine	pop
+
+#ifdef CONFIG_PPC_ISERIES
+	mtspr	SPRN_SRR0,r11
+	mtspr	SPRN_SRR1,r12
+#endif /* CONFIG_PPC_ISERIES */
+	ld	r9,PACA_EXSLB+EX_R9(r13)
+	ld	r10,PACA_EXSLB+EX_R10(r13)
+	ld	r11,PACA_EXSLB+EX_R11(r13)
+	ld	r12,PACA_EXSLB+EX_R12(r13)
+	ld	r13,PACA_EXSLB+EX_R13(r13)
+	rfid
+	b	.	/* prevent speculative execution */
+
+unrecov_slb:
+	EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB)
+	DISABLE_INTS
+	bl	.save_nvgprs
+1:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.unrecoverable_exception
+	b	1b
+
 	.align	7
 	.globl hardware_interrupt_common
 	.globl hardware_interrupt_entry
@@ -1138,62 +1337,6 @@ _GLOBAL(do_stab_bolted)
 	rfid
 	b	.	/* prevent speculative execution */
 
-/*
- * r13 points to the PACA, r9 contains the saved CR,
- * r11 and r12 contain the saved SRR0 and SRR1.
- * r3 has the faulting address
- * r9 - r13 are saved in paca->exslb.
- * r3 is saved in paca->slb_r3
- * We assume we aren't going to take any exceptions during this procedure.
- */
-_GLOBAL(do_slb_miss)
-	mflr	r10
-
-	stw	r9,PACA_EXSLB+EX_CCR(r13)	/* save CR in exc. frame */
-	std	r10,PACA_EXSLB+EX_LR(r13)	/* save LR */
-
-	bl	.slb_allocate			/* handle it */
-
-	/* All done -- return from exception. */
-
-	ld	r10,PACA_EXSLB+EX_LR(r13)
-	ld	r3,PACA_EXSLB+EX_R3(r13)
-	lwz	r9,PACA_EXSLB+EX_CCR(r13)	/* get saved CR */
-#ifdef CONFIG_PPC_ISERIES
-	ld	r11,PACALPPACA+LPPACASRR0(r13)	/* get SRR0 value */
-#endif /* CONFIG_PPC_ISERIES */
-
-	mtlr	r10
-
-	andi.	r10,r12,MSR_RI	/* check for unrecoverable exception */
-	beq-	unrecov_slb
-
-.machine	push
-.machine	"power4"
-	mtcrf	0x80,r9
-	mtcrf	0x01,r9		/* slb_allocate uses cr0 and cr7 */
-.machine	pop
-
-#ifdef CONFIG_PPC_ISERIES
-	mtspr	SPRN_SRR0,r11
-	mtspr	SPRN_SRR1,r12
-#endif /* CONFIG_PPC_ISERIES */
-	ld	r9,PACA_EXSLB+EX_R9(r13)
-	ld	r10,PACA_EXSLB+EX_R10(r13)
-	ld	r11,PACA_EXSLB+EX_R11(r13)
-	ld	r12,PACA_EXSLB+EX_R12(r13)
-	ld	r13,PACA_EXSLB+EX_R13(r13)
-	rfid
-	b	.	/* prevent speculative execution */
-
-unrecov_slb:
-	EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB)
-	DISABLE_INTS
-	bl	.save_nvgprs
-1:	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	.unrecoverable_exception
-	b	1b
-
 /*
  * Space for CPU0's segment table.
  *
@@ -1569,7 +1712,10 @@ _GLOBAL(__secondary_start)
 #endif
 	/* Initialize the first segment table (or SLB) entry		 */
 	ld	r3,PACASTABVIRT(r13)	/* get addr of segment table	 */
+BEGIN_FTR_SECTION
 	bl	.stab_initialize
+END_FTR_SECTION_IFCLR(CPU_FTR_SLB)
+	bl	.slb_initialize
 
 	/* Initialize the kernel stack.  Just a repeat for iSeries.	 */
 	LOADADDR(r3,current_set)
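A note on the EX_R3/EX_LR change at the top of this file: those offsets used
to alias EX_SRR0/EX_DAR ("SLB miss saves R3, but not SRR0"), which is no
longer safe now that the SLB miss path may want SRR0/DAR alongside R3/LR.
The resulting per-exception save area, sketched from the #defines above (the
paca's exslb/exgen arrays must be sized to match):

	/* byte offsets within a PACA exception save area:
	 *   0:R9   8:R10  16:R11  24:R12  32:R13
	 *  40:SRR0 48:DAR 56:DSISR 60:CCR 64:R3  72:LR
	 * i.e. the area grows from 64 to 80 bytes            */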
diff --git a/arch/powerpc/kernel/lparmap.c b/arch/powerpc/kernel/lparmap.c
index eded971d1bf9317074f4744084b597a0b6a4988c..5a05a797485fea898e35b215501dd1f2e85c1a76 100644
--- a/arch/powerpc/kernel/lparmap.c
+++ b/arch/powerpc/kernel/lparmap.c
@@ -25,7 +25,7 @@ const struct LparMap __attribute__((__section__(".text"))) xLparMap = {
 	.xRanges = {
 		{ .xPages = HvPagesToMap,
 		  .xOffset = 0,
-		  .xVPN = KERNEL_VSID(KERNELBASE) << (SID_SHIFT - PAGE_SHIFT),
+		  .xVPN = KERNEL_VSID(KERNELBASE) << (SID_SHIFT - HW_PAGE_SHIFT),
 		},
 	},
 };
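The xVPN fix matters because the hypervisor counts virtual page numbers in
hardware (4k) pages regardless of the kernel's logical page size. Worked
through with the usual values (SID_SHIFT = 28, HW_PAGE_SHIFT = 12):

	/* 1 << (28 - 12) = 65536 hardware pages per 256MB segment, so
	 * xVPN = KERNEL_VSID(KERNELBASE) * 65536, the VPN of the first
	 * 4k page of the kernel segment.  With PAGE_SHIFT = 16 the old
	 * expression would have been low by a factor of 16.           */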
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 96843211cc5c3187ff09fbe8c6690626f9792f93..7f64f0464d446997756c4fae4c903665d717ea19 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -554,12 +554,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long usp,
 #ifdef CONFIG_PPC64
 	if (cpu_has_feature(CPU_FTR_SLB)) {
 		unsigned long sp_vsid = get_kernel_vsid(sp);
+		unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp;
 
 		sp_vsid <<= SLB_VSID_SHIFT;
-		sp_vsid |= SLB_VSID_KERNEL;
-		if (cpu_has_feature(CPU_FTR_16M_PAGE))
-			sp_vsid |= SLB_VSID_L;
-
+		sp_vsid |= SLB_VSID_KERNEL | llp;
 		p->thread.ksp_vsid = sp_vsid;
 	}
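mmu_psize_defs[] is the per-page-size table introduced by this series;
copy_thread() now pulls the SLB encoding for the kernel stack segment from
it instead of special-casing 16M pages. A sketch of the entry consulted here
(field names as used elsewhere in the series):

	struct mmu_psize_def {
		unsigned int	shift;	/* number of bits */
		unsigned int	penc;	/* HPTE encoding */
		unsigned int	tlbiel;	/* tlbiel supported for that page size */
		unsigned long	avpnm;	/* bits to mask out in AVPN in the HPTE */
		unsigned long	sllp;	/* SLB L||LP (exact mask to use in slbmte) */
	};
	extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];

Since sllp carries both the old SLB_VSID_L bit and the new LP bits, the
CPU_FTR_16M_PAGE test can simply go away.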
 
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index eec2da695508ce27480037dbf9d4e81d58d9073e..3675ef4bac90f09c4e12b5d02580beecafa6ea40 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -724,10 +724,10 @@ static inline char *find_flat_dt_string(u32 offset)
  * used to extract the memory informations at boot before we can
  * unflatten the tree
  */
-static int __init scan_flat_dt(int (*it)(unsigned long node,
-					 const char *uname, int depth,
-					 void *data),
-			       void *data)
+int __init of_scan_flat_dt(int (*it)(unsigned long node,
+				     const char *uname, int depth,
+				     void *data),
+			   void *data)
 {
 	unsigned long p = ((unsigned long)initial_boot_params) +
 		initial_boot_params->off_dt_struct;
@@ -784,8 +784,8 @@ static int __init scan_flat_dt(int (*it)(unsigned long node,
  * This  function can be used within scan_flattened_dt callback to get
  * access to properties
  */
-static void* __init get_flat_dt_prop(unsigned long node, const char *name,
-				     unsigned long *size)
+void* __init of_get_flat_dt_prop(unsigned long node, const char *name,
+				 unsigned long *size)
 {
 	unsigned long p = node;
 
@@ -1087,7 +1087,7 @@ void __init unflatten_device_tree(void)
 static int __init early_init_dt_scan_cpus(unsigned long node,
 					  const char *uname, int depth, void *data)
 {
-	char *type = get_flat_dt_prop(node, "device_type", NULL);
+	char *type = of_get_flat_dt_prop(node, "device_type", NULL);
 	u32 *prop;
 	unsigned long size = 0;
 
@@ -1095,19 +1095,6 @@ static int __init early_init_dt_scan_cpus(unsigned long node,
 	if (type == NULL || strcmp(type, "cpu") != 0)
 		return 0;
 
-#ifdef CONFIG_PPC_PSERIES
-	/* On LPAR, look for the first ibm,pft-size property for the  hash table size
-	 */
-	if (systemcfg->platform == PLATFORM_PSERIES_LPAR && ppc64_pft_size == 0) {
-		u32 *pft_size;
-		pft_size = get_flat_dt_prop(node, "ibm,pft-size", NULL);
-		if (pft_size != NULL) {
-			/* pft_size[0] is the NUMA CEC cookie */
-			ppc64_pft_size = pft_size[1];
-		}
-	}
-#endif
-
 	boot_cpuid = 0;
 	boot_cpuid_phys = 0;
 	if (initial_boot_params && initial_boot_params->version >= 2) {
@@ -1117,8 +1104,9 @@ static int __init early_init_dt_scan_cpus(unsigned long node,
 		boot_cpuid_phys = initial_boot_params->boot_cpuid_phys;
 	} else {
 		/* Check if it's the boot-cpu, set it's hw index now */
-		if (get_flat_dt_prop(node, "linux,boot-cpu", NULL) != NULL) {
-			prop = get_flat_dt_prop(node, "reg", NULL);
+		if (of_get_flat_dt_prop(node,
+					"linux,boot-cpu", NULL) != NULL) {
+			prop = of_get_flat_dt_prop(node, "reg", NULL);
 			if (prop != NULL)
 				boot_cpuid_phys = *prop;
 		}
@@ -1127,14 +1115,14 @@ static int __init early_init_dt_scan_cpus(unsigned long node,
 
 #ifdef CONFIG_ALTIVEC
 	/* Check if we have a VMX and eventually update CPU features */
-	prop = (u32 *)get_flat_dt_prop(node, "ibm,vmx", &size);
+	prop = (u32 *)of_get_flat_dt_prop(node, "ibm,vmx", &size);
 	if (prop && (*prop) > 0) {
 		cur_cpu_spec->cpu_features |= CPU_FTR_ALTIVEC;
 		cur_cpu_spec->cpu_user_features |= PPC_FEATURE_HAS_ALTIVEC;
 	}
 
 	/* Same goes for Apple's "altivec" property */
-	prop = (u32 *)get_flat_dt_prop(node, "altivec", NULL);
+	prop = (u32 *)of_get_flat_dt_prop(node, "altivec", NULL);
 	if (prop) {
 		cur_cpu_spec->cpu_features |= CPU_FTR_ALTIVEC;
 		cur_cpu_spec->cpu_user_features |= PPC_FEATURE_HAS_ALTIVEC;
@@ -1147,7 +1135,7 @@ static int __init early_init_dt_scan_cpus(unsigned long node,
 	 * this by looking at the size of the ibm,ppc-interrupt-server#s
 	 * property
 	 */
-	prop = (u32 *)get_flat_dt_prop(node, "ibm,ppc-interrupt-server#s",
+	prop = (u32 *)of_get_flat_dt_prop(node, "ibm,ppc-interrupt-server#s",
 				       &size);
 	cur_cpu_spec->cpu_features &= ~CPU_FTR_SMT;
 	if (prop && ((size / sizeof(u32)) > 1))
@@ -1170,7 +1158,7 @@ static int __init early_init_dt_scan_chosen(unsigned long node,
 		return 0;
 
 	/* get platform type */
-	prop = (u32 *)get_flat_dt_prop(node, "linux,platform", NULL);
+	prop = (u32 *)of_get_flat_dt_prop(node, "linux,platform", NULL);
 	if (prop == NULL)
 		return 0;
 #ifdef CONFIG_PPC64
@@ -1183,21 +1171,21 @@ static int __init early_init_dt_scan_chosen(unsigned long node,
 
 #ifdef CONFIG_PPC64
 	/* check if iommu is forced on or off */
-	if (get_flat_dt_prop(node, "linux,iommu-off", NULL) != NULL)
+	if (of_get_flat_dt_prop(node, "linux,iommu-off", NULL) != NULL)
 		iommu_is_off = 1;
-	if (get_flat_dt_prop(node, "linux,iommu-force-on", NULL) != NULL)
+	if (of_get_flat_dt_prop(node, "linux,iommu-force-on", NULL) != NULL)
 		iommu_force_on = 1;
 #endif
 
- 	lprop = get_flat_dt_prop(node, "linux,memory-limit", NULL);
+ 	lprop = of_get_flat_dt_prop(node, "linux,memory-limit", NULL);
  	if (lprop)
  		memory_limit = *lprop;
 
 #ifdef CONFIG_PPC64
- 	lprop = get_flat_dt_prop(node, "linux,tce-alloc-start", NULL);
+ 	lprop = of_get_flat_dt_prop(node, "linux,tce-alloc-start", NULL);
  	if (lprop)
  		tce_alloc_start = *lprop;
- 	lprop = get_flat_dt_prop(node, "linux,tce-alloc-end", NULL);
+ 	lprop = of_get_flat_dt_prop(node, "linux,tce-alloc-end", NULL);
  	if (lprop)
  		tce_alloc_end = *lprop;
 #endif
@@ -1209,9 +1197,9 @@ static int __init early_init_dt_scan_chosen(unsigned long node,
 	{
 		u64 *basep, *entryp;
 
-		basep = get_flat_dt_prop(node, "linux,rtas-base", NULL);
-		entryp = get_flat_dt_prop(node, "linux,rtas-entry", NULL);
-		prop = get_flat_dt_prop(node, "linux,rtas-size", NULL);
+		basep = of_get_flat_dt_prop(node, "linux,rtas-base", NULL);
+		entryp = of_get_flat_dt_prop(node, "linux,rtas-entry", NULL);
+		prop = of_get_flat_dt_prop(node, "linux,rtas-size", NULL);
 		if (basep && entryp && prop) {
 			rtas.base = *basep;
 			rtas.entry = *entryp;
@@ -1232,11 +1220,11 @@ static int __init early_init_dt_scan_root(unsigned long node,
 	if (depth != 0)
 		return 0;
 
-	prop = get_flat_dt_prop(node, "#size-cells", NULL);
+	prop = of_get_flat_dt_prop(node, "#size-cells", NULL);
 	dt_root_size_cells = (prop == NULL) ? 1 : *prop;
 	DBG("dt_root_size_cells = %x\n", dt_root_size_cells);
 
-	prop = get_flat_dt_prop(node, "#address-cells", NULL);
+	prop = of_get_flat_dt_prop(node, "#address-cells", NULL);
 	dt_root_addr_cells = (prop == NULL) ? 2 : *prop;
 	DBG("dt_root_addr_cells = %x\n", dt_root_addr_cells);
 	
@@ -1271,7 +1259,7 @@ static unsigned long __init dt_mem_next_cell(int s, cell_t **cellp)
 static int __init early_init_dt_scan_memory(unsigned long node,
 					    const char *uname, int depth, void *data)
 {
-	char *type = get_flat_dt_prop(node, "device_type", NULL);
+	char *type = of_get_flat_dt_prop(node, "device_type", NULL);
 	cell_t *reg, *endp;
 	unsigned long l;
 
@@ -1279,7 +1267,7 @@ static int __init early_init_dt_scan_memory(unsigned long node,
 	if (type == NULL || strcmp(type, "memory") != 0)
 		return 0;
 
-	reg = (cell_t *)get_flat_dt_prop(node, "reg", &l);
+	reg = (cell_t *)of_get_flat_dt_prop(node, "reg", &l);
 	if (reg == NULL)
 		return 0;
 
@@ -1343,12 +1331,12 @@ void __init early_init_devtree(void *params)
 	 * device-tree, including the platform type, initrd location and
 	 * size, TCE reserve, and more ...
 	 */
-	scan_flat_dt(early_init_dt_scan_chosen, NULL);
+	of_scan_flat_dt(early_init_dt_scan_chosen, NULL);
 
 	/* Scan memory nodes and rebuild LMBs */
 	lmb_init();
-	scan_flat_dt(early_init_dt_scan_root, NULL);
-	scan_flat_dt(early_init_dt_scan_memory, NULL);
+	of_scan_flat_dt(early_init_dt_scan_root, NULL);
+	of_scan_flat_dt(early_init_dt_scan_memory, NULL);
 	lmb_enforce_memory_limit(memory_limit);
 	lmb_analyze();
 #ifdef CONFIG_PPC64
@@ -1363,10 +1351,10 @@ void __init early_init_devtree(void *params)
 
 	DBG("Scanning CPUs ...\n");
 
-	/* Retreive hash table size from flattened tree plus other
-	 * CPU related informations (altivec support, boot CPU ID, ...)
+	/* Retrieve CPU related information from the flat tree
+	 * (altivec support, boot CPU ID, ...)
 	 */
-	scan_flat_dt(early_init_dt_scan_cpus, NULL);
+	of_scan_flat_dt(early_init_dt_scan_cpus, NULL);
 
 	DBG(" <- early_init_devtree()\n");
 }
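With the flat-tree walkers renamed and exported (of_ prefix), early MMU
setup code elsewhere in the series can scan the tree itself, which is also
why the ibm,pft-size lookup moves out of this file. A minimal hypothetical
caller, following the pattern of the scanners above:

	static int __init early_init_dt_scan_example(unsigned long node,
						     const char *uname,
						     int depth, void *data)
	{
		char *type = of_get_flat_dt_prop(node, "device_type", NULL);

		if (type == NULL || strcmp(type, "cpu") != 0)
			return 0;	/* keep walking */
		/* ... read properties with of_get_flat_dt_prop() ... */
		return 1;		/* non-zero stops the scan */
	}

	/* from early boot, before the tree is unflattened: */
	of_scan_flat_dt(early_init_dt_scan_example, NULL);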
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 6b52cce872bef47ade033cd9d6853c9fcf90545b..b0994050024ff0c127dfece85a4e42ade8255387 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -277,16 +277,21 @@ void __init early_setup(unsigned long dt_ptr)
 	DBG("Found, Initializing memory management...\n");
 
 	/*
-	 * Initialize stab / SLB management
+	 * Initialize the MMU Hash table and create the linear mapping
+	 * of memory. Has to be done before stab/slb initialization as
+	 * this is currently where the page size encoding is obtained
 	 */
-	if (!firmware_has_feature(FW_FEATURE_ISERIES))
-		stab_initialize(lpaca->stab_real);
+	htab_initialize();
 
 	/*
-	 * Initialize the MMU Hash table and create the linear mapping
-	 * of memory
+	 * Initialize stab / SLB management except on iSeries
 	 */
-	htab_initialize();
+	if (!firmware_has_feature(FW_FEATURE_ISERIES)) {
+		if (cpu_has_feature(CPU_FTR_SLB))
+			slb_initialize();
+		else
+			stab_initialize(lpaca->stab_real);
+	}
 
 	DBG(" <- early_setup()\n");
 }
@@ -552,10 +557,12 @@ static void __init irqstack_early_init(void)
 	 * SLB misses on them.
 	 */
 	for_each_cpu(i) {
-		softirq_ctx[i] = (struct thread_info *)__va(lmb_alloc_base(THREAD_SIZE,
-					THREAD_SIZE, 0x10000000));
-		hardirq_ctx[i] = (struct thread_info *)__va(lmb_alloc_base(THREAD_SIZE,
-					THREAD_SIZE, 0x10000000));
+		softirq_ctx[i] = (struct thread_info *)
+			__va(lmb_alloc_base(THREAD_SIZE,
+					    THREAD_SIZE, 0x10000000));
+		hardirq_ctx[i] = (struct thread_info *)
+			__va(lmb_alloc_base(THREAD_SIZE,
+					    THREAD_SIZE, 0x10000000));
 	}
 }
 #else
@@ -583,8 +590,8 @@ static void __init emergency_stack_init(void)
 	limit = min(0x10000000UL, lmb.rmo_size);
 
 	for_each_cpu(i)
-		paca[i].emergency_sp = __va(lmb_alloc_base(PAGE_SIZE, 128,
-						limit)) + PAGE_SIZE;
+		paca[i].emergency_sp =
+		__va(lmb_alloc_base(HW_PAGE_SIZE, 128, limit)) + HW_PAGE_SIZE;
 }
 
 /*
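The reordering in early_setup() encodes a new dependency: slb_initialize()
needs the page-size encodings that htab_initialize() has just probed.
Roughly what the SLB side consumes (a sketch; create_slbe() and
mmu_linear_psize are assumed from the slb.c/hash_utils changes in this
series):

	/* inside slb_initialize(), after this series (sketch): */
	unsigned long lflags = SLB_VSID_KERNEL |
			       mmu_psize_defs[mmu_linear_psize].sllp;
	create_slbe(KERNELBASE, lflags, 0);	/* bolted kernel entry */

The emergency-stack hunk switches to HW_PAGE_SIZE for a similar reason: one
hardware page is all that is wanted there, independent of the logical page
size.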
diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S
index 733d61618bbf91cb67ad72b800e01e23e1a977f7..40523b140109e6455eaa162542fa98b4b4bfbf7b 100644
--- a/arch/powerpc/lib/copypage_64.S
+++ b/arch/powerpc/lib/copypage_64.S
@@ -11,7 +11,7 @@
 #include <asm/processor.h>
 #include <asm/ppc_asm.h>
 
-_GLOBAL(copy_page)
+_GLOBAL(copy_4K_page)
 	std	r31,-8(1)
 	std	r30,-16(1)
 	std	r29,-24(1)
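The rename frees the copy_page name for a 64k logical wrapper built on the
unchanged 4k routine. A hypothetical wrapper (illustrative only; the real
series wires this up in the page headers):

	extern void copy_4K_page(void *to, void *from);

	static inline void copy_page(void *to, void *from)
	{
	#ifdef CONFIG_PPC_64K_PAGES
		int i;

		/* a 64k logical page is 16 consecutive 4k hardware pages */
		for (i = 0; i < 16; i++)
			copy_4K_page(to + i * 4096, from + i * 4096);
	#else
		copy_4K_page(to, from);
	#endif
	}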
diff --git a/arch/powerpc/lib/copyuser_64.S b/arch/powerpc/lib/copyuser_64.S
index a0b3fbbd6fb17a0b140c73428b06d2f58f3c8f4b..6d69ef39b7df7ca6ed0ac621f0d1b86deb717a0a 100644
--- a/arch/powerpc/lib/copyuser_64.S
+++ b/arch/powerpc/lib/copyuser_64.S
@@ -24,7 +24,7 @@ _GLOBAL(__copy_tofrom_user)
 	std	r4,-16(r1)
 	std	r5,-8(r1)
 	dcbt	0,r4
-	beq	.Lcopy_page
+	beq	.Lcopy_page_4K
 	andi.	r6,r6,7
 	mtcrf	0x01,r5
 	blt	cr1,.Lshort_copy
@@ -366,7 +366,7 @@ _GLOBAL(__copy_tofrom_user)
  * above (following the .Ldst_aligned label) but it runs slightly
  * slower on POWER3.
  */
-.Lcopy_page:
+.Lcopy_page_4K:
 	std	r31,-32(1)
 	std	r30,-40(1)
 	std	r29,-48(1)
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
index d6ed9102eeea7a9a0c44b29a19b93cafd957b9c8..e0d02c4a2615f8787abc1b13737f7b78e67bc5a5 100644
--- a/arch/powerpc/mm/hash_low_64.S
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -1,7 +1,7 @@
 /*
  * ppc64 MMU hashtable management routines
  *
- * (c) Copyright IBM Corp. 2003
+ * (c) Copyright IBM Corp. 2003, 2005
  *
  * Maintained by: Benjamin Herrenschmidt
  *                <benh@kernel.crashing.org>
@@ -10,6 +10,7 @@
  * described in the kernel's COPYING file.
  */
 
+#include <linux/config.h>
 #include <asm/reg.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
@@ -42,14 +43,24 @@
 /* Save non-volatile offsets */
 #define STK_REG(i)	(112 + ((i)-14)*8)
 
+
+#ifndef CONFIG_PPC_64K_PAGES
+
+/*****************************************************************************
+ *                                                                           *
+ *           4K SW & 4K HW pages implementation                              *
+ *                                                                           *
+ *****************************************************************************/
+
+
 /*
- * _hash_page(unsigned long ea, unsigned long access, unsigned long vsid,
- *		pte_t *ptep, unsigned long trap, int local)
+ * _hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
+ *		 pte_t *ptep, unsigned long trap, int local)
  *
- * Adds a page to the hash table. This is the non-LPAR version for now
+ * Adds a 4K page to the hash table in a segment of 4K pages only
  */
 
-_GLOBAL(__hash_page)
+_GLOBAL(__hash_page_4K)
 	mflr	r0
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
@@ -88,7 +99,8 @@ _GLOBAL(__hash_page)
 	/* If so, just bail out and refault if needed. Someone else
 	 * is changing this PTE anyway and might hash it.
 	 */
-	bne-	bail_ok
+	bne-	htab_bail_ok
+
 	/* Prepare new PTE value (turn access RW into DIRTY, then
 	 * add BUSY,HASHPTE and ACCESSED)
 	 */
@@ -118,10 +130,10 @@ _GLOBAL(__hash_page)
 
 	/* Convert linux PTE bits into HW equivalents */
 	andi.	r3,r30,0x1fe		/* Get basic set of flags */
-	xori	r3,r3,HW_NO_EXEC	/* _PAGE_EXEC -> NOEXEC */
+	xori	r3,r3,HPTE_R_N		/* _PAGE_EXEC -> NOEXEC */
 	rlwinm	r0,r30,32-9+1,30,30	/* _PAGE_RW -> _PAGE_USER (r0) */
 	rlwinm	r4,r30,32-7+1,30,30	/* _PAGE_DIRTY -> _PAGE_USER (r4) */
-	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY -> r0 bit 30 */
+	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY -> r0 bit 30 */
 	andc	r0,r30,r0		/* r0 = pte & ~r0 */
 	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
 
@@ -158,19 +170,21 @@ htab_insert_pte:
 	andc	r30,r30,r0
 	ori	r30,r30,_PAGE_HASHPTE
 
-	/* page number in r5 */
-	rldicl	r5,r31,64-PTE_SHIFT,PTE_SHIFT
+	/* physical address r5 */
+	rldicl	r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
+	sldi	r5,r5,PAGE_SHIFT
 
 	/* Calculate primary group hash */
 	and	r0,r28,r27
-	rldicr	r3,r0,3,63-3	/* r0 = (hash & mask) << 3 */
+	rldicr	r3,r0,3,63-3		/* r3 = (hash & mask) << 3 */
 
 	/* Call ppc_md.hpte_insert */
-	ld	r7,STK_PARM(r4)(r1)	/* Retreive new pp bits */
+	ld	r6,STK_PARM(r4)(r1)	/* Retrieve new pp bits */
 	mr	r4,r29			/* Retreive va */
-	li	r6,0			/* no vflags */
+	li	r7,0			/* !bolted, !secondary */
+	li	r8,MMU_PAGE_4K		/* page size */
 _GLOBAL(htab_call_hpte_insert1)
-	bl	.			/* Will be patched by htab_finish_init() */
+	bl	.			/* Patched by htab_finish_init() */
 	cmpdi	0,r3,0
 	bge	htab_pte_insert_ok	/* Insertion successful */
 	cmpdi	0,r3,-2			/* Critical failure */
@@ -178,19 +192,21 @@ _GLOBAL(htab_call_hpte_insert1)
 
 	/* Now try secondary slot */
 	
-	/* page number in r5 */
-	rldicl	r5,r31,64-PTE_SHIFT,PTE_SHIFT
+	/* physical address r5 */
+	rldicl	r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
+	sldi	r5,r5,PAGE_SHIFT
 
 	/* Calculate secondary group hash */
 	andc	r0,r27,r28
 	rldicr	r3,r0,3,63-3	/* r0 = (~hash & mask) << 3 */
 	
 	/* Call ppc_md.hpte_insert */
-	ld	r7,STK_PARM(r4)(r1)	/* Retreive new pp bits */
+	ld	r6,STK_PARM(r4)(r1)	/* Retrieve new pp bits */
 	mr	r4,r29			/* Retreive va */
-	li	r6,HPTE_V_SECONDARY@l	/* secondary slot */
+	li	r7,HPTE_V_SECONDARY	/* !bolted, secondary */
+	li	r8,MMU_PAGE_4K		/* page size */
 _GLOBAL(htab_call_hpte_insert2)
-	bl	.			/* Will be patched by htab_finish_init() */
+	bl	.			/* Patched by htab_finish_init() */
 	cmpdi	0,r3,0
 	bge+	htab_pte_insert_ok	/* Insertion successful */
 	cmpdi	0,r3,-2			/* Critical failure */
@@ -207,14 +223,14 @@ _GLOBAL(htab_call_hpte_insert2)
 	rldicr	r3,r0,3,63-3	/* r0 = (hash & mask) << 3 */	
 	/* Call ppc_md.hpte_remove */
 _GLOBAL(htab_call_hpte_remove)
-	bl	.			/* Will be patched by htab_finish_init() */
+	bl	.			/* Patched by htab_finish_init() */
 
 	/* Try all again */
 	b	htab_insert_pte	
 
-bail_ok:
+htab_bail_ok:
 	li	r3,0
-	b	bail
+	b	htab_bail
 
 htab_pte_insert_ok:
 	/* Insert slot number & secondary bit in PTE */
@@ -227,7 +243,7 @@ htab_write_out_pte:
 	ld	r6,STK_PARM(r6)(r1)
 	std	r30,0(r6)
 	li	r3, 0
-bail:
+htab_bail:
 	ld	r27,STK_REG(r27)(r1)
 	ld	r28,STK_REG(r28)(r1)
 	ld	r29,STK_REG(r29)(r1)
@@ -256,10 +272,10 @@ htab_modify_pte:
 
 	/* Call ppc_md.hpte_updatepp */
 	mr	r5,r29			/* va */
-	li	r6,0			/* large is 0 */
+	li	r6,MMU_PAGE_4K		/* page size */
 	ld	r7,STK_PARM(r8)(r1)	/* get "local" param */
 _GLOBAL(htab_call_hpte_updatepp)
-	bl	.			/* Will be patched by htab_finish_init() */
+	bl	.			/* Patched by htab_finish_init() */
 
 	/* if we failed because typically the HPTE wasn't really here
 	 * we try an insertion. 
@@ -276,13 +292,556 @@ htab_wrong_access:
 	/* Bail out clearing reservation */
 	stdcx.	r31,0,r6
 	li	r3,1
-	b	bail
+	b	htab_bail
+
+htab_pte_insert_failure:
+	/* Bail out restoring old PTE */
+	ld	r6,STK_PARM(r6)(r1)
+	std	r31,0(r6)
+	li	r3,-1
+	b	htab_bail
+
+
+#else /* CONFIG_PPC_64K_PAGES */
+
+
+/*****************************************************************************
+ *                                                                           *
+ *           64K SW & 4K or 64K HW in a 4K segment pages implementation      *
+ *                                                                           *
+ *****************************************************************************/
+
+/* _hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
+ *		 pte_t *ptep, unsigned long trap, int local)
+ */
+
+/*
+ * For now, we do NOT implement Admixed pages
+ */
+_GLOBAL(__hash_page_4K)
+	mflr	r0
+	std	r0,16(r1)
+	stdu	r1,-STACKFRAMESIZE(r1)
+	/* Save all params that we need after a function call */
+	std	r6,STK_PARM(r6)(r1)
+	std	r8,STK_PARM(r8)(r1)
+
+	/* Add _PAGE_PRESENT to access */
+	ori	r4,r4,_PAGE_PRESENT
+
+	/* Save non-volatile registers.
+	 * r31 will hold "old PTE"
+	 * r30 is "new PTE"
+	 * r29 is "va"
+	 * r28 is a hash value
+	 * r27 is hashtab mask (maybe dynamically patched instead?)
+	 * r26 is the hidx mask
+	 * r25 is the index in combo page
+	 */
+	std	r25,STK_REG(r25)(r1)
+	std	r26,STK_REG(r26)(r1)
+	std	r27,STK_REG(r27)(r1)
+	std	r28,STK_REG(r28)(r1)
+	std	r29,STK_REG(r29)(r1)
+	std	r30,STK_REG(r30)(r1)
+	std	r31,STK_REG(r31)(r1)
+
+	/* Step 1:
+	 *
+	 * Check permissions, atomically mark the linux PTE busy
+	 * and hashed.
+	 */
+1:
+	ldarx	r31,0,r6
+	/* Check access rights (access & ~(pte_val(*ptep))) */
+	andc.	r0,r4,r31
+	bne-	htab_wrong_access
+	/* Check if PTE is busy */
+	andi.	r0,r31,_PAGE_BUSY
+	/* If so, just bail out and refault if needed. Someone else
+	 * is changing this PTE anyway and might hash it.
+	 */
+	bne-	htab_bail_ok
+	/* Prepare new PTE value (turn access RW into DIRTY, then
+	 * add BUSY and ACCESSED)
+	 */
+	rlwinm	r30,r4,32-9+7,31-7,31-7	/* _PAGE_RW -> _PAGE_DIRTY */
+	or	r30,r30,r31
+	ori	r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE
+	/* Write the linux PTE atomically (setting busy) */
+	stdcx.	r30,0,r6
+	bne-	1b
+	isync
+
+	/* Step 2:
+	 *
+	 * Insert/Update the HPTE in the hash table. At this point,
+	 * r4 (access) is re-usable; we use it for the new HPTE flags
+	 */
+
+	/* Load the hidx index */
+	rldicl	r25,r3,64-12,60
+
+	/* Calc va and put it in r29 */
+	rldicr	r29,r5,28,63-28		/* r29 = (vsid << 28) */
+	rldicl	r3,r3,0,36		/* r3 = (ea & 0x0fffffff) */
+	or	r29,r3,r29		/* r29 = va */
+
+	/* Calculate hash value for primary slot and store it in r28 */
+	rldicl	r5,r5,0,25		/* vsid & 0x0000007fffffffff */
+	rldicl	r0,r3,64-12,48		/* (ea >> 12) & 0xffff */
+	xor	r28,r5,r0
+
+	/* Convert linux PTE bits into HW equivalents */
+	andi.	r3,r30,0x1fe		/* Get basic set of flags */
+	xori	r3,r3,HPTE_R_N		/* _PAGE_EXEC -> NOEXEC */
+	rlwinm	r0,r30,32-9+1,30,30	/* _PAGE_RW -> _PAGE_USER (r0) */
+	rlwinm	r4,r30,32-7+1,30,30	/* _PAGE_DIRTY -> _PAGE_USER (r4) */
+	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY -> r0 bit 30 */
+	andc	r0,r30,r0		/* r0 = pte & ~r0 */
+	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
+
+	/* We do the icache sync here if needed (maybe inline that
+	 * code rather than call a C function...)
+	 */
+BEGIN_FTR_SECTION
+	mr	r4,r30
+	mr	r5,r7
+	bl	.hash_page_do_lazy_icache
+END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
+
+	/* At this point, r3 contains new PP bits, save them in
+	 * place of "access" in the param area (sic)
+	 */
+	std	r3,STK_PARM(r4)(r1)
+
+	/* Get htab_hash_mask */
+	ld	r4,htab_hash_mask@got(2)
+	ld	r27,0(r4)	/* htab_hash_mask -> r27 */
+
+	/* Check if we may already be in the hashtable; if so, we go to
+	 * out-of-line code to try to modify the HPTE. We look for
+	 * the bit at (1 >> (index + 32))
+	 */
+	andi.	r0,r31,_PAGE_HASHPTE
+	li	r26,0			/* Default hidx */
+	beq	htab_insert_pte
+	ld	r6,STK_PARM(r6)(r1)
+	ori	r26,r6,0x8000		/* Load the hidx mask */
+	ld	r26,0(r26)
+	addi	r5,r25,36		/* Check actual HPTE_SUB bit, this */
+	rldcr.	r0,r31,r5,0		/* must match pgtable.h definition */
+	bne	htab_modify_pte
+
+htab_insert_pte:
+	/* real page number in r5, PTE RPN value + index */
+	rldicl	r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
+	sldi	r5,r5,PAGE_SHIFT-HW_PAGE_SHIFT
+	add	r5,r5,r25
+	sldi	r5,r5,HW_PAGE_SHIFT
+
+	/* Calculate primary group hash */
+	and	r0,r28,r27
+	rldicr	r3,r0,3,63-3		/* r0 = (hash & mask) << 3 */
+
+	/* Call ppc_md.hpte_insert */
+	ld	r6,STK_PARM(r4)(r1)	/* Retrieve new pp bits */
+	mr	r4,r29			/* Retrieve va */
+	li	r7,0			/* !bolted, !secondary */
+	li	r8,MMU_PAGE_4K		/* page size */
+_GLOBAL(htab_call_hpte_insert1)
+	bl	.			/* patched by htab_finish_init() */
+	cmpdi	0,r3,0
+	bge	htab_pte_insert_ok	/* Insertion successful */
+	cmpdi	0,r3,-2			/* Critical failure */
+	beq-	htab_pte_insert_failure
+
+	/* Now try secondary slot */
+
+	/* real page number in r5, PTE RPN value + index */
+	rldicl	r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
+	sldi	r5,r5,PAGE_SHIFT-HW_PAGE_SHIFT
+	add	r5,r5,r25
+	sldi	r5,r5,HW_PAGE_SHIFT
+
+	/* Calculate secondary group hash */
+	andc	r0,r27,r28
+	rldicr	r3,r0,3,63-3		/* r0 = (~hash & mask) << 3 */
+
+	/* Call ppc_md.hpte_insert */
+	ld	r6,STK_PARM(r4)(r1)	/* Retrieve new pp bits */
+	mr	r4,r29			/* Retrieve va */
+	li	r7,HPTE_V_SECONDARY	/* !bolted, secondary */
+	li	r8,MMU_PAGE_4K		/* page size */
+_GLOBAL(htab_call_hpte_insert2)
+	bl	.			/* patched by htab_finish_init() */
+	cmpdi	0,r3,0
+	bge+	htab_pte_insert_ok	/* Insertion successful */
+	cmpdi	0,r3,-2			/* Critical failure */
+	beq-	htab_pte_insert_failure
+
+	/* Both are full, we need to evict something */
+	mftb	r0
+	/* Pick a random group based on TB */
+	andi.	r0,r0,1
+	mr	r5,r28
+	bne	2f
+	not	r5,r5
+2:	and	r0,r5,r27
+	rldicr	r3,r0,3,63-3		/* r0 = (hash & mask) << 3 */
+	/* Call ppc_md.hpte_remove */
+_GLOBAL(htab_call_hpte_remove)
+	bl	.			/* patched by htab_finish_init() */
+
+	/* Try all again */
+	b	htab_insert_pte
+
+htab_bail_ok:
+	li	r3,0
+	b	htab_bail
+
+htab_pte_insert_ok:
+	/* Insert slot number & secondary bit in PTE second half,
+	 * clear _PAGE_BUSY and set appropriate HPTE slot bit
+	 */
+	ld	r6,STK_PARM(r6)(r1)
+	li	r0,_PAGE_BUSY
+	andc	r30,r30,r0
+	/* HPTE SUB bit */
+	li	r0,1
+	subfic	r5,r25,27		/* Must match bit position in */
+	sld	r0,r0,r5		/* pgtable.h */
+	or	r30,r30,r0
+	/* hindx */
+	sldi	r5,r25,2
+	sld	r3,r3,r5
+	li	r4,0xf
+	sld	r4,r4,r5
+	andc	r26,r26,r4
+	or	r26,r26,r3
+	ori	r5,r6,0x8000
+	std	r26,0(r5)
+	lwsync
+	std	r30,0(r6)
+	li	r3, 0
+htab_bail:
+	ld	r25,STK_REG(r25)(r1)
+	ld	r26,STK_REG(r26)(r1)
+	ld	r27,STK_REG(r27)(r1)
+	ld	r28,STK_REG(r28)(r1)
+	ld	r29,STK_REG(r29)(r1)
+	ld      r30,STK_REG(r30)(r1)
+	ld      r31,STK_REG(r31)(r1)
+	addi    r1,r1,STACKFRAMESIZE
+	ld      r0,16(r1)
+	mtlr    r0
+	blr
+
+htab_modify_pte:
+	/* Keep PP bits in r4 and slot idx from the PTE around in r3 */
+	mr	r4,r3
+	sldi	r5,r25,2
+	srd	r3,r26,r5
+
+	/* Secondary group? If yes, get an inverted hash value */
+	mr	r5,r28
+	andi.	r0,r3,0x8 /* page secondary ? */
+	beq	1f
+	not	r5,r5
+1:	andi.	r3,r3,0x7 /* extract idx alone */
+
+	/* Calculate proper slot value for ppc_md.hpte_updatepp */
+	and	r0,r5,r27
+	rldicr	r0,r0,3,63-3	/* r0 = (hash & mask) << 3 */
+	add	r3,r0,r3	/* add slot idx */
+
+	/* Call ppc_md.hpte_updatepp */
+	mr	r5,r29			/* va */
+	li	r6,MMU_PAGE_4K		/* page size */
+	ld	r7,STK_PARM(r8)(r1)	/* get "local" param */
+_GLOBAL(htab_call_hpte_updatepp)
+	bl	.			/* patched by htab_finish_init() */
+
+	/* if we failed because typically the HPTE wasn't really here
+	 * we try an insertion.
+	 */
+	cmpdi	0,r3,-1
+	beq-	htab_insert_pte
+
+	/* Clear the BUSY bit and Write out the PTE */
+	li	r0,_PAGE_BUSY
+	andc	r30,r30,r0
+	ld	r6,STK_PARM(r6)(r1)
+	std	r30,0(r6)
+	li	r3,0
+	b	htab_bail
+
+htab_wrong_access:
+	/* Bail out clearing reservation */
+	stdcx.	r31,0,r6
+	li	r3,1
+	b	htab_bail
 
 htab_pte_insert_failure:
 	/* Bail out restoring old PTE */
 	ld	r6,STK_PARM(r6)(r1)
 	std	r31,0(r6)
 	li	r3,-1
-	b	bail
+	b	htab_bail
+
+
+/*****************************************************************************
+ *                                                                           *
+ *           64K SW & 64K HW in a 64K segment pages implementation           *
+ *                                                                           *
+ *****************************************************************************/
+
+_GLOBAL(__hash_page_64K)
+	mflr	r0
+	std	r0,16(r1)
+	stdu	r1,-STACKFRAMESIZE(r1)
+	/* Save all params that we need after a function call */
+	std	r6,STK_PARM(r6)(r1)
+	std	r8,STK_PARM(r8)(r1)
+
+	/* Add _PAGE_PRESENT to access */
+	ori	r4,r4,_PAGE_PRESENT
+
+	/* Save non-volatile registers.
+	 * r31 will hold "old PTE"
+	 * r30 is "new PTE"
+	 * r29 is "va"
+	 * r28 is a hash value
+	 * r27 is hashtab mask (maybe dynamically patched instead?)
+	 */
+	std	r27,STK_REG(r27)(r1)
+	std	r28,STK_REG(r28)(r1)
+	std	r29,STK_REG(r29)(r1)
+	std	r30,STK_REG(r30)(r1)
+	std	r31,STK_REG(r31)(r1)
+
+	/* Step 1:
+	 *
+	 * Check permissions, atomically mark the linux PTE busy
+	 * and hashed.
+	 */
+1:
+	ldarx	r31,0,r6
+	/* Check access rights (access & ~(pte_val(*ptep))) */
+	andc.	r0,r4,r31
+	bne-	ht64_wrong_access
+	/* Check if PTE is busy */
+	andi.	r0,r31,_PAGE_BUSY
+	/* If so, just bail out and refault if needed. Someone else
+	 * is changing this PTE anyway and might hash it.
+	 */
+	bne-	ht64_bail_ok
+	/* Prepare new PTE value (turn access RW into DIRTY, then
+	 * add BUSY,HASHPTE and ACCESSED)
+	 */
+	rlwinm	r30,r4,32-9+7,31-7,31-7	/* _PAGE_RW -> _PAGE_DIRTY */
+	or	r30,r30,r31
+	ori	r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE
+	/* Write the linux PTE atomically (setting busy) */
+	stdcx.	r30,0,r6
+	bne-	1b
+	isync
+
+	/* Step 2:
+	 *
+	 * Insert/Update the HPTE in the hash table. At this point,
+	 * r4 (access) is re-usable; we use it for the new HPTE flags
+	 */
+
+	/* Calc va and put it in r29 */
+	rldicr	r29,r5,28,63-28
+	rldicl	r3,r3,0,36
+	or	r29,r3,r29
+
+	/* Calculate hash value for primary slot and store it in r28 */
+	rldicl	r5,r5,0,25		/* vsid & 0x0000007fffffffff */
+	rldicl	r0,r3,64-16,52		/* (ea >> 16) & 0xfff */
+	xor	r28,r5,r0
+
+	/* Convert linux PTE bits into HW equivalents */
+	andi.	r3,r30,0x1fe		/* Get basic set of flags */
+	xori	r3,r3,HPTE_R_N		/* _PAGE_EXEC -> NOEXEC */
+	rlwinm	r0,r30,32-9+1,30,30	/* _PAGE_RW -> _PAGE_USER (r0) */
+	rlwinm	r4,r30,32-7+1,30,30	/* _PAGE_DIRTY -> _PAGE_USER (r4) */
+	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY -> r0 bit 30 */
+	andc	r0,r30,r0		/* r0 = pte & ~r0 */
+	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
+
+	/* We do the icache sync here if needed (maybe inline that
+	 * code rather than call a C function...)
+	 */
+BEGIN_FTR_SECTION
+	mr	r4,r30
+	mr	r5,r7
+	bl	.hash_page_do_lazy_icache
+END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
+
+	/* At this point, r3 contains new PP bits, save them in
+	 * place of "access" in the param area (sic)
+	 */
+	std	r3,STK_PARM(r4)(r1)
+
+	/* Get htab_hash_mask */
+	ld	r4,htab_hash_mask@got(2)
+	ld	r27,0(r4)	/* htab_hash_mask -> r27 */
+
+	/* Check if we may already be in the hashtable; if so, we go to
+	 * out-of-line code to try to modify the HPTE
+	 */
+	andi.	r0,r31,_PAGE_HASHPTE
+	bne	ht64_modify_pte
+
+ht64_insert_pte:
+	/* Clear hpte bits in new pte (we also clear BUSY btw) and
+	 * add _PAGE_HASHPTE
+	 */
+	lis	r0,_PAGE_HPTEFLAGS@h
+	ori	r0,r0,_PAGE_HPTEFLAGS@l
+	andc	r30,r30,r0
+	ori	r30,r30,_PAGE_HASHPTE
+
+	/* Physical address in r5 */
+	rldicl	r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
+	sldi	r5,r5,PAGE_SHIFT
+
+	/* Calculate primary group hash */
+	and	r0,r28,r27
+	rldicr	r3,r0,3,63-3	/* r0 = (hash & mask) << 3 */
+
+	/* Call ppc_md.hpte_insert */
+	ld	r6,STK_PARM(r4)(r1)	/* Retrieve new pp bits */
+	mr	r4,r29			/* Retrieve va */
+	li	r7,0			/* !bolted, !secondary */
+	li	r8,MMU_PAGE_64K
+_GLOBAL(ht64_call_hpte_insert1)
+	bl	.			/* patched by htab_finish_init() */
+	cmpdi	0,r3,0
+	bge	ht64_pte_insert_ok	/* Insertion successful */
+	cmpdi	0,r3,-2			/* Critical failure */
+	beq-	ht64_pte_insert_failure
+
+	/* Now try secondary slot */
+
+	/* Physical address in r5 */
+	rldicl	r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
+	sldi	r5,r5,PAGE_SHIFT
+
+	/* Calculate secondary group hash */
+	andc	r0,r27,r28
+	rldicr	r3,r0,3,63-3	/* r0 = (~hash & mask) << 3 */
+
+	/* Call ppc_md.hpte_insert */
+	ld	r6,STK_PARM(r4)(r1)	/* Retrieve new pp bits */
+	mr	r4,r29			/* Retrieve va */
+	li	r7,HPTE_V_SECONDARY	/* !bolted, secondary */
+	li	r8,MMU_PAGE_64K
+_GLOBAL(ht64_call_hpte_insert2)
+	bl	.			/* patched by htab_finish_init() */
+	cmpdi	0,r3,0
+	bge+	ht64_pte_insert_ok	/* Insertion successful */
+	cmpdi	0,r3,-2			/* Critical failure */
+	beq-	ht64_pte_insert_failure
+
+	/* Both are full, we need to evict something */
+	mftb	r0
+	/* Pick a random group based on TB */
+	andi.	r0,r0,1
+	mr	r5,r28
+	bne	2f
+	not	r5,r5
+2:	and	r0,r5,r27
+	rldicr	r3,r0,3,63-3	/* r0 = (hash & mask) << 3 */
+	/* Call ppc_md.hpte_remove */
+_GLOBAL(ht64_call_hpte_remove)
+	bl	.			/* patched by htab_finish_init() */
+
+	/* Try all again */
+	b	ht64_insert_pte
+
+ht64_bail_ok:
+	li	r3,0
+	b	ht64_bail
+
+ht64_pte_insert_ok:
+	/* Insert slot number & secondary bit in PTE */
+	rldimi	r30,r3,12,63-15
+
+	/* Write out the PTE with a normal write
+	 * (maybe adding an eieio would still be good?)
+	 */
+ht64_write_out_pte:
+	ld	r6,STK_PARM(r6)(r1)
+	std	r30,0(r6)
+	li	r3, 0
+ht64_bail:
+	ld	r27,STK_REG(r27)(r1)
+	ld	r28,STK_REG(r28)(r1)
+	ld	r29,STK_REG(r29)(r1)
+	ld      r30,STK_REG(r30)(r1)
+	ld      r31,STK_REG(r31)(r1)
+	addi    r1,r1,STACKFRAMESIZE
+	ld      r0,16(r1)
+	mtlr    r0
+	blr
+
+ht64_modify_pte:
+	/* Keep PP bits in r4 and slot idx from the PTE around in r3 */
+	mr	r4,r3
+	rlwinm	r3,r31,32-12,29,31
+
+	/* Secondary group? If yes, get an inverted hash value */
+	mr	r5,r28
+	andi.	r0,r31,_PAGE_F_SECOND
+	beq	1f
+	not	r5,r5
+1:
+	/* Calculate proper slot value for ppc_md.hpte_updatepp */
+	and	r0,r5,r27
+	rldicr	r0,r0,3,63-3	/* r0 = (hash & mask) << 3 */
+	add	r3,r0,r3	/* add slot idx */
+
+	/* Call ppc_md.hpte_updatepp */
+	mr	r5,r29			/* va */
+	li	r6,MMU_PAGE_64K
+	ld	r7,STK_PARM(r8)(r1)	/* get "local" param */
+_GLOBAL(ht64_call_hpte_updatepp)
+	bl	.			/* patched by htab_finish_init() */
+
+	/* if we failed because typically the HPTE wasn't really here
+	 * we try an insertion.
+	 */
+	cmpdi	0,r3,-1
+	beq-	ht64_insert_pte
+
+	/* Clear the BUSY bit and Write out the PTE */
+	li	r0,_PAGE_BUSY
+	andc	r30,r30,r0
+	b	ht64_write_out_pte
+
+ht64_wrong_access:
+	/* Bail out clearing reservation */
+	stdcx.	r31,0,r6
+	li	r3,1
+	b	ht64_bail
+
+ht64_pte_insert_failure:
+	/* Bail out restoring old PTE */
+	ld	r6,STK_PARM(r6)(r1)
+	std	r31,0(r6)
+	li	r3,-1
+	b	ht64_bail
+
+
+#endif /* CONFIG_PPC_64K_PAGES */
 
 
+/*****************************************************************************
+ *                                                                           *
+ *           Huge pages implementation is in hugetlbpage.c                   *
+ *                                                                           *
+ *****************************************************************************/
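The ori r26,r6,0x8000 / sldi r5,r25,2 sequences in the 4K-in-64K path above
reach a companion word stored 32k past the linux PTE, holding one 4-bit hidx
(secondary bit plus HPTE group index) per 4k subpage. A hedged C rendering
of that lookup, matching the shifts in the assembler:

	/* sketch: ptep is the 64k linux PTE, index selects the 4k subpage */
	static inline unsigned long rpte_to_hidx(unsigned long *ptep,
						 unsigned long index)
	{
		unsigned long *hidxp =
			(unsigned long *)((unsigned long)ptep | 0x8000);

		return (*hidxp >> (index << 2)) & 0xf;	/* 4 bits per subpage */
	}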
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 174d14576c2816658bb3db98a465432ec532074a..d96bcfe4c6f6c2ed38f6c55afeb7e7809c6bd37c 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -9,6 +9,9 @@
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */
+
+#undef DEBUG_LOW
+
 #include <linux/spinlock.h>
 #include <linux/bitops.h>
 #include <linux/threads.h>
@@ -22,11 +25,84 @@
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include <asm/cputable.h>
+#include <asm/udbg.h>
+
+#ifdef DEBUG_LOW
+#define DBG_LOW(fmt...) udbg_printf(fmt)
+#else
+#define DBG_LOW(fmt...)
+#endif
 
 #define HPTE_LOCK_BIT 3
 
 static DEFINE_SPINLOCK(native_tlbie_lock);
 
+static inline void __tlbie(unsigned long va, unsigned int psize)
+{
+	unsigned int penc;
+
+	/* clear top 16 bits, non SLS segment */
+	va &= ~(0xffffULL << 48);
+
+	switch (psize) {
+	case MMU_PAGE_4K:
+		va &= ~0xffful;
+		asm volatile("tlbie %0,0" : : "r" (va) : "memory");
+		break;
+	default:
+		penc = mmu_psize_defs[psize].penc;
+		va &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
+		va |= (0x7f >> (8 - penc)) << 12;
+		asm volatile("tlbie %0,1" : : "r" (va) : "memory");
+		break;
+	}
+}
+
+static inline void __tlbiel(unsigned long va, unsigned int psize)
+{
+	unsigned int penc;
+
+	/* clear top 16 bits, non SLS segment */
+	va &= ~(0xffffULL << 48);
+
+	switch (psize) {
+	case MMU_PAGE_4K:
+		va &= ~0xffful;
+		asm volatile(".long 0x7c000224 | (%0 << 11) | (0 << 21)"
+			     : : "r"(va) : "memory");
+		break;
+	default:
+		penc = mmu_psize_defs[psize].penc;
+		va &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
+		va |= (0x7f >> (8 - penc)) << 12;
+		asm volatile(".long 0x7c000224 | (%0 << 11) | (1 << 21)"
+			     : : "r"(va) : "memory");
+		break;
+	}
+
+}
+
+static inline void tlbie(unsigned long va, int psize, int local)
+{
+	unsigned int use_local = local && cpu_has_feature(CPU_FTR_TLBIEL);
+	int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
+
+	if (use_local)
+		use_local = mmu_psize_defs[psize].tlbiel;
+	if (lock_tlbie && !use_local)
+		spin_lock(&native_tlbie_lock);
+	asm volatile("ptesync": : :"memory");
+	if (use_local) {
+		__tlbiel(va, psize);
+		asm volatile("ptesync": : :"memory");
+	} else {
+		__tlbie(va, psize);
+		asm volatile("eieio; tlbsync; ptesync": : :"memory");
+	}
+	if (lock_tlbie && !use_local)
+		spin_unlock(&native_tlbie_lock);
+}
+
 static inline void native_lock_hpte(hpte_t *hptep)
 {
 	unsigned long *word = &hptep->v;
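The tlbie()/tlbiel() wrappers above fold the old "large" boolean into a
page-size argument and centralize the lock-vs-tlbiel policy that used to be
open-coded at every call site. Typical uses (illustrative):

	tlbie(va, MMU_PAGE_4K, 0);	/* global invalidation, 4k page */
	tlbie(va, MMU_PAGE_64K, 1);	/* local; uses tlbiel only if the
					 * CPU supports it for 64k pages */

The .long 0x7c000224 in __tlbiel() is a hand-assembled tlbiel, presumably
because contemporary assemblers don't know the instruction yet.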
@@ -48,13 +124,19 @@ static inline void native_unlock_hpte(hpte_t *hptep)
 }
 
 long native_hpte_insert(unsigned long hpte_group, unsigned long va,
-			unsigned long prpn, unsigned long vflags,
-			unsigned long rflags)
+			unsigned long pa, unsigned long rflags,
+			unsigned long vflags, int psize)
 {
 	hpte_t *hptep = htab_address + hpte_group;
 	unsigned long hpte_v, hpte_r;
 	int i;
 
+	if (!(vflags & HPTE_V_BOLTED)) {
+		DBG_LOW("    insert(group=%lx, va=%016lx, pa=%016lx,"
+			" rflags=%lx, vflags=%lx, psize=%d)\n",
+			hpte_group, va, pa, rflags, vflags, psize);
+	}
+
 	for (i = 0; i < HPTES_PER_GROUP; i++) {
 		if (! (hptep->v & HPTE_V_VALID)) {
 			/* retry with lock held */
@@ -70,10 +152,13 @@ long native_hpte_insert(unsigned long hpte_group, unsigned long va,
 	if (i == HPTES_PER_GROUP)
 		return -1;
 
-	hpte_v = (va >> 23) << HPTE_V_AVPN_SHIFT | vflags | HPTE_V_VALID;
-	if (vflags & HPTE_V_LARGE)
-		va &= ~(1UL << HPTE_V_AVPN_SHIFT);
-	hpte_r = (prpn << HPTE_R_RPN_SHIFT) | rflags;
+	hpte_v = hpte_encode_v(va, psize) | vflags | HPTE_V_VALID;
+	hpte_r = hpte_encode_r(pa, psize) | rflags;
+
+	if (!(vflags & HPTE_V_BOLTED)) {
+		DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n",
+			i, hpte_v, hpte_r);
+	}
 
 	hptep->r = hpte_r;
 	/* Guarantee the second dword is visible before the valid bit */
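hpte_encode_v()/hpte_encode_r() and HPTE_V_COMPARE() replace the open-coded
AVPN/RPN math visible in the removed lines here and below. A simplified
sketch of the V-word side (the real encode helper also applies the per-size
avpnm mask and folds the LP bits into the low AVPN bits; the compare mask is
assumed from the AVPN field position):

	/* sketch only -- compare with the removed lines */
	static inline unsigned long hpte_encode_v(unsigned long va, int psize)
	{
		unsigned long v = (va >> 23) << HPTE_V_AVPN_SHIFT;

		if (psize != MMU_PAGE_4K)
			v |= HPTE_V_LARGE;
		return v;
	}

	/* compare two V-words, ignoring the low valid/secondary/etc. bits */
	#define HPTE_V_COMPARE(x, y)	(!(((x) ^ (y)) & 0xffffffffffffff80UL))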
@@ -96,6 +181,8 @@ static long native_hpte_remove(unsigned long hpte_group)
 	int slot_offset;
 	unsigned long hpte_v;
 
+	DBG_LOW("    remove(group=%lx)\n", hpte_group);
+
 	/* pick a random entry to start at */
 	slot_offset = mftb() & 0x7;
 
@@ -126,34 +213,51 @@ static long native_hpte_remove(unsigned long hpte_group)
 	return i;
 }
 
-static inline void set_pp_bit(unsigned long pp, hpte_t *addr)
+static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
+				 unsigned long va, int psize, int local)
 {
-	unsigned long old;
-	unsigned long *p = &addr->r;
-
-	__asm__ __volatile__(
-	"1:	ldarx	%0,0,%3\n\
-		rldimi	%0,%2,0,61\n\
-		stdcx.	%0,0,%3\n\
-		bne	1b"
-	: "=&r" (old), "=m" (*p)
-	: "r" (pp), "r" (p), "m" (*p)
-	: "cc");
+	hpte_t *hptep = htab_address + slot;
+	unsigned long hpte_v, want_v;
+	int ret = 0;
+
+	want_v = hpte_encode_v(va, psize);
+
+	DBG_LOW("    update(va=%016lx, avpnv=%016lx, hash=%016lx, newpp=%x)",
+		va, want_v & HPTE_V_AVPN, slot, newpp);
+
+	native_lock_hpte(hptep);
+
+	hpte_v = hptep->v;
+
+	/* Even if we miss, we need to invalidate the TLB */
+	if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) {
+		DBG_LOW(" -> miss\n");
+		native_unlock_hpte(hptep);
+		ret = -1;
+	} else {
+		DBG_LOW(" -> hit\n");
+		/* Update the HPTE */
+		hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
+			(newpp & (HPTE_R_PP | HPTE_R_N));
+		native_unlock_hpte(hptep);
+	}
+
+	/* Ensure it is out of the tlb too. */
+	tlbie(va, psize, local);
+
+	return ret;
 }
 
-/*
- * Only works on small pages. Yes its ugly to have to check each slot in
- * the group but we only use this during bootup.
- */
-static long native_hpte_find(unsigned long vpn)
+static long native_hpte_find(unsigned long va, int psize)
 {
 	hpte_t *hptep;
 	unsigned long hash;
 	unsigned long i, j;
 	long slot;
-	unsigned long hpte_v;
+	unsigned long want_v, hpte_v;
 
-	hash = hpt_hash(vpn, 0);
+	hash = hpt_hash(va, mmu_psize_defs[psize].shift);
+	want_v = hpte_encode_v(va, psize);
 
 	for (j = 0; j < 2; j++) {
 		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
@@ -161,7 +265,7 @@ static long native_hpte_find(unsigned long vpn)
 			hptep = htab_address + slot;
 			hpte_v = hptep->v;
 
-			if ((HPTE_V_AVPN_VAL(hpte_v) == (vpn >> 11))
+			if (HPTE_V_COMPARE(hpte_v, want_v)
 			    && (hpte_v & HPTE_V_VALID)
 			    && ( !!(hpte_v & HPTE_V_SECONDARY) == j)) {
 				/* HPTE matches */
@@ -177,120 +281,92 @@ static long native_hpte_find(unsigned long vpn)
 	return -1;
 }
 
-static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
-				 unsigned long va, int large, int local)
-{
-	hpte_t *hptep = htab_address + slot;
-	unsigned long hpte_v;
-	unsigned long avpn = va >> 23;
-	int ret = 0;
-
-	if (large)
-		avpn &= ~1;
-
-	native_lock_hpte(hptep);
-
-	hpte_v = hptep->v;
-
-	/* Even if we miss, we need to invalidate the TLB */
-	if ((HPTE_V_AVPN_VAL(hpte_v) != avpn)
-	    || !(hpte_v & HPTE_V_VALID)) {
-		native_unlock_hpte(hptep);
-		ret = -1;
-	} else {
-		set_pp_bit(newpp, hptep);
-		native_unlock_hpte(hptep);
-	}
-
-	/* Ensure it is out of the tlb too */
-	if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
-		tlbiel(va);
-	} else {
-		int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
-
-		if (lock_tlbie)
-			spin_lock(&native_tlbie_lock);
-		tlbie(va, large);
-		if (lock_tlbie)
-			spin_unlock(&native_tlbie_lock);
-	}
-
-	return ret;
-}
-
 /*
  * Update the page protection bits. Intended to be used to create
  * guard pages for kernel data structures on pages which are bolted
  * in the HPT. Assumes pages being operated on will not be stolen.
- * Does not work on large pages.
  *
  * No need to lock here because we should be the only user.
  */
-static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea)
+static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
+				       int psize)
 {
-	unsigned long vsid, va, vpn, flags = 0;
+	unsigned long vsid, va;
 	long slot;
 	hpte_t *hptep;
-	int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
 
 	vsid = get_kernel_vsid(ea);
 	va = (vsid << 28) | (ea & 0x0fffffff);
-	vpn = va >> PAGE_SHIFT;
 
-	slot = native_hpte_find(vpn);
+	slot = native_hpte_find(va, psize);
 	if (slot == -1)
 		panic("could not find page to bolt\n");
 	hptep = htab_address + slot;
 
-	set_pp_bit(newpp, hptep);
+	/* Update the HPTE */
+	hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
+		(newpp & (HPTE_R_PP | HPTE_R_N));
 
-	/* Ensure it is out of the tlb too */
-	if (lock_tlbie)
-		spin_lock_irqsave(&native_tlbie_lock, flags);
-	tlbie(va, 0);
-	if (lock_tlbie)
-		spin_unlock_irqrestore(&native_tlbie_lock, flags);
+	/* Ensure it is out of the tlb too. */
+	tlbie(va, psize, 0);
 }
 
 static void native_hpte_invalidate(unsigned long slot, unsigned long va,
-				    int large, int local)
+				   int psize, int local)
 {
 	hpte_t *hptep = htab_address + slot;
 	unsigned long hpte_v;
-	unsigned long avpn = va >> 23;
+	unsigned long want_v;
 	unsigned long flags;
-	int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
-
-	if (large)
-		avpn &= ~1;
 
 	local_irq_save(flags);
-	native_lock_hpte(hptep);
 
+	DBG_LOW("    invalidate(va=%016lx, hash: %x)\n", va, slot);
+
+	want_v = hpte_encode_v(va, psize);
+	native_lock_hpte(hptep);
 	hpte_v = hptep->v;
 
 	/* Even if we miss, we need to invalidate the TLB */
-	if ((HPTE_V_AVPN_VAL(hpte_v) != avpn)
-	    || !(hpte_v & HPTE_V_VALID)) {
+	if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
 		native_unlock_hpte(hptep);
-	} else {
+	else
 		/* Invalidate the hpte. NOTE: this also unlocks it */
 		hptep->v = 0;
-	}
 
-	/* Invalidate the tlb */
-	if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
-		tlbiel(va);
-	} else {
-		if (lock_tlbie)
-			spin_lock(&native_tlbie_lock);
-		tlbie(va, large);
-		if (lock_tlbie)
-			spin_unlock(&native_tlbie_lock);
-	}
+	/* Invalidate the TLB */
+	tlbie(va, psize, local);
+
 	local_irq_restore(flags);
 }
 
+/*
+ * XXX This needs fixing based on page size. It's only used by
+ * native_hpte_clear() for now which needs fixing too so they
+ * make a good pair...
+ */
+static unsigned long slot2va(unsigned long hpte_v, unsigned long slot)
+{
+	unsigned long avpn = HPTE_V_AVPN_VAL(hpte_v);
+	unsigned long va;
+
+	va = avpn << 23;
+
+	if (! (hpte_v & HPTE_V_LARGE)) {
+		unsigned long vpi, pteg;
+
+		pteg = slot / HPTES_PER_GROUP;
+		if (hpte_v & HPTE_V_SECONDARY)
+			pteg = ~pteg;
+
+		vpi = ((va >> 28) ^ pteg) & htab_hash_mask;
+
+		va |= vpi << PAGE_SHIFT;
+	}
+
+	return va;
+}
+
 /*
  * clear all mappings on kexec.  All cpus are in real mode (or they will
  * be when they isi), and we are the only one left.  We rely on our kernel
@@ -298,6 +374,8 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long va,
  *
  * TODO: add batching support when enabled.  remember, no dynamic memory here,
  * although there is the control page available...
+ *
+ * XXX FIXME: 4k only for now !
  */
 static void native_hpte_clear(void)
 {
@@ -327,7 +405,7 @@ static void native_hpte_clear(void)
 
 		if (hpte_v & HPTE_V_VALID) {
 			hptep->v = 0;
-			tlbie(slot2va(hpte_v, slot), hpte_v & HPTE_V_LARGE);
+			tlbie(slot2va(hpte_v, slot), MMU_PAGE_4K, 0);
 		}
 	}
 
@@ -335,59 +413,59 @@ static void native_hpte_clear(void)
 	local_irq_restore(flags);
 }
 
+/*
+ * Batched hash table flush, we batch the tlbie's to avoid taking/releasing
+ * the lock all the time
+ */
 static void native_flush_hash_range(unsigned long number, int local)
 {
-	unsigned long va, vpn, hash, secondary, slot, flags, avpn;
-	int i, j;
+	unsigned long va, hash, index, hidx, shift, slot;
 	hpte_t *hptep;
 	unsigned long hpte_v;
+	unsigned long want_v;
+	unsigned long flags;
+	real_pte_t pte;
 	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
-	unsigned long large = batch->large;
+	unsigned long psize = batch->psize;
+	int i;
 
 	local_irq_save(flags);
 
-	j = 0;
 	for (i = 0; i < number; i++) {
-		va = batch->vaddr[j];
-		if (large)
-			vpn = va >> HPAGE_SHIFT;
-		else
-			vpn = va >> PAGE_SHIFT;
-		hash = hpt_hash(vpn, large);
-		secondary = (pte_val(batch->pte[i]) & _PAGE_SECONDARY) >> 15;
-		if (secondary)
-			hash = ~hash;
-		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-		slot += (pte_val(batch->pte[i]) & _PAGE_GROUP_IX) >> 12;
-
-		hptep = htab_address + slot;
-
-		avpn = va >> 23;
-		if (large)
-			avpn &= ~0x1UL;
-
-		native_lock_hpte(hptep);
-
-		hpte_v = hptep->v;
-
-		/* Even if we miss, we need to invalidate the TLB */
-		if ((HPTE_V_AVPN_VAL(hpte_v) != avpn)
-		    || !(hpte_v & HPTE_V_VALID)) {
-			native_unlock_hpte(hptep);
-		} else {
-			/* Invalidate the hpte. NOTE: this also unlocks it */
-			hptep->v = 0;
-		}
-
-		j++;
+		va = batch->vaddr[i];
+		pte = batch->pte[i];
+
+		pte_iterate_hashed_subpages(pte, psize, va, index, shift) {
+			hash = hpt_hash(va, shift);
+			hidx = __rpte_to_hidx(pte, index);
+			if (hidx & _PTEIDX_SECONDARY)
+				hash = ~hash;
+			slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+			slot += hidx & _PTEIDX_GROUP_IX;
+			hptep = htab_address + slot;
+			want_v = hpte_encode_v(va, psize);
+			native_lock_hpte(hptep);
+			hpte_v = hptep->v;
+			if (!HPTE_V_COMPARE(hpte_v, want_v) ||
+			    !(hpte_v & HPTE_V_VALID))
+				native_unlock_hpte(hptep);
+			else
+				hptep->v = 0;
+		} pte_iterate_hashed_end();
 	}
 
-	if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
+	if (cpu_has_feature(CPU_FTR_TLBIEL) &&
+	    mmu_psize_defs[psize].tlbiel && local) {
 		asm volatile("ptesync":::"memory");
-
-		for (i = 0; i < j; i++)
-			__tlbiel(batch->vaddr[i]);
-
+		for (i = 0; i < number; i++) {
+			va = batch->vaddr[i];
+			pte = batch->pte[i];
+
+			pte_iterate_hashed_subpages(pte, psize, va, index,
+						    shift) {
+				__tlbiel(va, psize);
+			} pte_iterate_hashed_end();
+		}
 		asm volatile("ptesync":::"memory");
 	} else {
 		int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
@@ -396,10 +474,15 @@ static void native_flush_hash_range(unsigned long number, int local)
 			spin_lock(&native_tlbie_lock);
 
 		asm volatile("ptesync":::"memory");
-
-		for (i = 0; i < j; i++)
-			__tlbie(batch->vaddr[i], large);
-
+		for (i = 0; i < number; i++) {
+			va = batch->vaddr[i];
+			pte = batch->pte[i];
+
+			pte_iterate_hashed_subpages(pte, psize, va, index,
+						    shift) {
+				__tlbie(va, psize);
+			} pte_iterate_hashed_end();
+		}
 		asm volatile("eieio; tlbsync; ptesync":::"memory");
 
 		if (lock_tlbie)
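
All of the native paths above derive a 64-bit "want_v" image from the
(va, psize) pair with hpte_encode_v() and match it against HPTE word 0
with HPTE_V_COMPARE(), instead of open-coding AVPN shifts per page
size. The stand-alone sketch below shows the shape of that idiom; the
shift, masks and avpnm values are assumptions modelled on this era's
asm/mmu.h, not verbatim kernel definitions.

	#include <stdio.h>

	#define HPTE_V_AVPN_SHIFT	7
	#define HPTE_V_AVPN		0xffffffffffffff80UL
	#define HPTE_V_LARGE		0x0000000000000004UL
	#define HPTE_V_VALID		0x0000000000000001UL
	#define HPTE_V_COMPARE(x, y)	(!(((x) ^ (y)) & HPTE_V_AVPN))

	enum { MMU_PAGE_4K, MMU_PAGE_16M, MMU_PAGE_COUNT };

	static const struct { unsigned int shift; unsigned long avpnm; }
	psize_defs[MMU_PAGE_COUNT] = {
		[MMU_PAGE_4K]	= { .shift = 12, .avpnm = 0x0UL },
		[MMU_PAGE_16M]	= { .shift = 24, .avpnm = 0x1UL },
	};

	/* Build the word-0 image expected in a matching HPTE: the AVPN
	 * is va >> 23 with the in-page bits masked off via avpnm, plus
	 * the L bit for anything bigger than 4K.
	 */
	static unsigned long hpte_encode_v(unsigned long va, int psize)
	{
		unsigned long v = (va >> 23) & ~psize_defs[psize].avpnm;

		v <<= HPTE_V_AVPN_SHIFT;
		if (psize != MMU_PAGE_4K)
			v |= HPTE_V_LARGE;
		return v;
	}

	int main(void)
	{
		unsigned long va = 0xc000000001234000UL;
		unsigned long want_v = hpte_encode_v(va, MMU_PAGE_16M);
		/* a fault 8M further into the same 16M page */
		unsigned long hpte_v =
			hpte_encode_v(va + 0x800000, MMU_PAGE_16M) |
			HPTE_V_VALID;

		printf("match: %d\n",
		       HPTE_V_COMPARE(hpte_v, want_v) &&
		       (hpte_v & HPTE_V_VALID) ? 1 : 0);
		return 0;
	}

The point of avpnm is that it masks off the low AVPN bits that fall
inside a large page, so a 16M mapping compares equal no matter which
4K-aligned address inside it took the fault.
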
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 6e9e05cce02c8448e207a782f3ebc4cdb53f6556..b2f3dbca695223fc988c1be9cc778243481eb221 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -19,6 +19,7 @@
  */
 
 #undef DEBUG
+#undef DEBUG_LOW
 
 #include <linux/config.h>
 #include <linux/spinlock.h>
@@ -59,6 +60,15 @@
 #define DBG(fmt...)
 #endif
 
+#ifdef DEBUG_LOW
+#define DBG_LOW(fmt...) udbg_printf(fmt)
+#else
+#define DBG_LOW(fmt...)
+#endif
+
+#define KB (1024)
+#define MB (1024*KB)
+
 /*
  * Note:  pte   --> Linux PTE
  *        HPTE  --> PowerPC Hashed Page Table Entry
@@ -77,91 +87,290 @@ extern unsigned long dart_tablebase;
 
 hpte_t *htab_address;
 unsigned long htab_hash_mask;
-
 unsigned long _SDR1;
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
+int mmu_linear_psize = MMU_PAGE_4K;
+int mmu_virtual_psize = MMU_PAGE_4K;
+#ifdef CONFIG_HUGETLB_PAGE
+int mmu_huge_psize = MMU_PAGE_16M;
+unsigned int HPAGE_SHIFT;
+#endif
 
-#define KB (1024)
-#define MB (1024*KB)
-
-static inline void loop_forever(void)
-{
-	volatile unsigned long x = 1;
-	for(;x;x|=1)
-		;
-}
+/* These are the page size arrays to be used when none is provided
+ * by the firmware.
+ */
 
-static inline void create_pte_mapping(unsigned long start, unsigned long end,
-				      unsigned long mode, int large)
+/* Pre-POWER4 CPUs (4k pages only)
+ */
+struct mmu_psize_def mmu_psize_defaults_old[] = {
+	[MMU_PAGE_4K] = {
+		.shift	= 12,
+		.sllp	= 0,
+		.penc	= 0,
+		.avpnm	= 0,
+		.tlbiel = 0,
+	},
+};
+
+/* POWER4, GPUL, POWER5
+ *
+ * Support for 16Mb large pages
+ */
+struct mmu_psize_def mmu_psize_defaults_gp[] = {
+	[MMU_PAGE_4K] = {
+		.shift	= 12,
+		.sllp	= 0,
+		.penc	= 0,
+		.avpnm	= 0,
+		.tlbiel = 1,
+	},
+	[MMU_PAGE_16M] = {
+		.shift	= 24,
+		.sllp	= SLB_VSID_L,
+		.penc	= 0,
+		.avpnm	= 0x1UL,
+		.tlbiel = 0,
+	},
+};
+
+
+int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
+		      unsigned long pstart, unsigned long mode, int psize)
 {
-	unsigned long addr;
-	unsigned int step;
+	unsigned long vaddr, paddr;
+	unsigned int step, shift;
 	unsigned long tmp_mode;
-	unsigned long vflags;
+	int ret = 0;
 
-	if (large) {
-		step = 16*MB;
-		vflags = HPTE_V_BOLTED | HPTE_V_LARGE;
-	} else {
-		step = 4*KB;
-		vflags = HPTE_V_BOLTED;
-	}
+	shift = mmu_psize_defs[psize].shift;
+	step = 1 << shift;
 
-	for (addr = start; addr < end; addr += step) {
+	for (vaddr = vstart, paddr = pstart; vaddr < vend;
+	     vaddr += step, paddr += step) {
 		unsigned long vpn, hash, hpteg;
-		unsigned long vsid = get_kernel_vsid(addr);
-		unsigned long va = (vsid << 28) | (addr & 0xfffffff);
-		int ret = -1;
-
-		if (large)
-			vpn = va >> HPAGE_SHIFT;
-		else
-			vpn = va >> PAGE_SHIFT;
-
+		unsigned long vsid = get_kernel_vsid(vaddr);
+		unsigned long va = (vsid << 28) | (vaddr & 0x0fffffff);
 
+		vpn = va >> shift;
 		tmp_mode = mode;
 		
 		/* Make non-kernel text non-executable */
-		if (!in_kernel_text(addr))
-			tmp_mode = mode | HW_NO_EXEC;
-
-		hash = hpt_hash(vpn, large);
+		if (!in_kernel_text(vaddr))
+			tmp_mode = mode | HPTE_R_N;
 
+		hash = hpt_hash(va, shift);
 		hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
 
+		/* The crap below can be cleaned once ppc_md.probe() can
+		 * set up the hash callbacks, thus we can just use the
+		 * normal insert callback here.
+		 */
 #ifdef CONFIG_PPC_ISERIES
-		if (systemcfg->platform & PLATFORM_ISERIES_LPAR)
-			ret = iSeries_hpte_bolt_or_insert(hpteg, va,
-				virt_to_abs(addr) >> PAGE_SHIFT,
-				vflags, tmp_mode);
+		if (systemcfg->platform == PLATFORM_ISERIES_LPAR)
+			ret = iSeries_hpte_insert(hpteg, va,
+						  virt_to_abs(paddr),
+						  tmp_mode,
+						  HPTE_V_BOLTED,
+						  psize);
 		else
 #endif
 #ifdef CONFIG_PPC_PSERIES
 		if (systemcfg->platform & PLATFORM_LPAR)
 			ret = pSeries_lpar_hpte_insert(hpteg, va,
-				virt_to_abs(addr) >> PAGE_SHIFT,
-				vflags, tmp_mode);
+						       virt_to_abs(paddr),
+						       tmp_mode,
+						       HPTE_V_BOLTED,
+						       psize);
 		else
 #endif
 #ifdef CONFIG_PPC_MULTIPLATFORM
 			ret = native_hpte_insert(hpteg, va,
-				virt_to_abs(addr) >> PAGE_SHIFT,
-				vflags, tmp_mode);
+						 virt_to_abs(paddr),
+						 tmp_mode, HPTE_V_BOLTED,
+						 psize);
 #endif
+		if (ret < 0)
+			break;
+	}
+	return ret < 0 ? ret : 0;
+}
 
-		if (ret == -1) {
-			ppc64_terminate_msg(0x20, "create_pte_mapping");
-			loop_forever();
+static int __init htab_dt_scan_page_sizes(unsigned long node,
+					  const char *uname, int depth,
+					  void *data)
+{
+	char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+	u32 *prop;
+	unsigned long size = 0;
+
+	/* We are scanning "cpu" nodes only */
+	if (type == NULL || strcmp(type, "cpu") != 0)
+		return 0;
+
+	prop = (u32 *)of_get_flat_dt_prop(node,
+					  "ibm,segment-page-sizes", &size);
+	if (prop != NULL) {
+		DBG("Page sizes from device-tree:\n");
+		size /= 4;
+		cur_cpu_spec->cpu_features &= ~(CPU_FTR_16M_PAGE);
+		while(size > 0) {
+			unsigned int shift = prop[0];
+			unsigned int slbenc = prop[1];
+			unsigned int lpnum = prop[2];
+			unsigned int lpenc = 0;
+			struct mmu_psize_def *def;
+			int idx = -1;
+
+			size -= 3; prop += 3;
+			while(size > 0 && lpnum) {
+				if (prop[0] == shift)
+					lpenc = prop[1];
+				prop += 2; size -= 2;
+				lpnum--;
+			}
+			switch(shift) {
+			case 0xc:
+				idx = MMU_PAGE_4K;
+				break;
+			case 0x10:
+				idx = MMU_PAGE_64K;
+				break;
+			case 0x14:
+				idx = MMU_PAGE_1M;
+				break;
+			case 0x18:
+				idx = MMU_PAGE_16M;
+				cur_cpu_spec->cpu_features |= CPU_FTR_16M_PAGE;
+				break;
+			case 0x22:
+				idx = MMU_PAGE_16G;
+				break;
+			}
+			if (idx < 0)
+				continue;
+			def = &mmu_psize_defs[idx];
+			def->shift = shift;
+			if (shift <= 23)
+				def->avpnm = 0;
+			else
+				def->avpnm = (1 << (shift - 23)) - 1;
+			def->sllp = slbenc;
+			def->penc = lpenc;
+			/* We don't know for sure what's up with tlbiel, so
+			 * for now we only set it for 4K and 64K pages
+			 */
+			if (idx == MMU_PAGE_4K || idx == MMU_PAGE_64K)
+				def->tlbiel = 1;
+			else
+				def->tlbiel = 0;
+
+			DBG(" %d: shift=%02x, sllp=%04x, avpnm=%08x, "
+			    "tlbiel=%d, penc=%d\n",
+			    idx, shift, def->sllp, def->avpnm, def->tlbiel,
+			    def->penc);
 		}
+		return 1;
+	}
+	return 0;
+}
+
+
+static void __init htab_init_page_sizes(void)
+{
+	int rc;
+
+	/* Default to 4K pages only */
+	memcpy(mmu_psize_defs, mmu_psize_defaults_old,
+	       sizeof(mmu_psize_defaults_old));
+
+	/*
+	 * Try to find the available page sizes in the device-tree
+	 */
+	rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL);
+	if (rc != 0)  /* Found */
+		goto found;
+
+	/*
+	 * Not in the device-tree, let's fall back on the known size
+	 * list for 16M capable GP & GR
+	 */
+	if ((systemcfg->platform != PLATFORM_ISERIES_LPAR) &&
+	    cpu_has_feature(CPU_FTR_16M_PAGE))
+		memcpy(mmu_psize_defs, mmu_psize_defaults_gp,
+		       sizeof(mmu_psize_defaults_gp));
+ found:
+	/*
+	 * Pick a size for the linear mapping. Currently, we only support
+	 * 16M, 1M and 4K which is the default
+	 */
+	if (mmu_psize_defs[MMU_PAGE_16M].shift)
+		mmu_linear_psize = MMU_PAGE_16M;
+	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
+		mmu_linear_psize = MMU_PAGE_1M;
+
+	/*
+	 * Pick a size for the ordinary pages. The default is 4K; we
+	 * support 64K if cache-inhibited large pages are supported by
+	 * the processor
+	 */
+#ifdef CONFIG_PPC_64K_PAGES
+	if (mmu_psize_defs[MMU_PAGE_64K].shift &&
+	    cpu_has_feature(CPU_FTR_CI_LARGE_PAGE))
+		mmu_virtual_psize = MMU_PAGE_64K;
+#endif
+
+	printk(KERN_INFO "Page orders: linear mapping = %d, others = %d\n",
+	       mmu_psize_defs[mmu_linear_psize].shift,
+	       mmu_psize_defs[mmu_virtual_psize].shift);
+
+#ifdef CONFIG_HUGETLB_PAGE
+	/* Init large page size. Currently, we pick 16M or 1M depending
+	 * on what is available
+	 */
+	if (mmu_psize_defs[MMU_PAGE_16M].shift)
+		mmu_huge_psize = MMU_PAGE_16M;
+	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
+		mmu_huge_psize = MMU_PAGE_1M;
+
+	/* Calculate HPAGE_SHIFT and sanity check it */
+	if (mmu_psize_defs[mmu_huge_psize].shift > 16 &&
+	    mmu_psize_defs[mmu_huge_psize].shift < 28)
+		HPAGE_SHIFT = mmu_psize_defs[mmu_huge_psize].shift;
+	else
+		HPAGE_SHIFT = 0; /* No huge pages dude ! */
+#endif /* CONFIG_HUGETLB_PAGE */
+}
+
+static int __init htab_dt_scan_pftsize(unsigned long node,
+				       const char *uname, int depth,
+				       void *data)
+{
+	char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+	u32 *prop;
+
+	/* We are scanning "cpu" nodes only */
+	if (type == NULL || strcmp(type, "cpu") != 0)
+		return 0;
+
+	prop = (u32 *)of_get_flat_dt_prop(node, "ibm,pft-size", NULL);
+	if (prop != NULL) {
+		/* pft_size[0] is the NUMA CEC cookie */
+		ppc64_pft_size = prop[1];
+		return 1;
 	}
+	return 0;
 }
 
-static unsigned long get_hashtable_size(void)
+static unsigned long __init htab_get_table_size(void)
 {
 	unsigned long rnd_mem_size, pteg_count;
 
-	/* If hash size wasn't obtained in prom.c, we calculate it now based on
-	 * the total RAM size
+	/* If the hash size isn't already provided by the platform, we try to
+	 * retrieve it from the device-tree. If it's not there either, we
+	 * calculate it now based on the total RAM size
 	 */
+	if (ppc64_pft_size == 0)
+		of_scan_flat_dt(htab_dt_scan_pftsize, NULL);
 	if (ppc64_pft_size)
 		return 1UL << ppc64_pft_size;
 
@@ -181,17 +390,21 @@ void __init htab_initialize(void)
 	unsigned long table, htab_size_bytes;
 	unsigned long pteg_count;
 	unsigned long mode_rw;
-	int i, use_largepages = 0;
 	unsigned long base = 0, size = 0;
+	int i;
+
 	extern unsigned long tce_alloc_start, tce_alloc_end;
 
 	DBG(" -> htab_initialize()\n");
 
+	/* Initialize page sizes */
+	htab_init_page_sizes();
+
 	/*
 	 * Calculate the required size of the htab.  We want the number of
 	 * PTEGs to equal one half the number of real pages.
 	 */ 
-	htab_size_bytes = get_hashtable_size();
+	htab_size_bytes = htab_get_table_size();
 	pteg_count = htab_size_bytes >> 7;
 
 	/* For debug, make the HTAB 1/8 as big as it normally would be. */
@@ -211,14 +424,11 @@ void __init htab_initialize(void)
 		 * the absolute address space.
 		 */
 		table = lmb_alloc(htab_size_bytes, htab_size_bytes);
+		BUG_ON(table == 0);
 
 		DBG("Hash table allocated at %lx, size: %lx\n", table,
 		    htab_size_bytes);
 
-		if ( !table ) {
-			ppc64_terminate_msg(0x20, "hpt space");
-			loop_forever();
-		}
 		htab_address = abs_to_virt(table);
 
 		/* htab absolute addr + encoded htabsize */
@@ -234,8 +444,6 @@ void __init htab_initialize(void)
 	 * _NOT_ map it to avoid cache paradoxes as it's remapped non
 	 * cacheable later on
 	 */
-	if (cpu_has_feature(CPU_FTR_16M_PAGE))
-		use_largepages = 1;
 
 	/* create bolted the linear mapping in the hash table */
 	for (i=0; i < lmb.memory.cnt; i++) {
@@ -246,27 +454,32 @@ void __init htab_initialize(void)
 
 #ifdef CONFIG_U3_DART
 		/* Do not map the DART space. Fortunately, it will be aligned
-		 * in such a way that it will not cross two lmb regions and will
-		 * fit within a single 16Mb page.
-		 * The DART space is assumed to be a full 16Mb region even if we
-		 * only use 2Mb of that space. We will use more of it later for
-		 * AGP GART. We have to use a full 16Mb large page.
+		 * in such a way that it will not cross two lmb regions and
+		 * will fit within a single 16Mb page.
+		 * The DART space is assumed to be a full 16Mb region even if
+		 * we only use 2Mb of that space. We will use more of it later
+		 * for AGP GART. We have to use a full 16Mb large page.
 		 */
 		DBG("DART base: %lx\n", dart_tablebase);
 
 		if (dart_tablebase != 0 && dart_tablebase >= base
 		    && dart_tablebase < (base + size)) {
 			if (base != dart_tablebase)
-				create_pte_mapping(base, dart_tablebase, mode_rw,
-						   use_largepages);
+				BUG_ON(htab_bolt_mapping(base, dart_tablebase,
+							 base, mode_rw,
+							 mmu_linear_psize));
 			if ((base + size) > (dart_tablebase + 16*MB))
-				create_pte_mapping(dart_tablebase + 16*MB, base + size,
-						   mode_rw, use_largepages);
+				BUG_ON(htab_bolt_mapping(dart_tablebase+16*MB,
+							 base + size,
+							 dart_tablebase+16*MB,
+							 mode_rw,
+							 mmu_linear_psize));
 			continue;
 		}
 #endif /* CONFIG_U3_DART */
-		create_pte_mapping(base, base + size, mode_rw, use_largepages);
-	}
+		BUG_ON(htab_bolt_mapping(base, base + size, base,
+					 mode_rw, mmu_linear_psize));
+	}
 
 	/*
 	 * If we have a memory_limit and we've allocated TCEs then we need to
@@ -282,8 +495,9 @@ void __init htab_initialize(void)
 		if (base + size >= tce_alloc_start)
 			tce_alloc_start = base + size + 1;
 
-		create_pte_mapping(tce_alloc_start, tce_alloc_end,
-			mode_rw, use_largepages);
+		BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end,
+					 tce_alloc_start, mode_rw,
+					 mmu_linear_psize));
 	}
 
 	DBG(" <- htab_initialize()\n");
@@ -298,9 +512,6 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
 {
 	struct page *page;
 
-	if (!pfn_valid(pte_pfn(pte)))
-		return pp;
-
 	page = pte_page(pte);
 
 	/* page is dirty */
@@ -309,7 +520,7 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
 			__flush_dcache_icache(page_address(page));
 			set_bit(PG_arch_1, &page->flags);
 		} else
-			pp |= HW_NO_EXEC;
+			pp |= HPTE_R_N;
 	}
 	return pp;
 }
@@ -325,94 +536,169 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 	unsigned long vsid;
 	struct mm_struct *mm;
 	pte_t *ptep;
-	int ret;
-	int user_region = 0;
-	int local = 0;
 	cpumask_t tmp;
+	int rc, user_region = 0, local = 0;
 
-	if ((ea & ~REGION_MASK) >= PGTABLE_RANGE)
-		return 1;
+	DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n",
+		ea, access, trap);
 
+	if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) {
+		DBG_LOW(" out of pgtable range !\n");
+		return 1;
+	}
+
+	/* Get region & vsid */
  	switch (REGION_ID(ea)) {
 	case USER_REGION_ID:
 		user_region = 1;
 		mm = current->mm;
-		if (! mm)
+		if (! mm) {
+			DBG_LOW(" user region with no mm !\n");
 			return 1;
-
+		}
 		vsid = get_vsid(mm->context.id, ea);
 		break;
 	case VMALLOC_REGION_ID:
 		mm = &init_mm;
 		vsid = get_kernel_vsid(ea);
 		break;
-#if 0
-	case KERNEL_REGION_ID:
-		/*
-		 * Should never get here - entire 0xC0... region is bolted.
-		 * Send the problem up to do_page_fault 
-		 */
-#endif
 	default:
 		/* Not a valid range
 		 * Send the problem up to do_page_fault 
 		 */
 		return 1;
-		break;
 	}
+	DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid);
 
+	/* Get pgdir */
 	pgdir = mm->pgd;
-
 	if (pgdir == NULL)
 		return 1;
 
+	/* Check CPU locality */
 	tmp = cpumask_of_cpu(smp_processor_id());
 	if (user_region && cpus_equal(mm->cpu_vm_mask, tmp))
 		local = 1;
 
-	/* Is this a huge page ? */
-	if (unlikely(in_hugepage_area(mm->context, ea)))
-		ret = hash_huge_page(mm, access, ea, vsid, local);
-	else {
-		ptep = find_linux_pte(pgdir, ea);
-		if (ptep == NULL)
-			return 1;
-		ret = __hash_page(ea, access, vsid, ptep, trap, local);
+	/* Handle hugepage regions */
+	if (unlikely(in_hugepage_area(mm->context, ea))) {
+		DBG_LOW(" -> huge page !\n");
+		return hash_huge_page(mm, access, ea, vsid, local);
+	}
+
+	/* Get PTE and page size from page tables */
+	ptep = find_linux_pte(pgdir, ea);
+	if (ptep == NULL || !pte_present(*ptep)) {
+		DBG_LOW(" no PTE !\n");
+		return 1;
+	}
+
+#ifndef CONFIG_PPC_64K_PAGES
+	DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
+#else
+	DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep),
+		pte_val(*(ptep + PTRS_PER_PTE)));
+#endif
+	/* Pre-check access permissions (will be re-checked atomically
+	 * in __hash_page_XX but this pre-check is a fast path
+	 */
+	if (access & ~pte_val(*ptep)) {
+		DBG_LOW(" no access !\n");
+		return 1;
 	}
 
-	return ret;
+	/* Do actual hashing */
+#ifndef CONFIG_PPC_64K_PAGES
+	rc = __hash_page_4K(ea, access, vsid, ptep, trap, local);
+#else
+	if (mmu_virtual_psize == MMU_PAGE_64K)
+		rc = __hash_page_64K(ea, access, vsid, ptep, trap, local);
+	else
+		rc = __hash_page_4K(ea, access, vsid, ptep, trap, local);
+#endif /* CONFIG_PPC_64K_PAGES */
+
+#ifndef CONFIG_PPC_64K_PAGES
+	DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep));
+#else
+	DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep),
+		pte_val(*(ptep + PTRS_PER_PTE)));
+#endif
+	DBG_LOW(" -> rc=%d\n", rc);
+	return rc;
 }
 
-void flush_hash_page(unsigned long va, pte_t pte, int local)
+void hash_preload(struct mm_struct *mm, unsigned long ea,
+		  unsigned long access, unsigned long trap)
 {
-	unsigned long vpn, hash, secondary, slot;
-	unsigned long huge = pte_huge(pte);
+	unsigned long vsid;
+	void *pgdir;
+	pte_t *ptep;
+	cpumask_t mask;
+	unsigned long flags;
+	int local = 0;
+
+	/* We don't want huge pages prefaulted for now
+	 */
+	if (unlikely(in_hugepage_area(mm->context, ea)))
+		return;
+
+	DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx,"
+		" trap=%lx\n", mm, mm->pgd, ea, access, trap);
 
-	if (huge)
-		vpn = va >> HPAGE_SHIFT;
+	/* Get PTE, VSID, access mask */
+	pgdir = mm->pgd;
+	if (pgdir == NULL)
+		return;
+	ptep = find_linux_pte(pgdir, ea);
+	if (!ptep)
+		return;
+	vsid = get_vsid(mm->context.id, ea);
+
+	/* Hash it in */
+	local_irq_save(flags);
+	mask = cpumask_of_cpu(smp_processor_id());
+	if (cpus_equal(mm->cpu_vm_mask, mask))
+		local = 1;
+#ifndef CONFIG_PPC_64K_PAGES
+	__hash_page_4K(ea, access, vsid, ptep, trap, local);
+#else
+	if (mmu_virtual_psize == MMU_PAGE_64K)
+		__hash_page_64K(ea, access, vsid, ptep, trap, local);
 	else
-		vpn = va >> PAGE_SHIFT;
-	hash = hpt_hash(vpn, huge);
-	secondary = (pte_val(pte) & _PAGE_SECONDARY) >> 15;
-	if (secondary)
-		hash = ~hash;
-	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-	slot += (pte_val(pte) & _PAGE_GROUP_IX) >> 12;
-
-	ppc_md.hpte_invalidate(slot, va, huge, local);
+		__hash_page_4K(ea, access, vsid, ptep, trap, local);
+#endif /* CONFIG_PPC_64K_PAGES */
+	local_irq_restore(flags);
+}
+
+void flush_hash_page(unsigned long va, real_pte_t pte, int psize, int local)
+{
+	unsigned long hash, index, shift, hidx, slot;
+
+	DBG_LOW("flush_hash_page(va=%016x)\n", va);
+	pte_iterate_hashed_subpages(pte, psize, va, index, shift) {
+		hash = hpt_hash(va, shift);
+		hidx = __rpte_to_hidx(pte, index);
+		if (hidx & _PTEIDX_SECONDARY)
+			hash = ~hash;
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += hidx & _PTEIDX_GROUP_IX;
+		DBG_LOW(" sub %d: hash=%x, hidx=%x\n", index, slot, hidx);
+		ppc_md.hpte_invalidate(slot, va, psize, local);
+	} pte_iterate_hashed_end();
 }
 
 void flush_hash_range(unsigned long number, int local)
 {
-	if (ppc_md.flush_hash_range) {
+	if (ppc_md.flush_hash_range)
 		ppc_md.flush_hash_range(number, local);
-	} else {
+	else {
 		int i;
 		struct ppc64_tlb_batch *batch =
 			&__get_cpu_var(ppc64_tlb_batch);
 
 		for (i = 0; i < number; i++)
-			flush_hash_page(batch->vaddr[i], batch->pte[i], local);
+			flush_hash_page(batch->vaddr[i], batch->pte[i],
+					batch->psize, local);
 	}
 }
 
@@ -452,6 +738,18 @@ void __init htab_finish_init(void)
 	extern unsigned int *htab_call_hpte_remove;
 	extern unsigned int *htab_call_hpte_updatepp;
 
+#ifdef CONFIG_PPC_64K_PAGES
+	extern unsigned int *ht64_call_hpte_insert1;
+	extern unsigned int *ht64_call_hpte_insert2;
+	extern unsigned int *ht64_call_hpte_remove;
+	extern unsigned int *ht64_call_hpte_updatepp;
+
+	make_bl(ht64_call_hpte_insert1, ppc_md.hpte_insert);
+	make_bl(ht64_call_hpte_insert2, ppc_md.hpte_insert);
+	make_bl(ht64_call_hpte_remove, ppc_md.hpte_remove);
+	make_bl(ht64_call_hpte_updatepp, ppc_md.hpte_updatepp);
+#endif /* CONFIG_PPC_64K_PAGES */
+
 	make_bl(htab_call_hpte_insert1, ppc_md.hpte_insert);
 	make_bl(htab_call_hpte_insert2, ppc_md.hpte_insert);
 	make_bl(htab_call_hpte_remove, ppc_md.hpte_remove);
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 0ea0994ed974e0185bad810cc1757cc4f20f5063..0073a04047e48b6a7b8144ee1538142925d7bdb3 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -47,10 +47,25 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 		pu = pud_offset(pg, addr);
 		if (!pud_none(*pu)) {
 			pm = pmd_offset(pu, addr);
+#ifdef CONFIG_PPC_64K_PAGES
+			/* Currently, we use the normal PTE offset within full
+			 * size PTE pages, thus our huge PTEs are scattered in
+			 * the PTE page and we do waste some. We may change
+			 * that in the future, but the current mechanism keeps
+			 * things much simpler
+			 */
+			if (!pmd_none(*pm)) {
+				/* Note: pte_offset_* are all equivalent on
+				 * ppc64 as we don't have HIGHMEM
+				 */
+				pt = pte_offset_kernel(pm, addr);
+				return pt;
+			}
+#else /* CONFIG_PPC_64K_PAGES */
+			/* On 4k pages, we put huge PTEs in the PMD page */
 			pt = (pte_t *)pm;
-			BUG_ON(!pmd_none(*pm)
-			       && !(pte_present(*pt) && pte_huge(*pt)));
 			return pt;
+#endif /* CONFIG_PPC_64K_PAGES */
 		}
 	}
 
@@ -74,9 +89,16 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 	if (pu) {
 		pm = pmd_alloc(mm, pu, addr);
 		if (pm) {
+#ifdef CONFIG_PPC_64K_PAGES
+			/* See comment in huge_pte_offset. Note that if we ever
+			 * want to put the page size in the PMD, we would have
+			 * to open code our own pte_alloc* function in order
+			 * to populate and set the size atomically
+			 */
+			pt = pte_alloc_map(mm, pm, addr);
+#else /* CONFIG_PPC_64K_PAGES */
 			pt = (pte_t *)pm;
-			BUG_ON(!pmd_none(*pm)
-			       && !(pte_present(*pt) && pte_huge(*pt)));
+#endif /* CONFIG_PPC_64K_PAGES */
 			return pt;
 		}
 	}
@@ -84,35 +106,29 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 	return NULL;
 }
 
-#define HUGEPTE_BATCH_SIZE	(HPAGE_SIZE / PMD_SIZE)
-
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, pte_t pte)
 {
-	int i;
-
 	if (pte_present(*ptep)) {
-		pte_clear(mm, addr, ptep);
+		/* We open-code pte_clear because we need to pass the right
+		 * argument to hpte_update (huge / !huge)
+		 */
+		unsigned long old = pte_update(ptep, ~0UL);
+		if (old & _PAGE_HASHPTE)
+			hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
 		flush_tlb_pending();
 	}
-
-	for (i = 0; i < HUGEPTE_BATCH_SIZE; i++) {
-		*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
-		ptep++;
-	}
+	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
 }
 
 pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep)
 {
 	unsigned long old = pte_update(ptep, ~0UL);
-	int i;
 
 	if (old & _PAGE_HASHPTE)
-		hpte_update(mm, addr, old, 0);
-
-	for (i = 1; i < HUGEPTE_BATCH_SIZE; i++)
-		ptep[i] = __pte(0);
+		hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
+	*ptep = __pte(0);
 
 	return __pte(old);
 }
@@ -563,6 +579,8 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	int lastshift;
 	u16 areamask, curareas;
 
+	if (HPAGE_SHIFT == 0)
+		return -EINVAL;
 	if (len & ~HPAGE_MASK)
 		return -EINVAL;
 
@@ -619,19 +637,15 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
 		   unsigned long ea, unsigned long vsid, int local)
 {
 	pte_t *ptep;
-	unsigned long va, vpn;
-	pte_t old_pte, new_pte;
-	unsigned long rflags, prpn;
+	unsigned long old_pte, new_pte;
+	unsigned long va, rflags, pa;
 	long slot;
 	int err = 1;
 
-	spin_lock(&mm->page_table_lock);
-
 	ptep = huge_pte_offset(mm, ea);
 
 	/* Search the Linux page table for a match with va */
 	va = (vsid << 28) | (ea & 0x0fffffff);
-	vpn = va >> HPAGE_SHIFT;
 
 	/*
 	 * If no pte found or not present, send the problem up to
@@ -640,8 +654,6 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
 	if (unlikely(!ptep || pte_none(*ptep)))
 		goto out;
 
-/* 	BUG_ON(pte_bad(*ptep)); */
-
 	/* 
 	 * Check the user's access rights to the page.  If access should be
 	 * prevented then send the problem up to do_page_fault.
@@ -661,58 +673,64 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
 	 */
 
 
-	old_pte = *ptep;
-	new_pte = old_pte;
-
-	rflags = 0x2 | (! (pte_val(new_pte) & _PAGE_RW));
+	do {
+		old_pte = pte_val(*ptep);
+		if (old_pte & _PAGE_BUSY)
+			goto out;
+		new_pte = old_pte | _PAGE_BUSY |
+			_PAGE_ACCESSED | _PAGE_HASHPTE;
+	} while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
+					 old_pte, new_pte));
+
+	rflags = 0x2 | (!(new_pte & _PAGE_RW));
 	/* _PAGE_EXEC -> HPTE_R_N since it's inverted */
-	rflags |= ((pte_val(new_pte) & _PAGE_EXEC) ? 0 : HW_NO_EXEC);
+	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
 
 	/* Check if pte already has an hpte (case 2) */
-	if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) {
+	if (unlikely(old_pte & _PAGE_HASHPTE)) {
 		/* There MIGHT be an HPTE for this pte */
 		unsigned long hash, slot;
 
-		hash = hpt_hash(vpn, 1);
-		if (pte_val(old_pte) & _PAGE_SECONDARY)
+		hash = hpt_hash(va, HPAGE_SHIFT);
+		if (old_pte & _PAGE_F_SECOND)
 			hash = ~hash;
 		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-		slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12;
+		slot += (old_pte & _PAGE_F_GIX) >> 12;
 
 		if (ppc_md.hpte_updatepp(slot, rflags, va, 1, local) == -1)
-			pte_val(old_pte) &= ~_PAGE_HPTEFLAGS;
+			old_pte &= ~_PAGE_HPTEFLAGS;
 	}
 
-	if (likely(!(pte_val(old_pte) & _PAGE_HASHPTE))) {
-		unsigned long hash = hpt_hash(vpn, 1);
+	if (likely(!(old_pte & _PAGE_HASHPTE))) {
+		unsigned long hash = hpt_hash(va, HPAGE_SHIFT);
 		unsigned long hpte_group;
 
-		prpn = pte_pfn(old_pte);
+		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
 
 repeat:
 		hpte_group = ((hash & htab_hash_mask) *
 			      HPTES_PER_GROUP) & ~0x7UL;
 
-		/* Update the linux pte with the HPTE slot */
-		pte_val(new_pte) &= ~_PAGE_HPTEFLAGS;
-		pte_val(new_pte) |= _PAGE_HASHPTE;
+		/* Clear HPTE slot information in the new PTE */
+		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
 
 		/* Add in WIMG bits */
 		/* XXX We should store these in the pte */
+		/* --BenH: I think they are ... */
 		rflags |= _PAGE_COHERENT;
 
-		slot = ppc_md.hpte_insert(hpte_group, va, prpn,
-					  HPTE_V_LARGE, rflags);
+		/* Insert into the hash table, primary slot */
+		slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
+					  mmu_huge_psize);
 
 		/* Primary is full, try the secondary */
 		if (unlikely(slot == -1)) {
-			pte_val(new_pte) |= _PAGE_SECONDARY;
+			new_pte |= _PAGE_F_SECOND;
 			hpte_group = ((~hash & htab_hash_mask) *
 				      HPTES_PER_GROUP) & ~0x7UL; 
-			slot = ppc_md.hpte_insert(hpte_group, va, prpn,
-						  HPTE_V_LARGE |
+			slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
 						  HPTE_V_SECONDARY,
-						  rflags);
+						  mmu_huge_psize);
 			if (slot == -1) {
 				if (mftb() & 0x1)
 					hpte_group = ((hash & htab_hash_mask) *
@@ -726,20 +744,18 @@ repeat:
 		if (unlikely(slot == -2))
 			panic("hash_huge_page: pte_insert failed\n");
 
-		pte_val(new_pte) |= (slot<<12) & _PAGE_GROUP_IX;
-
-		/* 
-		 * No need to use ldarx/stdcx here because all who
-		 * might be updating the pte will hold the
-		 * page_table_lock
-		 */
-		*ptep = new_pte;
+		new_pte |= (slot << 12) & _PAGE_F_GIX;
 	}
 
+	/*
+	 * No need to use ldarx/stdcx here because all who
+	 * might be updating the pte will hold the
+	 * page_table_lock
+	 */
+	*ptep = __pte(new_pte & ~_PAGE_BUSY);
+
 	err = 0;
 
  out:
-	spin_unlock(&mm->page_table_lock);
-
 	return err;
 }
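
With the page_table_lock gone from hash_huge_page(), concurrent
hashers are fended off by the _PAGE_BUSY software bit, taken with a
compare-and-exchange loop. The same pattern re-expressed with C11
atomics (bit values illustrative, __cmpxchg_u64 replaced by the
standard primitive):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdint.h>

	#define _PAGE_BUSY	0x0800UL	/* illustrative values */
	#define _PAGE_ACCESSED	0x0008UL
	#define _PAGE_HASHPTE	0x0002UL

	/* Atomically mark a software PTE busy+accessed+hashpte, failing
	 * if another CPU already holds the busy bit -- the shape of the
	 * do/while(__cmpxchg_u64) loop in hash_huge_page().
	 */
	static bool pte_try_lock(_Atomic uint64_t *ptep, uint64_t *old_out)
	{
		uint64_t old_pte, new_pte;

		do {
			old_pte = atomic_load(ptep);
			if (old_pte & _PAGE_BUSY)
				return false;	/* someone else is hashing */
			new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED |
				  _PAGE_HASHPTE;
		} while (!atomic_compare_exchange_weak(ptep, &old_pte,
						       new_pte));

		*old_out = old_pte;
		return true;
	}

	int main(void)
	{
		_Atomic uint64_t pte = _PAGE_ACCESSED;
		uint64_t old;

		return pte_try_lock(&pte, &old) ? 0 : 1;
	}

A caller that gets false backs out just like the goto out path above;
the winner clears _PAGE_BUSY when it rewrites the PTE at the end.
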
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index b0fc822ec29f5b660825315a260317bd1a4e9114..dfe7fa37b41a16835b629964d2b21faefe2d97b9 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -188,12 +188,21 @@ static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
 	memset(addr, 0, kmem_cache_size(cache));
 }
 
+#ifdef CONFIG_PPC_64K_PAGES
+static const int pgtable_cache_size[2] = {
+	PTE_TABLE_SIZE, PGD_TABLE_SIZE
+};
+static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
+	"pte_pmd_cache", "pgd_cache",
+};
+#else
 static const int pgtable_cache_size[2] = {
 	PTE_TABLE_SIZE, PMD_TABLE_SIZE
 };
 static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
 	"pgd_pte_cache", "pud_pmd_cache",
 };
+#endif /* CONFIG_PPC_64K_PAGES */
 
 kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
 
@@ -201,19 +210,14 @@ void pgtable_cache_init(void)
 {
 	int i;
 
-	BUILD_BUG_ON(PTE_TABLE_SIZE != pgtable_cache_size[PTE_CACHE_NUM]);
-	BUILD_BUG_ON(PMD_TABLE_SIZE != pgtable_cache_size[PMD_CACHE_NUM]);
-	BUILD_BUG_ON(PUD_TABLE_SIZE != pgtable_cache_size[PUD_CACHE_NUM]);
-	BUILD_BUG_ON(PGD_TABLE_SIZE != pgtable_cache_size[PGD_CACHE_NUM]);
-
 	for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) {
 		int size = pgtable_cache_size[i];
 		const char *name = pgtable_cache_name[i];
 
 		pgtable_cache[i] = kmem_cache_create(name,
 						     size, size,
-						     SLAB_HWCACHE_ALIGN
-						     | SLAB_MUST_HWCACHE_ALIGN,
+						     SLAB_HWCACHE_ALIGN |
+						     SLAB_MUST_HWCACHE_ALIGN,
 						     zero_ctor,
 						     NULL);
 		if (! pgtable_cache[i])
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 117b00012e144b0221fe1bd254a0e291571cfcc6..7faa46b71f21ee2db7a6c4d783206c747cdaad8d 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -61,6 +61,9 @@ int init_bootmem_done;
 int mem_init_done;
 unsigned long memory_limit;
 
+extern void hash_preload(struct mm_struct *mm, unsigned long ea,
+			 unsigned long access, unsigned long trap);
+
 /*
  * This is called by /dev/mem to know if a given address has to
  * be mapped non-cacheable or not
@@ -493,18 +496,10 @@ EXPORT_SYMBOL(flush_icache_user_range);
 void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 		      pte_t pte)
 {
-	/* handle i-cache coherency */
-	unsigned long pfn = pte_pfn(pte);
-#ifdef CONFIG_PPC32
-	pmd_t *pmd;
-#else
-	unsigned long vsid;
-	void *pgdir;
-	pte_t *ptep;
-	int local = 0;
-	cpumask_t tmp;
-	unsigned long flags;
+#ifdef CONFIG_PPC_STD_MMU
+	unsigned long access = 0, trap;
 #endif
+	unsigned long pfn = pte_pfn(pte);
 
 	/* handle i-cache coherency */
 	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) &&
@@ -535,30 +530,21 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 	/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
 	if (!pte_young(pte) || address >= TASK_SIZE)
 		return;
-#ifdef CONFIG_PPC32
-	if (Hash == 0)
-		return;
-	pmd = pmd_offset(pgd_offset(vma->vm_mm, address), address);
-	if (!pmd_none(*pmd))
-		add_hash_page(vma->vm_mm->context, address, pmd_val(*pmd));
-#else
-	pgdir = vma->vm_mm->pgd;
-	if (pgdir == NULL)
-		return;
 
-	ptep = find_linux_pte(pgdir, address);
-	if (!ptep)
+	/* We try to figure out if we are coming from an instruction
+	 * access fault and pass that down to __hash_page so we avoid
+	 * double-faulting on execution of fresh text. We have to test
+	 * for regs NULL since init will get here first thing at boot
+	 *
+	 * We also avoid filling the hash if not coming from a fault
+	 */
+	if (current->thread.regs == NULL)
 		return;
-
-	vsid = get_vsid(vma->vm_mm->context.id, address);
-
-	local_irq_save(flags);
-	tmp = cpumask_of_cpu(smp_processor_id());
-	if (cpus_equal(vma->vm_mm->cpu_vm_mask, tmp))
-		local = 1;
-
-	__hash_page(address, 0, vsid, ptep, 0x300, local);
-	local_irq_restore(flags);
-#endif
-#endif
+	trap = TRAP(current->thread.regs);
+	if (trap == 0x400)
+		access |= _PAGE_EXEC;
+	else if (trap != 0x300)
+		return;
+	hash_preload(vma->vm_mm, address, access, trap);
+#endif /* CONFIG_PPC_STD_MMU */
 }
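
The rewritten update_mmu_cache() derives the access type from the
saved trap number: vector 0x400 (instruction storage interrupt) means
the preload must ask for execute permission, 0x300 (data storage
interrupt) needs no extra bits, and anything else means we were not
called from a fault at all. The same decision as a tiny helper (the
_PAGE_EXEC value is illustrative):

	#include <stdio.h>

	#define _PAGE_EXEC 0x0004UL	/* illustrative value */

	static long trap_to_access(unsigned long trap)
	{
		if (trap == 0x400)	/* instruction storage interrupt */
			return _PAGE_EXEC;
		if (trap == 0x300)	/* data storage interrupt */
			return 0;
		return -1;		/* not called from a fault */
	}

	int main(void)
	{
		printf("0x400 -> %#lx\n", trap_to_access(0x400));
		return 0;
	}
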
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index b79a7820613558181d16a8d25dce4687e6a3c7e3..51b7869409715d8ce9ebf51d31a89b4258460c55 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -101,7 +101,6 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
 	pud_t *pudp;
 	pmd_t *pmdp;
 	pte_t *ptep;
-	unsigned long vsid;
 
 	if (mem_init_done) {
 		pgdp = pgd_offset_k(ea);
@@ -117,28 +116,15 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
 		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
 							  __pgprot(flags)));
 	} else {
-		unsigned long va, vpn, hash, hpteg;
-
 		/*
 		 * If the mm subsystem is not fully up, we cannot create a
 		 * linux page table entry for this mapping.  Simply bolt an
 		 * entry in the hardware page table.
 		 */
-		vsid = get_kernel_vsid(ea);
-		va = (vsid << 28) | (ea & 0xFFFFFFF);
-		vpn = va >> PAGE_SHIFT;
-
-		hash = hpt_hash(vpn, 0);
-
-		hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
-
-		/* Panic if a pte grpup is full */
-		if (ppc_md.hpte_insert(hpteg, va, pa >> PAGE_SHIFT,
-				       HPTE_V_BOLTED,
-				       _PAGE_NO_CACHE|_PAGE_GUARDED|PP_RWXX)
-		    == -1) {
-			panic("map_io_page: could not insert mapping");
-		}
+		if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
+				      mmu_virtual_psize))
+			panic("Can't map bolted IO mapping");
 	}
 	return 0;
 }
diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c
index cef9e83cc7e9e922d7a3e01fe1a4a16bd5bdbde5..d137abd241ff09dc0c7c520794c13b1c24016269 100644
--- a/arch/powerpc/mm/ppc_mmu_32.c
+++ b/arch/powerpc/mm/ppc_mmu_32.c
@@ -178,6 +178,21 @@ void __init setbat(int index, unsigned long virt, unsigned long phys,
 	bat_addrs[index].phys = phys;
 }
 
+/*
+ * Preload a translation in the hash table
+ */
+void hash_preload(struct mm_struct *mm, unsigned long ea,
+		  unsigned long access, unsigned long trap)
+{
+	pmd_t *pmd;
+
+	if (Hash == 0)
+		return;
+	pmd = pmd_offset(pgd_offset(mm, ea), ea);
+	if (!pmd_none(*pmd))
+		add_hash_page(mm->context, ea, pmd_val(*pmd));
+}
+
 /*
  * Initialize the hash table and patch the instructions in hashtable.S.
  */
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 0473953f6a37713cdf1cc8d18597511a299c102e..60e852f2f8e59de68fe63b8b43df3e4d7ee28d1a 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -14,14 +14,32 @@
  *      2 of the License, or (at your option) any later version.
  */
 
+#undef DEBUG
+
 #include <linux/config.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
 #include <asm/mmu_context.h>
 #include <asm/paca.h>
 #include <asm/cputable.h>
+#include <asm/cacheflush.h>
+
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
 
-extern void slb_allocate(unsigned long ea);
+extern void slb_allocate_realmode(unsigned long ea);
+extern void slb_allocate_user(unsigned long ea);
+
+static void slb_allocate(unsigned long ea)
+{
+	/* Currently, we do real mode for all SLBs including user, but
+	 * that will change if we bring back dynamic VSIDs
+	 */
+	slb_allocate_realmode(ea);
+}
 
 static inline unsigned long mk_esid_data(unsigned long ea, unsigned long slot)
 {
@@ -46,13 +64,15 @@ static void slb_flush_and_rebolt(void)
 {
 	/* If you change this make sure you change SLB_NUM_BOLTED
 	 * appropriately too. */
-	unsigned long ksp_flags = SLB_VSID_KERNEL;
+	unsigned long linear_llp, virtual_llp, lflags, vflags;
 	unsigned long ksp_esid_data;
 
 	WARN_ON(!irqs_disabled());
 
-	if (cpu_has_feature(CPU_FTR_16M_PAGE))
-		ksp_flags |= SLB_VSID_L;
+	linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
+	virtual_llp = mmu_psize_defs[mmu_virtual_psize].sllp;
+	lflags = SLB_VSID_KERNEL | linear_llp;
+	vflags = SLB_VSID_KERNEL | virtual_llp;
 
 	ksp_esid_data = mk_esid_data(get_paca()->kstack, 2);
 	if ((ksp_esid_data & ESID_MASK) == KERNELBASE)
@@ -67,9 +87,9 @@ static void slb_flush_and_rebolt(void)
 		     /* Slot 2 - kernel stack */
 		     "slbmte	%2,%3\n"
 		     "isync"
-		     :: "r"(mk_vsid_data(VMALLOCBASE, SLB_VSID_KERNEL)),
+		     :: "r"(mk_vsid_data(VMALLOCBASE, vflags)),
 		        "r"(mk_esid_data(VMALLOCBASE, 1)),
-		        "r"(mk_vsid_data(ksp_esid_data, ksp_flags)),
+		        "r"(mk_vsid_data(ksp_esid_data, lflags)),
 		        "r"(ksp_esid_data)
 		     : "memory");
 }
@@ -102,6 +122,9 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 
 	get_paca()->slb_cache_ptr = 0;
 	get_paca()->context = mm->context;
+#ifdef CONFIG_PPC_64K_PAGES
+	get_paca()->pgdir = mm->pgd;
+#endif /* CONFIG_PPC_64K_PAGES */
 
 	/*
 	 * preload some userspace segments into the SLB.
@@ -131,28 +154,77 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 	slb_allocate(unmapped_base);
 }
 
+static inline void patch_slb_encoding(unsigned int *insn_addr,
+				      unsigned int immed)
+{
+	/* Assume the instruction had a "0" immediate value, just
+	 * "or" in the new value
+	 */
+	*insn_addr |= immed;
+	flush_icache_range((unsigned long)insn_addr,
+			   (unsigned long)insn_addr + 4);
+}
+
 void slb_initialize(void)
 {
+	unsigned long linear_llp, virtual_llp;
+	static int slb_encoding_inited;
+	extern unsigned int *slb_miss_kernel_load_linear;
+	extern unsigned int *slb_miss_kernel_load_virtual;
+	extern unsigned int *slb_miss_user_load_normal;
+#ifdef CONFIG_HUGETLB_PAGE
+	extern unsigned int *slb_miss_user_load_huge;
+	unsigned long huge_llp;
+
+	huge_llp = mmu_psize_defs[mmu_huge_psize].sllp;
+#endif
+
+	/* Prepare our SLB miss handler based on our page size */
+	linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
+	virtual_llp = mmu_psize_defs[mmu_virtual_psize].sllp;
+	if (!slb_encoding_inited) {
+		slb_encoding_inited = 1;
+		patch_slb_encoding(slb_miss_kernel_load_linear,
+				   SLB_VSID_KERNEL | linear_llp);
+		patch_slb_encoding(slb_miss_kernel_load_virtual,
+				   SLB_VSID_KERNEL | virtual_llp);
+		patch_slb_encoding(slb_miss_user_load_normal,
+				   SLB_VSID_USER | virtual_llp);
+
+		DBG("SLB: linear  LLP = %04x\n", linear_llp);
+		DBG("SLB: virtual LLP = %04x\n", virtual_llp);
+#ifdef CONFIG_HUGETLB_PAGE
+		patch_slb_encoding(slb_miss_user_load_huge,
+				   SLB_VSID_USER | huge_llp);
+		DBG("SLB: huge    LLP = %04x\n", huge_llp);
+#endif
+	}
+
 	/* On iSeries the bolted entries have already been set up by
 	 * the hypervisor from the lparMap data in head.S */
 #ifndef CONFIG_PPC_ISERIES
-	unsigned long flags = SLB_VSID_KERNEL;
+ {
+	unsigned long lflags, vflags;
 
- 	/* Invalidate the entire SLB (even slot 0) & all the ERATS */
- 	if (cpu_has_feature(CPU_FTR_16M_PAGE))
- 		flags |= SLB_VSID_L;
+	lflags = SLB_VSID_KERNEL | linear_llp;
+	vflags = SLB_VSID_KERNEL | virtual_llp;
 
- 	asm volatile("isync":::"memory");
- 	asm volatile("slbmte  %0,%0"::"r" (0) : "memory");
+	/* Invalidate the entire SLB (even slot 0) & all the ERATS */
+	asm volatile("isync":::"memory");
+	asm volatile("slbmte  %0,%0"::"r" (0) : "memory");
 	asm volatile("isync; slbia; isync":::"memory");
-	create_slbe(KERNELBASE, flags, 0);
-	create_slbe(VMALLOCBASE, SLB_VSID_KERNEL, 1);
+	create_slbe(KERNELBASE, lflags, 0);
+
+	/* VMALLOC space has 4K pages always for now */
+	create_slbe(VMALLOCBASE, vflags, 1);
+
 	/* We don't bolt the stack for the time being - we're in boot,
 	 * so the stack is in the bolted segment.  By the time it goes
 	 * elsewhere, we'll call _switch() which will bolt in the new
 	 * one. */
 	asm volatile("isync":::"memory");
-#endif
+ }
+#endif /* CONFIG_PPC_ISERIES */
 
 	get_paca()->stab_rr = SLB_NUM_BOLTED;
 }
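
patch_slb_encoding() works only because every patch target is a
"li rN,0", i.e. an addi with a zero 16-bit immediate, so OR-ing the
SLB_VSID_* flags into the instruction word rewrites just the immediate
field. A user-space illustration of the opcode arithmetic (the flag
value is an example, not a real SLB_VSID_* constant):

	#include <stdint.h>
	#include <stdio.h>

	/* li rD,SIMM is addi rD,r0,SIMM: primary opcode 14 */
	static uint32_t ppc_li(unsigned int rd, uint16_t simm)
	{
		return (14u << 26) | (rd << 21) | simm;
	}

	int main(void)
	{
		uint32_t insn = ppc_li(11, 0);	/* li r11,0 as assembled */
		uint16_t llp  = 0x0100;		/* example flag bits */

		insn |= llp;	/* what patch_slb_encoding() does */
		printf("patched: 0x%08x (li r11,0x%x)\n",
		       (unsigned int)insn, llp);
		return 0;
	}

The icache flush after the store is what makes the patched copy
visible to instruction fetch.
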
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index a3a03da503bcbde0ed78b6611dbd42023e02618c..3e18241b6f35218b65619762f63fc1867052aa83 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -18,61 +18,28 @@
 
 #include <linux/config.h>
 #include <asm/processor.h>
-#include <asm/page.h>
-#include <asm/mmu.h>
 #include <asm/ppc_asm.h>
 #include <asm/asm-offsets.h>
 #include <asm/cputable.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
 
-/* void slb_allocate(unsigned long ea);
+/* void slb_allocate_realmode(unsigned long ea);
  *
  * Create an SLB entry for the given EA (user or kernel).
  * 	r3 = faulting address, r13 = PACA
  *	r9, r10, r11 are clobbered by this function
  * No other registers are examined or changed.
  */
-_GLOBAL(slb_allocate)
-	/*
-	 * First find a slot, round robin. Previously we tried to find
-	 * a free slot first but that took too long. Unfortunately we
-	 * dont have any LRU information to help us choose a slot.
-	 */
-#ifdef CONFIG_PPC_ISERIES
-	/*
-	 * On iSeries, the "bolted" stack segment can be cast out on
-	 * shared processor switch so we need to check for a miss on
-	 * it and restore it to the right slot.
-	 */
-	ld	r9,PACAKSAVE(r13)
-	clrrdi	r9,r9,28
-	clrrdi	r11,r3,28
-	li	r10,SLB_NUM_BOLTED-1	/* Stack goes in last bolted slot */
-	cmpld	r9,r11
-	beq	3f
-#endif /* CONFIG_PPC_ISERIES */
-
-	ld	r10,PACASTABRR(r13)
-	addi	r10,r10,1
-	/* use a cpu feature mask if we ever change our slb size */
-	cmpldi	r10,SLB_NUM_ENTRIES
-
-	blt+	4f
-	li	r10,SLB_NUM_BOLTED
-
-4:
-	std	r10,PACASTABRR(r13)
-3:
-	/* r3 = faulting address, r10 = entry */
+_GLOBAL(slb_allocate_realmode)
+	/* r3 = faulting address */
 
 	srdi	r9,r3,60		/* get region */
-	srdi	r3,r3,28		/* get esid */
+	srdi	r10,r3,28		/* get esid */
 	cmpldi	cr7,r9,0xc		/* cmp KERNELBASE for later use */
 
-	rldimi	r10,r3,28,0		/* r10= ESID<<28 | entry */
-	oris	r10,r10,SLB_ESID_V@h	/* r10 |= SLB_ESID_V */
-
-	/* r3 = esid, r10 = esid_data, cr7 = <>KERNELBASE */
-
+	/* r3 = address, r10 = esid, cr7 = <>KERNELBASE */
 	blt	cr7,0f			/* user or kernel? */
 
 	/* kernel address: proto-VSID = ESID */
@@ -81,43 +48,161 @@ _GLOBAL(slb_allocate)
 	 * top segment.  That's ok, the scramble below will translate
 	 * it to VSID 0, which is reserved as a bad VSID - one which
 	 * will never have any pages in it.  */
-	li	r11,SLB_VSID_KERNEL
-BEGIN_FTR_SECTION
-	bne	cr7,9f
-	li	r11,(SLB_VSID_KERNEL|SLB_VSID_L)
-END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
-	b	9f
 
-0:	/* user address: proto-VSID = context<<15 | ESID */
-	srdi.	r9,r3,USER_ESID_BITS
+	/* Check if hitting the linear mapping or the vmalloc/ioremap
+	 * kernel space
+	 */
+	bne	cr7,1f
+
+	/* Linear mapping encoding bits, the "li" instruction below will
+	 * be patched by the kernel at boot
+	 */
+_GLOBAL(slb_miss_kernel_load_linear)
+	li	r11,0
+	b	slb_finish_load
+
+1:	/* vmalloc/ioremap mapping encoding bits, the "li" instruction below
+	 * will be patched by the kernel at boot
+	 */
+_GLOBAL(slb_miss_kernel_load_virtual)
+	li	r11,0
+	b	slb_finish_load
+
+
+0:	/* user address: proto-VSID = context << 15 | ESID. First check
+	 * if the address is within the boundaries of the user region
+	 */
+	srdi.	r9,r10,USER_ESID_BITS
 	bne-	8f			/* invalid ea bits set */
 
+	/* Figure out if the segment contains huge pages */
 #ifdef CONFIG_HUGETLB_PAGE
 BEGIN_FTR_SECTION
+	b	1f
+END_FTR_SECTION_IFCLR(CPU_FTR_16M_PAGE)
 	lhz	r9,PACAHIGHHTLBAREAS(r13)
-	srdi	r11,r3,(HTLB_AREA_SHIFT-SID_SHIFT)
+	srdi	r11,r10,(HTLB_AREA_SHIFT-SID_SHIFT)
 	srd	r9,r9,r11
 	lhz	r11,PACALOWHTLBAREAS(r13)
-	srd	r11,r11,r3
-	or	r9,r9,r11
-END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
+	srd	r11,r11,r10
+	or.	r9,r9,r11
+	beq	1f
+_GLOBAL(slb_miss_user_load_huge)
+	li	r11,0
+	b	2f
+1:
 #endif /* CONFIG_HUGETLB_PAGE */
 
-	li	r11,SLB_VSID_USER
+_GLOBAL(slb_miss_user_load_normal)
+	li	r11,0
 
-#ifdef CONFIG_HUGETLB_PAGE
-BEGIN_FTR_SECTION
-	rldimi	r11,r9,8,55		/* shift masked bit into SLB_VSID_L */
-END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
-#endif /* CONFIG_HUGETLB_PAGE */
+2:
+	ld	r9,PACACONTEXTID(r13)
+	rldimi	r10,r9,USER_ESID_BITS,0
+	b	slb_finish_load
+
+8:	/* invalid EA */
+	li	r10,0			/* BAD_VSID */
+	li	r11,SLB_VSID_USER	/* flags don't much matter */
+	b	slb_finish_load
+
+#ifdef __DISABLED__
+
+/* void slb_allocate_user(unsigned long ea);
+ *
+ * Create an SLB entry for the given user EA.
+ * 	r3 = faulting address, r13 = PACA
+ *	r9, r10, r11 are clobbered by this function
+ * No other registers are examined or changed.
+ *
+ * It is called with translation enabled in order to be able to walk the
+ * page tables. This is not currently used.
+ */
+_GLOBAL(slb_allocate_user)
+	/* r3 = faulting address */
+	srdi	r10,r3,28		/* get esid */
+
+	crset	4*cr7+lt		/* set "user" flag for later */
+
+	/* Check if we fit in the range covered by the page tables */
+	srdi.	r9,r3,PGTABLE_EADDR_SIZE
+	crnot	4*cr0+eq,4*cr0+eq
+	beqlr
 
+	/* now we need to get to the page tables in order to get the page
+	 * size encoding from the PMD. In the future, we'll be able to deal
+	 * with 1T segments too by getting the encoding from the PGD instead
+	 */
+	ld	r9,PACAPGDIR(r13)
+	cmpldi	cr0,r9,0
+	beqlr
+	rlwinm	r11,r10,8,25,28
+	ldx	r9,r9,r11		/* get pgd_t */
+	cmpldi	cr0,r9,0
+	beqlr
+	rlwinm	r11,r10,3,17,28
+	ldx	r9,r9,r11		/* get pmd_t */
+	cmpldi	cr0,r9,0
+	beqlr
+
+	/* build vsid flags */
+	andi.	r11,r9,SLB_VSID_LLP
+	ori	r11,r11,SLB_VSID_USER
+
+	/* get context to calculate proto-VSID */
 	ld	r9,PACACONTEXTID(r13)
-	rldimi	r3,r9,USER_ESID_BITS,0
+	rldimi	r10,r9,USER_ESID_BITS,0
+
+	/* fall through slb_finish_load */
+
+#endif /* __DISABLED__ */
 
-9:	/* r3 = protovsid, r11 = flags, r10 = esid_data, cr7 = <>KERNELBASE */
-	ASM_VSID_SCRAMBLE(r3,r9)
 
-	rldimi	r11,r3,SLB_VSID_SHIFT,16	/* combine VSID and flags */
+/*
+ * Finish loading of an SLB entry and return
+ *
+ * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9, cr7 = <>KERNELBASE
+ */
+slb_finish_load:
+	ASM_VSID_SCRAMBLE(r10,r9)
+	rldimi	r11,r10,SLB_VSID_SHIFT,16	/* combine VSID and flags */
+
+	/* r3 = EA, r11 = VSID data */
+	/*
+	 * Find a slot, round robin. Previously we tried to find a
+	 * free slot first but that took too long. Unfortunately we
+	 * don't have any LRU information to help us choose a slot.
+	 */
+#ifdef CONFIG_PPC_ISERIES
+	/*
+	 * On iSeries, the "bolted" stack segment can be cast out on
+	 * shared processor switch so we need to check for a miss on
+	 * it and restore it to the right slot.
+	 */
+	ld	r9,PACAKSAVE(r13)
+	clrrdi	r9,r9,28
+	clrrdi	r3,r3,28
+	li	r10,SLB_NUM_BOLTED-1	/* Stack goes in last bolted slot */
+	cmpld	r9,r3
+	beq	3f
+#endif /* CONFIG_PPC_ISERIES */
+
+	ld	r10,PACASTABRR(r13)
+	addi	r10,r10,1
+	/* use a cpu feature mask if we ever change our slb size */
+	cmpldi	r10,SLB_NUM_ENTRIES
+
+	blt+	4f
+	li	r10,SLB_NUM_BOLTED
+
+4:
+	std	r10,PACASTABRR(r13)
+
+3:
+	rldimi	r3,r10,0,36		/* r3= EA[0:35] | entry */
+	oris	r10,r3,SLB_ESID_V@h	/* r10 = r3 | SLB_ESID_V */
+
+	/* r3 = ESID data, r11 = VSID data */
 
 	/*
 	 * No need for an isync before or after this slbmte. The exception
@@ -125,7 +210,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
 	 */
 	slbmte	r11,r10
 
-	bgelr	cr7			/* we're done for kernel addresses */
+	/* we're done for kernel addresses */
+	crclr	4*cr0+eq		/* set result to "success" */
+	bgelr	cr7
 
 	/* Update the slb cache */
 	lhz	r3,PACASLBCACHEPTR(r13)	/* offset = paca->slb_cache_ptr */
@@ -143,9 +230,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
 	li	r3,SLB_CACHE_ENTRIES+1
 2:
 	sth	r3,PACASLBCACHEPTR(r13)	/* paca->slb_cache_ptr = offset */
+	crclr	4*cr0+eq		/* set result to "success" */
 	blr
 
-8:	/* invalid EA */
-	li	r3,0			/* BAD_VSID */
-	li	r11,SLB_VSID_USER	/* flags don't much matter */
-	b	9b
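
slb_finish_load runs the proto-VSID through ASM_VSID_SCRAMBLE, a
multiplicative hash reduced modulo 2^36 - 1. Because the modulus is
one less than a power of two, the reduction folds the high bits back
onto the low 36 instead of dividing. A C rendering, assuming the
era's constants (prime multiplier 200730139, 36-bit VSIDs):

	#include <stdio.h>

	#define VSID_MULTIPLIER	200730139UL	/* assumed 28-bit prime */
	#define VSID_BITS	36
	#define VSID_MODULUS	((1UL << VSID_BITS) - 1)

	/* Multiply, then reduce mod 2^36-1 by adding the high bits to
	 * the low ones; valid because the product fits in 64 bits for
	 * proto-VSIDs under 2^36, so one conditional subtract suffices.
	 */
	static unsigned long vsid_scramble(unsigned long protovsid)
	{
		unsigned long x = protovsid * VSID_MULTIPLIER;

		x = (x >> VSID_BITS) + (x & VSID_MODULUS);
		if (x >= VSID_MODULUS)
			x -= VSID_MODULUS;
		return x;
	}

	int main(void)
	{
		printf("vsid(0xc0) = %#lx\n", vsid_scramble(0xc0));
		return 0;
	}

Proto-VSID 0 scrambles to 0, which is why the invalid-EA path above
can hand back a zero "BAD_VSID" and rely on that segment never
containing pages.
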
diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c
index 1b83f002bf27f421d33c61afc2a1e27f0bb60628..fa325dbf98fc2d14ac3be5774feafbcd3f4cd21c 100644
--- a/arch/powerpc/mm/stab.c
+++ b/arch/powerpc/mm/stab.c
@@ -26,7 +26,6 @@ struct stab_entry {
 	unsigned long vsid_data;
 };
 
-/* Both the segment table and SLB code uses the following cache */
 #define NR_STAB_CACHE_ENTRIES 8
 DEFINE_PER_CPU(long, stab_cache_ptr);
 DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]);
@@ -186,7 +185,7 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
 		/* Never flush the first entry. */
 		ste += 1;
 		for (entry = 1;
-		     entry < (PAGE_SIZE / sizeof(struct stab_entry));
+		     entry < (HW_PAGE_SIZE / sizeof(struct stab_entry));
 		     entry++, ste++) {
 			unsigned long ea;
 			ea = ste->esid_data & ESID_MASK;
@@ -200,6 +199,10 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
 
 	__get_cpu_var(stab_cache_ptr) = 0;
 
+#ifdef CONFIG_PPC_64K_PAGES
+	get_paca()->pgdir = mm->pgd;
+#endif /* CONFIG_PPC_64K_PAGES */
+
 	/* Now preload some entries for the new task */
 	if (test_tsk_thread_flag(tsk, TIF_32BIT))
 		unmapped_base = TASK_UNMAPPED_BASE_USER32;
@@ -223,8 +226,6 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
 	asm volatile("sync" : : : "memory");
 }
 
-extern void slb_initialize(void);
-
 /*
  * Allocate segment tables for secondary CPUs.  These must all go in
  * the first (bolted) segment, so that do_stab_bolted won't get a
@@ -243,18 +244,21 @@ void stabs_alloc(void)
 		if (cpu == 0)
 			continue; /* stab for CPU 0 is statically allocated */
 
-		newstab = lmb_alloc_base(PAGE_SIZE, PAGE_SIZE, 1<<SID_SHIFT);
+		newstab = lmb_alloc_base(HW_PAGE_SIZE, HW_PAGE_SIZE,
+					 1<<SID_SHIFT);
 		if (! newstab)
 			panic("Unable to allocate segment table for CPU %d.\n",
 			      cpu);
 
 		newstab += KERNELBASE;
 
-		memset((void *)newstab, 0, PAGE_SIZE);
+		memset((void *)newstab, 0, HW_PAGE_SIZE);
 
 		paca[cpu].stab_addr = newstab;
 		paca[cpu].stab_real = virt_to_abs(newstab);
-		printk(KERN_DEBUG "Segment table for CPU %d at 0x%lx virtual, 0x%lx absolute\n", cpu, paca[cpu].stab_addr, paca[cpu].stab_real);
+		printk(KERN_DEBUG "Segment table for CPU %d at 0x%lx "
+		       "virtual, 0x%lx absolute\n",
+		       cpu, paca[cpu].stab_addr, paca[cpu].stab_real);
 	}
 }
 
@@ -267,13 +271,9 @@ void stab_initialize(unsigned long stab)
 {
 	unsigned long vsid = get_kernel_vsid(KERNELBASE);
 
-	if (cpu_has_feature(CPU_FTR_SLB)) {
-		slb_initialize();
-	} else {
-		asm volatile("isync; slbia; isync":::"memory");
-		make_ste(stab, GET_ESID(KERNELBASE), vsid);
+	asm volatile("isync; slbia; isync":::"memory");
+	make_ste(stab, GET_ESID(KERNELBASE), vsid);
 
-		/* Order update */
-		asm volatile("sync":::"memory");
-	}
+	/* Order update */
+	asm volatile("sync":::"memory");
 }
diff --git a/arch/powerpc/mm/tlb_64.c b/arch/powerpc/mm/tlb_64.c
index 09ab81a10f4f05100030d88a03760b9ed5b9b9bf..53e31b834ace00f75dbc686321d9ddbeeab8eff7 100644
--- a/arch/powerpc/mm/tlb_64.c
+++ b/arch/powerpc/mm/tlb_64.c
@@ -21,6 +21,7 @@
  *  as published by the Free Software Foundation; either version
  *  2 of the License, or (at your option) any later version.
  */
+
 #include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
@@ -30,7 +31,7 @@
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
-#include <linux/highmem.h>
+#include <asm/bug.h>
 
 DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
 
@@ -126,28 +127,46 @@ void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
  * (if we remove it we should clear the _PTE_HPTEFLAGS bits).
  */
 void hpte_update(struct mm_struct *mm, unsigned long addr,
-		 unsigned long pte, int wrprot)
+		 pte_t *ptep, unsigned long pte, int huge)
 {
 	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
 	unsigned long vsid;
+	unsigned int psize = mmu_virtual_psize;
 	int i;
 
 	i = batch->index;
 
+	/* We mask the address for the base page size. Huge pages will
+	 * have applied their own masking already
+	 */
+	addr &= PAGE_MASK;
+
+	/* Get page size (maybe move back to caller) */
+	if (huge) {
+#ifdef CONFIG_HUGETLB_PAGE
+		psize = mmu_huge_psize;
+#else
+		BUG();
+#endif
+	}
+
 	/*
 	 * This can happen when we are in the middle of a TLB batch and
 	 * we encounter memory pressure (eg copy_page_range when it tries
 	 * to allocate a new pte). If we have to reclaim memory and end
 	 * up scanning and resetting referenced bits then our batch context
 	 * will change mid stream.
+	 *
+	 * We also need to ensure only one page size is present in a given
+	 * batch
 	 */
-	if (i != 0 && (mm != batch->mm || batch->large != pte_huge(pte))) {
+	if (i != 0 && (mm != batch->mm || batch->psize != psize)) {
 		flush_tlb_pending();
 		i = 0;
 	}
 	if (i == 0) {
 		batch->mm = mm;
-		batch->large = pte_huge(pte);
+		batch->psize = psize;
 	}
 	if (addr < KERNELBASE) {
 		vsid = get_vsid(mm->context.id, addr);
@@ -155,7 +174,7 @@ void hpte_update(struct mm_struct *mm, unsigned long addr,
 	} else
 		vsid = get_kernel_vsid(addr);
 	batch->vaddr[i] = (vsid << 28 ) | (addr & 0x0fffffff);
-	batch->pte[i] = __pte(pte);
+	batch->pte[i] = __real_pte(__pte(pte), ptep);
 	batch->index = ++i;
 	if (i >= PPC64_TLB_BATCH_NR)
 		flush_tlb_pending();
@@ -177,7 +196,8 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
 		local = 1;
 
 	if (i == 1)
-		flush_hash_page(batch->vaddr[0], batch->pte[0], local);
+		flush_hash_page(batch->vaddr[0], batch->pte[0],
+				batch->psize, local);
 	else
 		flush_hash_range(i, local);
 	batch->index = 0;
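
The hpte_update() change enforces one invariant worth spelling out: a
ppc64_tlb_batch holds entries from a single mm and a single page size,
because flush_hash_range() encodes one psize for the whole run of
tlbie/tlbiel instructions. A reduced model of the queueing decision
(the PPC64_TLB_BATCH_NR value is an assumption):

	#define PPC64_TLB_BATCH_NR 192	/* assumed batch depth */

	struct tlb_batch {
		const void *mm;		/* owning mm_struct */
		int psize;		/* the one page size of this batch */
		unsigned long vaddr[PPC64_TLB_BATCH_NR];
		unsigned long index;
	};

	typedef void (*flush_fn)(struct tlb_batch *);

	/* Queue one VA, draining first whenever the batch would stop
	 * being homogeneous -- the shape of the check in hpte_update().
	 */
	static void batch_add(struct tlb_batch *b, const void *mm,
			      int psize, unsigned long va, flush_fn flush)
	{
		if (b->index != 0 && (b->mm != mm || b->psize != psize)) {
			flush(b);	/* would mix sizes or mms: drain */
			b->index = 0;
		}
		if (b->index == 0) {
			b->mm = mm;
			b->psize = psize;
		}
		b->vaddr[b->index++] = va;
		if (b->index >= PPC64_TLB_BATCH_NR) {
			flush(b);	/* full: drain */
			b->index = 0;
		}
	}
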
diff --git a/arch/powerpc/platforms/iseries/htab.c b/arch/powerpc/platforms/iseries/htab.c
index b3c6c3374ca608b42f3098fe42efc3e2cedd5399..30bdcf3925d9b1727cc37a82c48582c06eff6f30 100644
--- a/arch/powerpc/platforms/iseries/htab.c
+++ b/arch/powerpc/platforms/iseries/htab.c
@@ -39,15 +39,16 @@ static inline void iSeries_hunlock(unsigned long slot)
 	spin_unlock(&iSeries_hlocks[(slot >> 4) & 0x3f]);
 }
 
-static long iSeries_hpte_insert(unsigned long hpte_group, unsigned long va,
-				unsigned long prpn, unsigned long vflags,
-				unsigned long rflags)
+long iSeries_hpte_insert(unsigned long hpte_group, unsigned long va,
+			 unsigned long pa, unsigned long rflags,
+			 unsigned long vflags, int psize)
 {
-	unsigned long arpn;
 	long slot;
 	hpte_t lhpte;
 	int secondary = 0;
 
+	BUG_ON(psize != MMU_PAGE_4K);
+
 	/*
 	 * The hypervisor tries both primary and secondary.
 	 * If we are being called to insert in the secondary,
@@ -59,8 +60,19 @@ static long iSeries_hpte_insert(unsigned long hpte_group, unsigned long va,
 
 	iSeries_hlock(hpte_group);
 
-	slot = HvCallHpt_findValid(&lhpte, va >> PAGE_SHIFT);
-	BUG_ON(lhpte.v & HPTE_V_VALID);
+	slot = HvCallHpt_findValid(&lhpte, va >> HW_PAGE_SHIFT);
+	if (unlikely(lhpte.v & HPTE_V_VALID)) {
+		if (vflags & HPTE_V_BOLTED) {
+			HvCallHpt_setSwBits(slot, 0x10, 0);
+			HvCallHpt_setPp(slot, PP_RWXX);
+			iSeries_hunlock(hpte_group);
+			if (slot < 0)
+				return 0x8 | (slot & 7);
+			else
+				return slot & 7;
+		}
+		BUG();
+	}
 
 	if (slot == -1)	{ /* No available entry found in either group */
 		iSeries_hunlock(hpte_group);
@@ -73,10 +85,9 @@ static long iSeries_hpte_insert(unsigned long hpte_group, unsigned long va,
 		slot &= 0x7fffffffffffffff;
 	}
 
-	arpn = phys_to_abs(prpn << PAGE_SHIFT) >> PAGE_SHIFT;
 
-	lhpte.v = (va >> 23) << HPTE_V_AVPN_SHIFT | vflags | HPTE_V_VALID;
-	lhpte.r = (arpn << HPTE_R_RPN_SHIFT) | rflags;
+	lhpte.v = hpte_encode_v(va, MMU_PAGE_4K) | vflags | HPTE_V_VALID;
+	lhpte.r = hpte_encode_r(phys_to_abs(pa), MMU_PAGE_4K) | rflags;
 
 	/* Now fill in the actual HPTE */
 	HvCallHpt_addValidate(slot, secondary, &lhpte);
@@ -86,25 +97,6 @@ static long iSeries_hpte_insert(unsigned long hpte_group, unsigned long va,
 	return (secondary << 3) | (slot & 7);
 }
 
-long iSeries_hpte_bolt_or_insert(unsigned long hpte_group,
-		unsigned long va, unsigned long prpn, unsigned long vflags,
-		unsigned long rflags)
-{
-	long slot;
-	hpte_t lhpte;
-
-	slot = HvCallHpt_findValid(&lhpte, va >> PAGE_SHIFT);
-
-	if (lhpte.v & HPTE_V_VALID) {
-		/* Bolt the existing HPTE */
-		HvCallHpt_setSwBits(slot, 0x10, 0);
-		HvCallHpt_setPp(slot, PP_RWXX);
-		return 0;
-	}
-
-	return iSeries_hpte_insert(hpte_group, va, prpn, vflags, rflags);
-}
-
 static unsigned long iSeries_hpte_getword0(unsigned long slot)
 {
 	hpte_t hpte;
@@ -150,15 +142,17 @@ static long iSeries_hpte_remove(unsigned long hpte_group)
  *	bits 61..63 : PP2,PP1,PP0
  */
 static long iSeries_hpte_updatepp(unsigned long slot, unsigned long newpp,
-				  unsigned long va, int large, int local)
+				  unsigned long va, int psize, int local)
 {
 	hpte_t hpte;
-	unsigned long avpn = va >> 23;
+	unsigned long want_v;
 
 	iSeries_hlock(slot);
 
 	HvCallHpt_get(&hpte, slot);
-	if ((HPTE_V_AVPN_VAL(hpte.v) == avpn) && (hpte.v & HPTE_V_VALID)) {
+	want_v = hpte_encode_v(va, MMU_PAGE_4K);
+
+	if (HPTE_V_COMPARE(hpte.v, want_v) && (hpte.v & HPTE_V_VALID)) {
 		/*
 		 * Hypervisor expects bits as NPPP, which is
 		 * different from how they are mapped in our PP.
@@ -210,14 +204,17 @@ static long iSeries_hpte_find(unsigned long vpn)
  *
  * No need to lock here because we should be the only user.
  */
-static void iSeries_hpte_updateboltedpp(unsigned long newpp, unsigned long ea)
+static void iSeries_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
+					int psize)
 {
 	unsigned long vsid,va,vpn;
 	long slot;
 
+	BUG_ON(psize != MMU_PAGE_4K);
+
 	vsid = get_kernel_vsid(ea);
 	va = (vsid << 28) | (ea & 0x0fffffff);
-	vpn = va >> PAGE_SHIFT;
+	vpn = va >> HW_PAGE_SHIFT;
 	slot = iSeries_hpte_find(vpn);
 	if (slot == -1)
 		panic("updateboltedpp: Could not find page to bolt\n");
@@ -225,7 +222,7 @@ static void iSeries_hpte_updateboltedpp(unsigned long newpp, unsigned long ea)
 }
 
 static void iSeries_hpte_invalidate(unsigned long slot, unsigned long va,
-				    int large, int local)
+				    int psize, int local)
 {
 	unsigned long hpte_v;
 	unsigned long avpn = va >> 23;
diff --git a/arch/powerpc/platforms/iseries/hvlog.c b/arch/powerpc/platforms/iseries/hvlog.c
index 62ec73479687ee6b4b2a2f677e3fa7b96b924156..f476d71194fac1c0bebc9f5bcb8861ff71731fde 100644
--- a/arch/powerpc/platforms/iseries/hvlog.c
+++ b/arch/powerpc/platforms/iseries/hvlog.c
@@ -22,7 +22,7 @@ void HvCall_writeLogBuffer(const void *buffer, u64 len)
 
 	while (len) {
 		hv_buf.addr = cur;
-		left_this_page = ((cur & PAGE_MASK) + PAGE_SIZE) - cur;
+		left_this_page = ((cur & HW_PAGE_MASK) + HW_PAGE_SIZE) - cur;
 		if (left_this_page > len)
 			left_this_page = len;
 		hv_buf.len = left_this_page;
@@ -30,6 +30,6 @@ void HvCall_writeLogBuffer(const void *buffer, u64 len)
 		HvCall2(HvCallBaseWriteLogBuffer,
 				virt_to_abs(&hv_buf),
 				left_this_page);
-		cur = (cur & PAGE_MASK) + PAGE_SIZE;
+		cur = (cur & HW_PAGE_MASK) + HW_PAGE_SIZE;
 	}
 }
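
The hypervisor log interface deals strictly in 4k hardware pages whatever
the kernel's logical PAGE_SIZE, hence the switch to HW_PAGE_* above. A
standalone sketch of the chunking arithmetic used in that loop:

#include <stdio.h>

#define HW_PAGE_SIZE	4096ul
#define HW_PAGE_MASK	(~(HW_PAGE_SIZE - 1))

int main(void)
{
	unsigned long cur = 0x1f80, len = 500;	/* crosses a 4k boundary */

	while (len) {
		/* bytes left before the next hardware page boundary */
		unsigned long chunk = ((cur & HW_PAGE_MASK) + HW_PAGE_SIZE) - cur;

		if (chunk > len)
			chunk = len;
		printf("write %lu bytes at %#lx\n", chunk, cur);
		len -= chunk;
		cur = (cur & HW_PAGE_MASK) + HW_PAGE_SIZE;
	}
	return 0;
}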
diff --git a/arch/powerpc/platforms/iseries/iommu.c b/arch/powerpc/platforms/iseries/iommu.c
index 1a6845b5c5a415c14550b35994cbc630b2a691db..bf081b3458201deef96b0b9bd59216cd666ee985 100644
--- a/arch/powerpc/platforms/iseries/iommu.c
+++ b/arch/powerpc/platforms/iseries/iommu.c
@@ -43,9 +43,12 @@ static void tce_build_iSeries(struct iommu_table *tbl, long index, long npages,
 	u64 rc;
 	union tce_entry tce;
 
+	index <<= TCE_PAGE_FACTOR;
+	npages <<= TCE_PAGE_FACTOR;
+
 	while (npages--) {
 		tce.te_word = 0;
-		tce.te_bits.tb_rpn = virt_to_abs(uaddr) >> PAGE_SHIFT;
+		tce.te_bits.tb_rpn = virt_to_abs(uaddr) >> TCE_SHIFT;
 
 		if (tbl->it_type == TCE_VB) {
 			/* Virtual Bus */
@@ -66,7 +69,7 @@ static void tce_build_iSeries(struct iommu_table *tbl, long index, long npages,
 			panic("PCI_DMA: HvCallXm_setTce failed, Rc: 0x%lx\n",
 					rc);
 		index++;
-		uaddr += PAGE_SIZE;
+		uaddr += TCE_PAGE_SIZE;
 	}
 }
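
With 64k logical pages, each IOMMU page covers several hardware TCEs,
which is what the TCE_PAGE_FACTOR shifts above express. A hedged sketch
of that scaling; TCE_PAGE_FACTOR is assumed here to be
PAGE_SHIFT - TCE_SHIFT (0 on a 4k kernel, 4 on a 64k kernel):

#include <stdio.h>

#define TCE_SHIFT	12			/* hardware TCEs map 4k */
#define PAGE_SHIFT	16			/* example: 64k logical pages */
#define TCE_PAGE_FACTOR	(PAGE_SHIFT - TCE_SHIFT)

int main(void)
{
	long index = 3, npages = 2;		/* in logical (64k) pages */

	/* one logical page needs 2^TCE_PAGE_FACTOR hardware TCEs */
	index <<= TCE_PAGE_FACTOR;
	npages <<= TCE_PAGE_FACTOR;
	printf("start at TCE %ld, program %ld 4k entries\n", index, npages);
	return 0;
}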
 
@@ -74,6 +77,9 @@ static void tce_free_iSeries(struct iommu_table *tbl, long index, long npages)
 {
 	u64 rc;
 
+	npages <<= TCE_PAGE_FACTOR;
+	index <<= TCE_PAGE_FACTOR;
+
 	while (npages--) {
 		rc = HvCallXm_setTce((u64)tbl->it_index, (u64)index, 0);
 		if (rc)
@@ -83,27 +89,6 @@ static void tce_free_iSeries(struct iommu_table *tbl, long index, long npages)
 	}
 }
 
-#ifdef CONFIG_PCI
-/*
- * This function compares the known tables to find an iommu_table
- * that has already been built for hardware TCEs.
- */
-static struct iommu_table *iommu_table_find(struct iommu_table * tbl)
-{
-	struct pci_dn *pdn;
-
-	list_for_each_entry(pdn, &iSeries_Global_Device_List, Device_List) {
-		struct iommu_table *it = pdn->iommu_table;
-		if ((it != NULL) &&
-		    (it->it_type == TCE_PCI) &&
-		    (it->it_offset == tbl->it_offset) &&
-		    (it->it_index == tbl->it_index) &&
-		    (it->it_size == tbl->it_size))
-			return it;
-	}
-	return NULL;
-}
-
 /*
  * Call Hv with the architected data structure to get TCE table info.
  * info. Put the returned data into the Linux representation of the
@@ -113,8 +98,10 @@ static struct iommu_table *iommu_table_find(struct iommu_table * tbl)
  * 2. TCE table per Bus.
  * 3. TCE Table per IOA.
  */
-static void iommu_table_getparms(struct pci_dn *pdn,
-				 struct iommu_table* tbl)
+void iommu_table_getparms_iSeries(unsigned long busno,
+				  unsigned char slotno,
+				  unsigned char virtbus,
+				  struct iommu_table* tbl)
 {
 	struct iommu_table_cb *parms;
 
@@ -124,9 +111,9 @@ static void iommu_table_getparms(struct pci_dn *pdn,
 
 	memset(parms, 0, sizeof(*parms));
 
-	parms->itc_busno = pdn->busno;
-	parms->itc_slotno = pdn->LogicalSlot;
-	parms->itc_virtbus = 0;
+	parms->itc_busno = busno;
+	parms->itc_slotno = slotno;
+	parms->itc_virtbus = virtbus;
 
 	HvCallXm_getTceTableParms(iseries_hv_addr(parms));
 
@@ -134,17 +121,40 @@ static void iommu_table_getparms(struct pci_dn *pdn,
 		panic("PCI_DMA: parms->size is zero, parms is 0x%p", parms);
 
 	/* itc_size is in pages worth of table, it_size is in # of entries */
-	tbl->it_size = (parms->itc_size * PAGE_SIZE) / sizeof(union tce_entry);
+	tbl->it_size = ((parms->itc_size * TCE_PAGE_SIZE) /
+			sizeof(union tce_entry)) >> TCE_PAGE_FACTOR;
 	tbl->it_busno = parms->itc_busno;
-	tbl->it_offset = parms->itc_offset;
+	tbl->it_offset = parms->itc_offset >> TCE_PAGE_FACTOR;
 	tbl->it_index = parms->itc_index;
 	tbl->it_blocksize = 1;
-	tbl->it_type = TCE_PCI;
+	tbl->it_type = virtbus ? TCE_VB : TCE_PCI;
 
 	kfree(parms);
 }
 
 
+#ifdef CONFIG_PCI
+/*
+ * This function compares the known tables to find an iommu_table
+ * that has already been built for hardware TCEs.
+ */
+static struct iommu_table *iommu_table_find(struct iommu_table * tbl)
+{
+	struct pci_dn *pdn;
+
+	list_for_each_entry(pdn, &iSeries_Global_Device_List, Device_List) {
+		struct iommu_table *it = pdn->iommu_table;
+		if ((it != NULL) &&
+		    (it->it_type == TCE_PCI) &&
+		    (it->it_offset == tbl->it_offset) &&
+		    (it->it_index == tbl->it_index) &&
+		    (it->it_size == tbl->it_size))
+			return it;
+	}
+	return NULL;
+}
+
 void iommu_devnode_init_iSeries(struct device_node *dn)
 {
 	struct iommu_table *tbl;
@@ -152,7 +162,7 @@ void iommu_devnode_init_iSeries(struct device_node *dn)
 
 	tbl = kmalloc(sizeof(struct iommu_table), GFP_KERNEL);
 
-	iommu_table_getparms(pdn, tbl);
+	iommu_table_getparms_iSeries(pdn->busno, pdn->LogicalSlot, 0, tbl);
 
 	/* Look for existing tce table */
 	pdn->iommu_table = iommu_table_find(tbl);
diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c
index fda712b421687f2b910dbc99c47de85644449057..c5207064977db1ac3ae901b6f86762e1a1393fc6 100644
--- a/arch/powerpc/platforms/iseries/setup.c
+++ b/arch/powerpc/platforms/iseries/setup.c
@@ -320,11 +320,11 @@ static void __init iSeries_init_early(void)
 	 */
 	if (naca.xRamDisk) {
 		initrd_start = (unsigned long)__va(naca.xRamDisk);
-		initrd_end = initrd_start + naca.xRamDiskSize * PAGE_SIZE;
+		initrd_end = initrd_start + naca.xRamDiskSize * HW_PAGE_SIZE;
 		initrd_below_start_ok = 1;	// ramdisk in kernel space
 		ROOT_DEV = Root_RAM0;
-		if (((rd_size * 1024) / PAGE_SIZE) < naca.xRamDiskSize)
-			rd_size = (naca.xRamDiskSize * PAGE_SIZE) / 1024;
+		if (((rd_size * 1024) / HW_PAGE_SIZE) < naca.xRamDiskSize)
+			rd_size = (naca.xRamDiskSize * HW_PAGE_SIZE) / 1024;
 	} else
 #endif /* CONFIG_BLK_DEV_INITRD */
 	{
@@ -470,13 +470,14 @@ static void __init build_iSeries_Memory_Map(void)
 	 */
 	hptFirstChunk = (u32)addr_to_chunk(HvCallHpt_getHptAddress());
 	hptSizePages = (u32)HvCallHpt_getHptPages();
-	hptSizeChunks = hptSizePages >> (MSCHUNKS_CHUNK_SHIFT - PAGE_SHIFT);
+	hptSizeChunks = hptSizePages >>
+		(MSCHUNKS_CHUNK_SHIFT - HW_PAGE_SHIFT);
 	hptLastChunk = hptFirstChunk + hptSizeChunks - 1;
 
 	printk("HPT absolute addr = %016lx, size = %dK\n",
 			chunk_to_addr(hptFirstChunk), hptSizeChunks * 256);
 
-	ppc64_pft_size = __ilog2(hptSizePages * PAGE_SIZE);
+	ppc64_pft_size = __ilog2(hptSizePages * HW_PAGE_SIZE);
 
 	/*
 	 * The actual hashed page table is in the hypervisor,
@@ -629,7 +630,7 @@ static void __init iSeries_fixup_klimit(void)
 	 */
 	if (naca.xRamDisk)
 		klimit = KERNELBASE + (u64)naca.xRamDisk +
-			(naca.xRamDiskSize * PAGE_SIZE);
+			(naca.xRamDiskSize * HW_PAGE_SIZE);
 	else {
 		/*
 		 * No ram disk was included - check and see if there
diff --git a/arch/powerpc/platforms/iseries/vio.c b/arch/powerpc/platforms/iseries/vio.c
index c27a66876c2cd042bf32c7f0fff36d652ec63a1f..384360ee06ec6647510f059db6c2da3a1ba121f2 100644
--- a/arch/powerpc/platforms/iseries/vio.c
+++ b/arch/powerpc/platforms/iseries/vio.c
@@ -30,41 +30,14 @@ static struct iommu_table vio_iommu_table;
 
 static void __init iommu_vio_init(void)
 {
-	struct iommu_table *t;
-	struct iommu_table_cb cb;
-	unsigned long cbp;
-	unsigned long itc_entries;
+	iommu_table_getparms_iSeries(255, 0, 0xff, &veth_iommu_table);
+	veth_iommu_table.it_size /= 2;
+	vio_iommu_table = veth_iommu_table;
+	vio_iommu_table.it_offset += veth_iommu_table.it_size;
 
-	cb.itc_busno = 255;    /* Bus 255 is the virtual bus */
-	cb.itc_virtbus = 0xff; /* Ask for virtual bus */
-
-	cbp = virt_to_abs(&cb);
-	HvCallXm_getTceTableParms(cbp);
-
-	itc_entries = cb.itc_size * PAGE_SIZE / sizeof(union tce_entry);
-	veth_iommu_table.it_size        = itc_entries / 2;
-	veth_iommu_table.it_busno       = cb.itc_busno;
-	veth_iommu_table.it_offset      = cb.itc_offset;
-	veth_iommu_table.it_index       = cb.itc_index;
-	veth_iommu_table.it_type        = TCE_VB;
-	veth_iommu_table.it_blocksize	= 1;
-
-	t = iommu_init_table(&veth_iommu_table);
-
-	if (!t)
+	if (!iommu_init_table(&veth_iommu_table))
 		printk("Virtual Bus VETH TCE table failed.\n");
-
-	vio_iommu_table.it_size         = itc_entries - veth_iommu_table.it_size;
-	vio_iommu_table.it_busno        = cb.itc_busno;
-	vio_iommu_table.it_offset       = cb.itc_offset +
-					  veth_iommu_table.it_size;
-	vio_iommu_table.it_index        = cb.itc_index;
-	vio_iommu_table.it_type         = TCE_VB;
-	vio_iommu_table.it_blocksize	= 1;
-
-	t = iommu_init_table(&vio_iommu_table);
-
-	if (!t)
+	if (!iommu_init_table(&vio_iommu_table))
 		printk("Virtual Bus VIO TCE table failed.\n");
 }
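
The rewrite above asks the hypervisor once for the virtual-bus table and
splits it in half between veth and vio: it_size is halved and the second
table starts where the first one ends. A standalone sketch of the split,
with a made-up table size:

#include <stdio.h>

struct table_model {
	unsigned long it_size, it_offset;
};

int main(void)
{
	/* pretend the hypervisor reported 1024 entries at offset 0 */
	struct table_model veth = { 1024, 0 }, vio;

	veth.it_size /= 2;		/* first half for veth */
	vio = veth;
	vio.it_offset += veth.it_size;	/* second half for vio */
	printf("veth: %lu entries at %lu, vio: %lu entries at %lu\n",
	       veth.it_size, veth.it_offset, vio.it_size, vio.it_offset);
	return 0;
}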
 
diff --git a/arch/powerpc/platforms/iseries/viopath.c b/arch/powerpc/platforms/iseries/viopath.c
index fe97bfbf7463fad9231b4dd411c62b0066a84540..842672695598e9845f8698e4fe827ec1c20234d8 100644
--- a/arch/powerpc/platforms/iseries/viopath.c
+++ b/arch/powerpc/platforms/iseries/viopath.c
@@ -68,7 +68,8 @@ static DEFINE_SPINLOCK(statuslock);
  * For each kind of event we allocate a buffer that is
  * guaranteed not to cross a page boundary
  */
-static unsigned char event_buffer[VIO_MAX_SUBTYPES * 256] __page_aligned;
+static unsigned char event_buffer[VIO_MAX_SUBTYPES * 256]
+	__attribute__((__aligned__(4096)));
 static atomic_t event_buffer_available[VIO_MAX_SUBTYPES];
 static int event_buffer_initialised;
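
Spelling out the 4096-byte alignment keeps the no-page-crossing guarantee
once the kernel's own PAGE_SIZE may be 64k: 4096 is a multiple of the
256-byte sub-buffer size, so no sub-buffer can straddle a hardware page.
A quick standalone check of that invariant (the VIO_MAX_SUBTYPES value
below is made up):

#include <assert.h>
#include <stdint.h>

#define HW_PAGE_SIZE	4096u
#define SUBBUF_SIZE	256u
#define MAX_SUBTYPES	16		/* hypothetical VIO_MAX_SUBTYPES */

int main(void)
{
	/* model the aligned array by a page-aligned base address */
	uintptr_t base = 0x10000;
	int i;

	for (i = 0; i < MAX_SUBTYPES; i++) {
		uintptr_t start = base + i * SUBBUF_SIZE;
		uintptr_t end = start + SUBBUF_SIZE - 1;

		/* start and end land on the same hardware page */
		assert((start & ~(uintptr_t)(HW_PAGE_SIZE - 1)) ==
		       (end & ~(uintptr_t)(HW_PAGE_SIZE - 1)));
	}
	return 0;
}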
 
@@ -116,12 +117,12 @@ static int proc_viopath_show(struct seq_file *m, void *v)
 	HvLpEvent_Rc hvrc;
 	DECLARE_MUTEX_LOCKED(Semaphore);
 
-	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	buf = kmalloc(HW_PAGE_SIZE, GFP_KERNEL);
 	if (!buf)
 		return 0;
-	memset(buf, 0, PAGE_SIZE);
+	memset(buf, 0, HW_PAGE_SIZE);
 
-	handle = dma_map_single(iSeries_vio_dev, buf, PAGE_SIZE,
+	handle = dma_map_single(iSeries_vio_dev, buf, HW_PAGE_SIZE,
 				DMA_FROM_DEVICE);
 
 	hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
@@ -131,7 +132,7 @@ static int proc_viopath_show(struct seq_file *m, void *v)
 			viopath_sourceinst(viopath_hostLp),
 			viopath_targetinst(viopath_hostLp),
 			(u64)(unsigned long)&Semaphore, VIOVERSION << 16,
-			((u64)handle) << 32, PAGE_SIZE, 0, 0);
+			((u64)handle) << 32, HW_PAGE_SIZE, 0, 0);
 
 	if (hvrc != HvLpEvent_Rc_Good)
 		printk(VIOPATH_KERN_WARN "hv error on op %d\n", (int)hvrc);
@@ -140,7 +141,7 @@ static int proc_viopath_show(struct seq_file *m, void *v)
 
 	vlanMap = HvLpConfig_getVirtualLanIndexMap();
 
-	buf[PAGE_SIZE-1] = '\0';
+	buf[HW_PAGE_SIZE-1] = '\0';
 	seq_printf(m, "%s", buf);
 	seq_printf(m, "AVAILABLE_VETH=%x\n", vlanMap);
 	seq_printf(m, "SRLNBR=%c%c%c%c%c%c%c\n",
@@ -152,7 +153,8 @@ static int proc_viopath_show(struct seq_file *m, void *v)
 		   e2a(xItExtVpdPanel.systemSerial[4]),
 		   e2a(xItExtVpdPanel.systemSerial[5]));
 
-	dma_unmap_single(iSeries_vio_dev, handle, PAGE_SIZE, DMA_FROM_DEVICE);
+	dma_unmap_single(iSeries_vio_dev, handle, HW_PAGE_SIZE,
+			 DMA_FROM_DEVICE);
 	kfree(buf);
 
 	return 0;
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index e384a5a9179607b2af53873fb596122bfe89789b..ab0c6dd6ec94538beaab0c12e3b9254d224222fe 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -19,7 +19,7 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  */
 
-#define DEBUG
+#undef DEBUG_LOW
 
 #include <linux/config.h>
 #include <linux/kernel.h>
@@ -41,10 +41,10 @@
 
 #include "plpar_wrappers.h"
 
-#ifdef DEBUG
-#define DBG(fmt...) udbg_printf(fmt)
+#ifdef DEBUG_LOW
+#define DBG_LOW(fmt...) do { udbg_printf(fmt); } while(0)
 #else
-#define DBG(fmt...)
+#define DBG_LOW(fmt...) do { } while(0)
 #endif
 
 /* in pSeries_hvCall.S */
@@ -276,8 +276,9 @@ void vpa_init(int cpu)
 }
 
 long pSeries_lpar_hpte_insert(unsigned long hpte_group,
-			      unsigned long va, unsigned long prpn,
-			      unsigned long vflags, unsigned long rflags)
+			      unsigned long va, unsigned long pa,
+			      unsigned long rflags, unsigned long vflags,
+			      int psize)
 {
 	unsigned long lpar_rc;
 	unsigned long flags;
@@ -285,11 +286,28 @@ long pSeries_lpar_hpte_insert(unsigned long hpte_group,
 	unsigned long hpte_v, hpte_r;
 	unsigned long dummy0, dummy1;
 
-	hpte_v = ((va >> 23) << HPTE_V_AVPN_SHIFT) | vflags | HPTE_V_VALID;
-	if (vflags & HPTE_V_LARGE)
-		hpte_v &= ~(1UL << HPTE_V_AVPN_SHIFT);
-
-	hpte_r = (prpn << HPTE_R_RPN_SHIFT) | rflags;
+	if (!(vflags & HPTE_V_BOLTED))
+		DBG_LOW("hpte_insert(group=%lx, va=%016lx, pa=%016lx, "
+			"rflags=%lx, vflags=%lx, psize=%d)\n",
+			hpte_group, va, pa, rflags, vflags, psize);
+
+	hpte_v = hpte_encode_v(va, psize) | vflags | HPTE_V_VALID;
+	hpte_r = hpte_encode_r(pa, psize) | rflags;
+
+	if (!(vflags & HPTE_V_BOLTED))
+		DBG_LOW(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
+
+#if 1
+	{
+		int i;
+		for (i = 0; i < 8; i++) {
+			unsigned long w0, w1;
+			plpar_pte_read(0, hpte_group + i, &w0, &w1);
+			BUG_ON(HPTE_V_COMPARE(hpte_v, w0)
+				&& (w0 & HPTE_V_VALID));
+		}
+	}
+#endif
 
 	/* Now fill in the actual HPTE */
 	/* Set CEC cookie to 0         */
@@ -299,23 +317,30 @@ long pSeries_lpar_hpte_insert(unsigned long hpte_group,
 	/* Exact = 0                   */
 	flags = 0;
 
-	/* XXX why is this here? - Anton */
+	/* Make pHyp happy */
 	if (rflags & (_PAGE_GUARDED|_PAGE_NO_CACHE))
 		hpte_r &= ~_PAGE_COHERENT;
 
 	lpar_rc = plpar_hcall(H_ENTER, flags, hpte_group, hpte_v,
 			      hpte_r, &slot, &dummy0, &dummy1);
-
-	if (unlikely(lpar_rc == H_PTEG_Full))
+	if (unlikely(lpar_rc == H_PTEG_Full)) {
+		if (!(vflags & HPTE_V_BOLTED))
+			DBG_LOW(" full\n");
 		return -1;
+	}
 
 	/*
 	 * Since we try and ioremap PHBs we don't own, the pte insert
 	 * will fail. However we must catch the failure in hash_page
 	 * or we will loop forever, so return -2 in this case.
 	 */
-	if (unlikely(lpar_rc != H_Success))
+	if (unlikely(lpar_rc != H_Success)) {
+		if (!(vflags & HPTE_V_BOLTED))
+			DBG_LOW(" lpar err %ld\n", lpar_rc);
 		return -2;
+	}
+	if (!(vflags & HPTE_V_BOLTED))
+		DBG_LOW(" -> slot: %ld\n", slot & 7);
 
 	/* Because of iSeries, we have to pass down the secondary
 	 * bucket bit here as well
@@ -340,10 +365,8 @@ static long pSeries_lpar_hpte_remove(unsigned long hpte_group)
 		/* don't remove a bolted entry */
 		lpar_rc = plpar_pte_remove(H_ANDCOND, hpte_group + slot_offset,
 					   (0x1UL << 4), &dummy1, &dummy2);
-
 		if (lpar_rc == H_Success)
 			return i;
-
 		BUG_ON(lpar_rc != H_Not_Found);
 
 		slot_offset++;
@@ -371,20 +394,28 @@ static void pSeries_lpar_hptab_clear(void)
  * We can probably optimize here and assume the high bits of newpp are
  * already zero.  For now I am paranoid.
  */
-static long pSeries_lpar_hpte_updatepp(unsigned long slot, unsigned long newpp,
-				       unsigned long va, int large, int local)
+static long pSeries_lpar_hpte_updatepp(unsigned long slot,
+				       unsigned long newpp,
+				       unsigned long va,
+				       int psize, int local)
 {
 	unsigned long lpar_rc;
 	unsigned long flags = (newpp & 7) | H_AVPN;
-	unsigned long avpn = va >> 23;
+	unsigned long want_v;
 
-	if (large)
-		avpn &= ~0x1UL;
+	want_v = hpte_encode_v(va, psize);
 
-	lpar_rc = plpar_pte_protect(flags, slot, (avpn << 7));
+	DBG_LOW("    update: avpnv=%016lx, hash=%016lx, f=%lx, psize: %d ... ",
+		want_v & HPTE_V_AVPN, slot, flags, psize);
 
-	if (lpar_rc == H_Not_Found)
+	lpar_rc = plpar_pte_protect(flags, slot, want_v & HPTE_V_AVPN);
+
+	if (lpar_rc == H_Not_Found) {
+		DBG_LOW("not found!\n");
 		return -1;
+	}
+
+	DBG_LOW("ok\n");
 
 	BUG_ON(lpar_rc != H_Success);
 
@@ -410,21 +441,22 @@ static unsigned long pSeries_lpar_hpte_getword0(unsigned long slot)
 	return dword0;
 }
 
-static long pSeries_lpar_hpte_find(unsigned long vpn)
+static long pSeries_lpar_hpte_find(unsigned long va, int psize)
 {
 	unsigned long hash;
 	unsigned long i, j;
 	long slot;
-	unsigned long hpte_v;
+	unsigned long want_v, hpte_v;
 
-	hash = hpt_hash(vpn, 0);
+	hash = hpt_hash(va, mmu_psize_defs[psize].shift);
+	want_v = hpte_encode_v(va, psize);
 
 	for (j = 0; j < 2; j++) {
 		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 		for (i = 0; i < HPTES_PER_GROUP; i++) {
 			hpte_v = pSeries_lpar_hpte_getword0(slot);
 
-			if ((HPTE_V_AVPN_VAL(hpte_v) == (vpn >> 11))
+			if (HPTE_V_COMPARE(hpte_v, want_v)
 			    && (hpte_v & HPTE_V_VALID)
 			    && (!!(hpte_v & HPTE_V_SECONDARY) == j)) {
 				/* HPTE matches */
@@ -441,17 +473,15 @@ static long pSeries_lpar_hpte_find(unsigned long vpn)
 } 
 
 static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
-					     unsigned long ea)
+					     unsigned long ea,
+					     int psize)
 {
-	unsigned long lpar_rc;
-	unsigned long vsid, va, vpn, flags;
-	long slot;
+	unsigned long lpar_rc, vsid, va, flags;
+	long slot;
 
 	vsid = get_kernel_vsid(ea);
 	va = (vsid << 28) | (ea & 0x0fffffff);
-	vpn = va >> PAGE_SHIFT;
 
-	slot = pSeries_lpar_hpte_find(vpn);
+	slot = pSeries_lpar_hpte_find(va, psize);
 	BUG_ON(slot == -1);
 
 	flags = newpp & 7;
@@ -461,18 +491,18 @@ static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
 }
 
 static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long va,
-					 int large, int local)
+					 int psize, int local)
 {
-	unsigned long avpn = va >> 23;
+	unsigned long want_v;
 	unsigned long lpar_rc;
 	unsigned long dummy1, dummy2;
 
-	if (large)
-		avpn &= ~0x1UL;
-
-	lpar_rc = plpar_pte_remove(H_AVPN, slot, (avpn << 7), &dummy1,
-				   &dummy2);
+	DBG_LOW("    inval : slot=%lx, va=%016lx, psize: %d, local: %d",
+		slot, va, psize, local);
 
+	want_v = hpte_encode_v(va, psize);
+	lpar_rc = plpar_pte_remove(H_AVPN, slot, want_v & HPTE_V_AVPN,
+				   &dummy1, &dummy2);
 	if (lpar_rc == H_Not_Found)
 		return;
 
@@ -494,7 +524,8 @@ void pSeries_lpar_flush_hash_range(unsigned long number, int local)
 		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
 
 	for (i = 0; i < number; i++)
-		flush_hash_page(batch->vaddr[i], batch->pte[i], local);
+		flush_hash_page(batch->vaddr[i], batch->pte[i],
+				batch->psize, local);
 
 	if (lock_tlbie)
 		spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
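
Callers of the insert path above rely on the return convention kept by
this patch: -1 when the group is full, -2 on other hypervisor failures,
otherwise the slot number with bit 3 carrying the secondary-bucket flag.
A hedged sketch of how a caller decodes that value:

#include <stdio.h>

/* model of the hpte_insert() return convention */
static void decode_insert(long ret)
{
	if (ret == -1)
		printf("group full: try the other bucket or evict\n");
	else if (ret == -2)
		printf("hypervisor refused the mapping\n");
	else
		printf("slot %ld in the %s bucket\n", ret & 7,
		       (ret & 8) ? "secondary" : "primary");
}

int main(void)
{
	decode_insert(5);		/* primary bucket, slot 5 */
	decode_insert(8 | 2);		/* secondary bucket, slot 2 */
	decode_insert(-1);
	return 0;
}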
diff --git a/arch/ppc64/Kconfig b/arch/ppc64/Kconfig
index b987164fca4cb714f7a68476e82d9ae27c0d863d..2130cc31595753643d1db643a57e610ea97265cd 100644
--- a/arch/ppc64/Kconfig
+++ b/arch/ppc64/Kconfig
@@ -47,6 +47,10 @@ config ARCH_MAY_HAVE_PC_FDC
 	bool
 	default y
 
+config PPC_STD_MMU
+	bool
+	default y
+
 # We optimistically allocate largepages from the VM, so make the limit
 # large enough (16MB). This badly named config option is actually
 # max order + 1
@@ -294,6 +298,15 @@ config NODES_SPAN_OTHER_NODES
 	def_bool y
 	depends on NEED_MULTIPLE_NODES
 
+config PPC_64K_PAGES
+	bool "64k page size"
+	help
+	  This option changes the kernel logical page size to 64k. On machines
+	  without processor support for 64k pages, the kernel will simulate
+	  them by loading each individual 4k page on demand transparently,
+	  while on hardware with such support, it will be used to map
+	  normal application pages.
+
 config SCHED_SMT
 	bool "SMT (Hyperthreading) scheduler support"
 	depends on SMP
diff --git a/arch/ppc64/kernel/asm-offsets.c b/arch/ppc64/kernel/asm-offsets.c
index 504dee836d292716693950494a3576636ed566b3..bce9065da6cbea1f193477637008290b5953114c 100644
--- a/arch/ppc64/kernel/asm-offsets.c
+++ b/arch/ppc64/kernel/asm-offsets.c
@@ -93,6 +93,9 @@ int main(void)
 	DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache));
 	DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
 	DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
+#ifdef CONFIG_PPC_64K_PAGES
+	DEFINE(PACAPGDIR, offsetof(struct paca_struct, pgdir));
+#endif
 #ifdef CONFIG_HUGETLB_PAGE
 	DEFINE(PACALOWHTLBAREAS, offsetof(struct paca_struct, context.low_htlb_areas));
 	DEFINE(PACAHIGHHTLBAREAS, offsetof(struct paca_struct, context.high_htlb_areas));
diff --git a/arch/ppc64/kernel/head.S b/arch/ppc64/kernel/head.S
index db1cf397be2d6c57a3d7b6f3e4d3776a52833056..9e8050ea1225fa05c002328203d76ca94d82f6dc 100644
--- a/arch/ppc64/kernel/head.S
+++ b/arch/ppc64/kernel/head.S
@@ -195,11 +195,11 @@ exception_marker:
 #define EX_R12		24
 #define EX_R13		32
 #define EX_SRR0		40
-#define EX_R3		40	/* SLB miss saves R3, but not SRR0 */
 #define EX_DAR		48
-#define EX_LR		48	/* SLB miss saves LR, but not DAR */
 #define EX_DSISR	56
 #define EX_CCR		60
+#define EX_R3		64
+#define EX_LR		72
 
 #define EXCEPTION_PROLOG_PSERIES(area, label)				\
 	mfspr	r13,SPRN_SPRG3;		/* get paca address into r13 */	\
@@ -419,17 +419,22 @@ data_access_slb_pSeries:
 	mtspr	SPRN_SPRG1,r13
 	RUNLATCH_ON(r13)
 	mfspr	r13,SPRN_SPRG3		/* get paca address into r13 */
+	std	r3,PACA_EXSLB+EX_R3(r13)
+	mfspr	r3,SPRN_DAR
 	std	r9,PACA_EXSLB+EX_R9(r13)	/* save r9 - r12 */
+	mfcr	r9
+#ifdef __DISABLED__
+	/* Keep that around for when we re-implement dynamic VSIDs */
+	cmpdi	r3,0
+	bge	slb_miss_user_pseries
+#endif /* __DISABLED__ */
 	std	r10,PACA_EXSLB+EX_R10(r13)
 	std	r11,PACA_EXSLB+EX_R11(r13)
 	std	r12,PACA_EXSLB+EX_R12(r13)
-	std	r3,PACA_EXSLB+EX_R3(r13)
-	mfspr	r9,SPRN_SPRG1
-	std	r9,PACA_EXSLB+EX_R13(r13)
-	mfcr	r9
+	mfspr	r10,SPRN_SPRG1
+	std	r10,PACA_EXSLB+EX_R13(r13)
 	mfspr	r12,SPRN_SRR1		/* and SRR1 */
-	mfspr	r3,SPRN_DAR
-	b	.do_slb_miss		/* Rel. branch works in real mode */
+	b	.slb_miss_realmode	/* Rel. branch works in real mode */
 
 	STD_EXCEPTION_PSERIES(0x400, instruction_access)
 
@@ -440,17 +445,22 @@ instruction_access_slb_pSeries:
 	mtspr	SPRN_SPRG1,r13
 	RUNLATCH_ON(r13)
 	mfspr	r13,SPRN_SPRG3		/* get paca address into r13 */
+	std	r3,PACA_EXSLB+EX_R3(r13)
+	mfspr	r3,SPRN_SRR0		/* SRR0 is faulting address */
 	std	r9,PACA_EXSLB+EX_R9(r13)	/* save r9 - r12 */
+	mfcr	r9
+#ifdef __DISABLED__
+	/* Keep that around for when we re-implement dynamic VSIDs */
+	cmpdi	r3,0
+	bge	slb_miss_user_pseries
+#endif /* __DISABLED__ */
 	std	r10,PACA_EXSLB+EX_R10(r13)
 	std	r11,PACA_EXSLB+EX_R11(r13)
 	std	r12,PACA_EXSLB+EX_R12(r13)
-	std	r3,PACA_EXSLB+EX_R3(r13)
-	mfspr	r9,SPRN_SPRG1
-	std	r9,PACA_EXSLB+EX_R13(r13)
-	mfcr	r9
+	mfspr	r10,SPRN_SPRG1
+	std	r10,PACA_EXSLB+EX_R13(r13)
 	mfspr	r12,SPRN_SRR1		/* and SRR1 */
-	mfspr	r3,SPRN_SRR0			/* SRR0 is faulting address */
-	b	.do_slb_miss		/* Rel. branch works in real mode */
+	b	.slb_miss_realmode	/* Rel. branch works in real mode */
 
 	STD_EXCEPTION_PSERIES(0x500, hardware_interrupt)
 	STD_EXCEPTION_PSERIES(0x600, alignment)
@@ -508,6 +518,38 @@ _GLOBAL(do_stab_bolted_pSeries)
 	mfspr	r12,SPRN_SPRG2
 	EXCEPTION_PROLOG_PSERIES(PACA_EXSLB, .do_stab_bolted)
 
+/*
+ * We have some room here; we use it to put
+ * the pSeries SLB miss user trampoline code so it's reasonably far
+ * away from slb_miss_user_common to avoid problems with rfid
+ *
+ * This is used when the SLB miss handler has to go virtual,
+ * which doesn't happen for now but will again once we re-implement
+ * dynamic VSIDs for shared page tables.
+ */
+#ifdef __DISABLED__
+slb_miss_user_pseries:
+	std	r10,PACA_EXGEN+EX_R10(r13)
+	std	r11,PACA_EXGEN+EX_R11(r13)
+	std	r12,PACA_EXGEN+EX_R12(r13)
+	mfspr	r10,SPRN_SPRG1
+	ld	r11,PACA_EXSLB+EX_R9(r13)
+	ld	r12,PACA_EXSLB+EX_R3(r13)
+	std	r10,PACA_EXGEN+EX_R13(r13)
+	std	r11,PACA_EXGEN+EX_R9(r13)
+	std	r12,PACA_EXGEN+EX_R3(r13)
+	clrrdi	r12,r13,32
+	mfmsr	r10
+	mfspr	r11,SPRN_SRR0			/* save SRR0 */
+	ori	r12,r12,slb_miss_user_common@l	/* virt addr of handler */
+	ori	r10,r10,MSR_IR|MSR_DR|MSR_RI
+	mtspr	SPRN_SRR0,r12
+	mfspr	r12,SPRN_SRR1			/* and SRR1 */
+	mtspr	SPRN_SRR1,r10
+	rfid
+	b	.				/* prevent spec. execution */
+#endif /* __DISABLED__ */
+
 /*
  * Vectors for the FWNMI option.  Share common code.
  */
@@ -559,22 +601,59 @@ END_FTR_SECTION_IFCLR(CPU_FTR_SLB)
 	.globl	data_access_slb_iSeries
 data_access_slb_iSeries:
 	mtspr	SPRN_SPRG1,r13		/* save r13 */
-	EXCEPTION_PROLOG_ISERIES_1(PACA_EXSLB)
+	mfspr	r13,SPRN_SPRG3		/* get paca address into r13 */
 	std	r3,PACA_EXSLB+EX_R3(r13)
-	ld	r12,PACALPPACA+LPPACASRR1(r13)
 	mfspr	r3,SPRN_DAR
-	b	.do_slb_miss
+	std	r9,PACA_EXSLB+EX_R9(r13)
+	mfcr	r9
+#ifdef __DISABLED__
+	cmpdi	r3,0
+	bge	slb_miss_user_iseries
+#endif
+	std	r10,PACA_EXSLB+EX_R10(r13)
+	std	r11,PACA_EXSLB+EX_R11(r13)
+	std	r12,PACA_EXSLB+EX_R12(r13)
+	mfspr	r10,SPRN_SPRG1
+	std	r10,PACA_EXSLB+EX_R13(r13)
+	ld	r12,PACALPPACA+LPPACASRR1(r13)
+	b	.slb_miss_realmode
 
 	STD_EXCEPTION_ISERIES(0x400, instruction_access, PACA_EXGEN)
 
 	.globl	instruction_access_slb_iSeries
 instruction_access_slb_iSeries:
 	mtspr	SPRN_SPRG1,r13		/* save r13 */
-	EXCEPTION_PROLOG_ISERIES_1(PACA_EXSLB)
+	mfspr	r13,SPRN_SPRG3		/* get paca address into r13 */
 	std	r3,PACA_EXSLB+EX_R3(r13)
-	ld	r12,PACALPPACA+LPPACASRR1(r13)
-	ld	r3,PACALPPACA+LPPACASRR0(r13)
-	b	.do_slb_miss
+	ld	r3,PACALPPACA+LPPACASRR0(r13)	/* get SRR0 value */
+	std	r9,PACA_EXSLB+EX_R9(r13)
+	mfcr	r9
+#ifdef __DISABLED__
+	cmpdi	r3,0
+	bge	slb_miss_user_iseries
+#endif
+	std	r10,PACA_EXSLB+EX_R10(r13)
+	std	r11,PACA_EXSLB+EX_R11(r13)
+	std	r12,PACA_EXSLB+EX_R12(r13)
+	mfspr	r10,SPRN_SPRG1
+	std	r10,PACA_EXSLB+EX_R13(r13)
+	ld	r12,PACALPPACA+LPPACASRR1(r13)
+	b	.slb_miss_realmode
+
+#ifdef __DISABLED__
+slb_miss_user_iseries:
+	std	r10,PACA_EXGEN+EX_R10(r13)
+	std	r11,PACA_EXGEN+EX_R11(r13)
+	std	r12,PACA_EXGEN+EX_R12(r13)
+	mfspr	r10,SPRN_SPRG1
+	ld	r11,PACA_EXSLB+EX_R9(r13)
+	ld	r12,PACA_EXSLB+EX_R3(r13)
+	std	r10,PACA_EXGEN+EX_R13(r13)
+	std	r11,PACA_EXGEN+EX_R9(r13)
+	std	r12,PACA_EXGEN+EX_R3(r13)
+	EXCEPTION_PROLOG_ISERIES_2
+	b	slb_miss_user_common
+#endif
 
 	MASKABLE_EXCEPTION_ISERIES(0x500, hardware_interrupt)
 	STD_EXCEPTION_ISERIES(0x600, alignment, PACA_EXGEN)
@@ -809,6 +888,126 @@ instruction_access_common:
 	li	r5,0x400
 	b	.do_hash_page		/* Try to handle as hpte fault */
 
+/*
+ * Here is the common SLB miss user handler. It is used when going to
+ * virtual mode for SLB misses; it is currently unused.
+ */
+#ifdef __DISABLED__
+	.align	7
+	.globl	slb_miss_user_common
+slb_miss_user_common:
+	mflr	r10
+	std	r3,PACA_EXGEN+EX_DAR(r13)
+	stw	r9,PACA_EXGEN+EX_CCR(r13)
+	std	r10,PACA_EXGEN+EX_LR(r13)
+	std	r11,PACA_EXGEN+EX_SRR0(r13)
+	bl	.slb_allocate_user
+
+	ld	r10,PACA_EXGEN+EX_LR(r13)
+	ld	r3,PACA_EXGEN+EX_R3(r13)
+	lwz	r9,PACA_EXGEN+EX_CCR(r13)
+	ld	r11,PACA_EXGEN+EX_SRR0(r13)
+	mtlr	r10
+	beq-	slb_miss_fault
+
+	andi.	r10,r12,MSR_RI		/* check for unrecoverable exception */
+	beq-	unrecov_user_slb
+	mfmsr	r10
+
+.machine push
+.machine "power4"
+	mtcrf	0x80,r9
+.machine pop
+
+	clrrdi	r10,r10,2		/* clear RI before setting SRR0/1 */
+	mtmsrd	r10,1
+
+	mtspr	SPRN_SRR0,r11
+	mtspr	SPRN_SRR1,r12
+
+	ld	r9,PACA_EXGEN+EX_R9(r13)
+	ld	r10,PACA_EXGEN+EX_R10(r13)
+	ld	r11,PACA_EXGEN+EX_R11(r13)
+	ld	r12,PACA_EXGEN+EX_R12(r13)
+	ld	r13,PACA_EXGEN+EX_R13(r13)
+	rfid
+	b	.
+
+slb_miss_fault:
+	EXCEPTION_PROLOG_COMMON(0x380, PACA_EXGEN)
+	ld	r4,PACA_EXGEN+EX_DAR(r13)
+	li	r5,0
+	std	r4,_DAR(r1)
+	std	r5,_DSISR(r1)
+	b	.handle_page_fault
+
+unrecov_user_slb:
+	EXCEPTION_PROLOG_COMMON(0x4200, PACA_EXGEN)
+	DISABLE_INTS
+	bl	.save_nvgprs
+1:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.unrecoverable_exception
+	b	1b
+
+#endif /* __DISABLED__ */
+
+
+/*
+ * r13 points to the PACA, r9 contains the saved CR,
+ * r12 contains the saved SRR1, SRR0 is still ready for return
+ * r3 has the faulting address
+ * r9 - r13 are saved in paca->exslb.
+ * r3 is saved in paca->exslb as well (EX_R3)
+ * We assume we aren't going to take any exceptions during this procedure.
+ */
+_GLOBAL(slb_miss_realmode)
+	mflr	r10
+
+	stw	r9,PACA_EXSLB+EX_CCR(r13)	/* save CR in exc. frame */
+	std	r10,PACA_EXSLB+EX_LR(r13)	/* save LR */
+
+	bl	.slb_allocate_realmode
+
+	/* All done -- return from exception. */
+
+	ld	r10,PACA_EXSLB+EX_LR(r13)
+	ld	r3,PACA_EXSLB+EX_R3(r13)
+	lwz	r9,PACA_EXSLB+EX_CCR(r13)	/* get saved CR */
+#ifdef CONFIG_PPC_ISERIES
+	ld	r11,PACALPPACA+LPPACASRR0(r13)	/* get SRR0 value */
+#endif /* CONFIG_PPC_ISERIES */
+
+	mtlr	r10
+
+	andi.	r10,r12,MSR_RI	/* check for unrecoverable exception */
+	beq-	unrecov_slb
+
+.machine	push
+.machine	"power4"
+	mtcrf	0x80,r9
+	mtcrf	0x01,r9		/* slb_allocate uses cr0 and cr7 */
+.machine	pop
+
+#ifdef CONFIG_PPC_ISERIES
+	mtspr	SPRN_SRR0,r11
+	mtspr	SPRN_SRR1,r12
+#endif /* CONFIG_PPC_ISERIES */
+	ld	r9,PACA_EXSLB+EX_R9(r13)
+	ld	r10,PACA_EXSLB+EX_R10(r13)
+	ld	r11,PACA_EXSLB+EX_R11(r13)
+	ld	r12,PACA_EXSLB+EX_R12(r13)
+	ld	r13,PACA_EXSLB+EX_R13(r13)
+	rfid
+	b	.	/* prevent speculative execution */
+
+unrecov_slb:
+	EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB)
+	DISABLE_INTS
+	bl	.save_nvgprs
+1:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.unrecoverable_exception
+	b	1b
+
 	.align	7
 	.globl hardware_interrupt_common
 	.globl hardware_interrupt_entry
@@ -1138,62 +1337,6 @@ _GLOBAL(do_stab_bolted)
 	rfid
 	b	.	/* prevent speculative execution */
 
-/*
- * r13 points to the PACA, r9 contains the saved CR,
- * r11 and r12 contain the saved SRR0 and SRR1.
- * r3 has the faulting address
- * r9 - r13 are saved in paca->exslb.
- * r3 is saved in paca->slb_r3
- * We assume we aren't going to take any exceptions during this procedure.
- */
-_GLOBAL(do_slb_miss)
-	mflr	r10
-
-	stw	r9,PACA_EXSLB+EX_CCR(r13)	/* save CR in exc. frame */
-	std	r10,PACA_EXSLB+EX_LR(r13)	/* save LR */
-
-	bl	.slb_allocate			/* handle it */
-
-	/* All done -- return from exception. */
-
-	ld	r10,PACA_EXSLB+EX_LR(r13)
-	ld	r3,PACA_EXSLB+EX_R3(r13)
-	lwz	r9,PACA_EXSLB+EX_CCR(r13)	/* get saved CR */
-#ifdef CONFIG_PPC_ISERIES
-	ld	r11,PACALPPACA+LPPACASRR0(r13)	/* get SRR0 value */
-#endif /* CONFIG_PPC_ISERIES */
-
-	mtlr	r10
-
-	andi.	r10,r12,MSR_RI	/* check for unrecoverable exception */
-	beq-	unrecov_slb
-
-.machine	push
-.machine	"power4"
-	mtcrf	0x80,r9
-	mtcrf	0x01,r9		/* slb_allocate uses cr0 and cr7 */
-.machine	pop
-
-#ifdef CONFIG_PPC_ISERIES
-	mtspr	SPRN_SRR0,r11
-	mtspr	SPRN_SRR1,r12
-#endif /* CONFIG_PPC_ISERIES */
-	ld	r9,PACA_EXSLB+EX_R9(r13)
-	ld	r10,PACA_EXSLB+EX_R10(r13)
-	ld	r11,PACA_EXSLB+EX_R11(r13)
-	ld	r12,PACA_EXSLB+EX_R12(r13)
-	ld	r13,PACA_EXSLB+EX_R13(r13)
-	rfid
-	b	.	/* prevent speculative execution */
-
-unrecov_slb:
-	EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB)
-	DISABLE_INTS
-	bl	.save_nvgprs
-1:	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	.unrecoverable_exception
-	b	1b
-
 /*
  * Space for CPU0's segment table.
  *
@@ -1569,7 +1712,10 @@ _GLOBAL(__secondary_start)
 #endif
 	/* Initialize the first segment table (or SLB) entry		 */
 	ld	r3,PACASTABVIRT(r13)	/* get addr of segment table	 */
+BEGIN_FTR_SECTION
 	bl	.stab_initialize
+END_FTR_SECTION_IFCLR(CPU_FTR_SLB)
+	bl	.slb_initialize
 
 	/* Initialize the kernel stack.  Just a repeat for iSeries.	 */
 	LOADADDR(r3,current_set)
diff --git a/arch/ppc64/kernel/pacaData.c b/arch/ppc64/kernel/pacaData.c
index 5e27e5a6a35d632b8e946c15f894e5b42a33b870..3133c72b28ecee22ce123fbd47da39ac19a05e6c 100644
--- a/arch/ppc64/kernel/pacaData.c
+++ b/arch/ppc64/kernel/pacaData.c
@@ -23,7 +23,7 @@
 static union {
 	struct systemcfg	data;
 	u8			page[PAGE_SIZE];
-} systemcfg_store __page_aligned;
+} systemcfg_store __attribute__((__section__(".data.page_aligned")));
 struct systemcfg *systemcfg = &systemcfg_store.data;
 EXPORT_SYMBOL(systemcfg);
 
diff --git a/arch/ppc64/kernel/prom.c b/arch/ppc64/kernel/prom.c
index 97bfceb5353b7a842231e9176ccd67c4aedaaa43..dece31e58bc4b69c4cf8aaf4a05be977584d65cc 100644
--- a/arch/ppc64/kernel/prom.c
+++ b/arch/ppc64/kernel/prom.c
@@ -635,10 +635,10 @@ static inline char *find_flat_dt_string(u32 offset)
 * used to extract the memory information at boot before we can
  * unflatten the tree
  */
-static int __init scan_flat_dt(int (*it)(unsigned long node,
-					 const char *uname, int depth,
-					 void *data),
-			       void *data)
+int __init of_scan_flat_dt(int (*it)(unsigned long node,
+				     const char *uname, int depth,
+				     void *data),
+			   void *data)
 {
 	unsigned long p = ((unsigned long)initial_boot_params) +
 		initial_boot_params->off_dt_struct;
@@ -695,8 +695,8 @@ static int __init scan_flat_dt(int (*it)(unsigned long node,
 * This function can be used within an of_scan_flat_dt callback to get
  * access to properties
  */
-static void* __init get_flat_dt_prop(unsigned long node, const char *name,
-				     unsigned long *size)
+void* __init of_get_flat_dt_prop(unsigned long node, const char *name,
+				 unsigned long *size)
 {
 	unsigned long p = node;
 
@@ -996,7 +996,7 @@ void __init unflatten_device_tree(void)
 static int __init early_init_dt_scan_cpus(unsigned long node,
 					  const char *uname, int depth, void *data)
 {
-	char *type = get_flat_dt_prop(node, "device_type", NULL);
+	char *type = of_get_flat_dt_prop(node, "device_type", NULL);
 	u32 *prop;
 	unsigned long size;
 
@@ -1004,17 +1004,6 @@ static int __init early_init_dt_scan_cpus(unsigned long node,
 	if (type == NULL || strcmp(type, "cpu") != 0)
 		return 0;
 
-	/* On LPAR, look for the first ibm,pft-size property for the  hash table size
-	 */
-	if (systemcfg->platform == PLATFORM_PSERIES_LPAR && ppc64_pft_size == 0) {
-		u32 *pft_size;
-		pft_size = (u32 *)get_flat_dt_prop(node, "ibm,pft-size", NULL);
-		if (pft_size != NULL) {
-			/* pft_size[0] is the NUMA CEC cookie */
-			ppc64_pft_size = pft_size[1];
-		}
-	}
-
 	if (initial_boot_params && initial_boot_params->version >= 2) {
 		/* version 2 of the kexec param format adds the phys cpuid
 		 * of booted proc.
@@ -1023,8 +1012,9 @@ static int __init early_init_dt_scan_cpus(unsigned long node,
 		boot_cpuid = 0;
 	} else {
 		/* Check if it's the boot-cpu, set it's hw index in paca now */
-		if (get_flat_dt_prop(node, "linux,boot-cpu", NULL) != NULL) {
-			u32 *prop = get_flat_dt_prop(node, "reg", NULL);
+		if (of_get_flat_dt_prop(node, "linux,boot-cpu", NULL)
+		    != NULL) {
+			u32 *prop = of_get_flat_dt_prop(node, "reg", NULL);
 			set_hard_smp_processor_id(0, prop == NULL ? 0 : *prop);
 			boot_cpuid_phys = get_hard_smp_processor_id(0);
 		}
@@ -1032,14 +1022,14 @@ static int __init early_init_dt_scan_cpus(unsigned long node,
 
 #ifdef CONFIG_ALTIVEC
 	/* Check if we have a VMX and eventually update CPU features */
-	prop = (u32 *)get_flat_dt_prop(node, "ibm,vmx", NULL);
+	prop = (u32 *)of_get_flat_dt_prop(node, "ibm,vmx", NULL);
 	if (prop && (*prop) > 0) {
 		cur_cpu_spec->cpu_features |= CPU_FTR_ALTIVEC;
 		cur_cpu_spec->cpu_user_features |= PPC_FEATURE_HAS_ALTIVEC;
 	}
 
 	/* Same goes for Apple's "altivec" property */
-	prop = (u32 *)get_flat_dt_prop(node, "altivec", NULL);
+	prop = (u32 *)of_get_flat_dt_prop(node, "altivec", NULL);
 	if (prop) {
 		cur_cpu_spec->cpu_features |= CPU_FTR_ALTIVEC;
 		cur_cpu_spec->cpu_user_features |= PPC_FEATURE_HAS_ALTIVEC;
@@ -1051,7 +1041,7 @@ static int __init early_init_dt_scan_cpus(unsigned long node,
 	 * this by looking at the size of the ibm,ppc-interrupt-server#s
 	 * property
 	 */
-	prop = (u32 *)get_flat_dt_prop(node, "ibm,ppc-interrupt-server#s",
+	prop = (u32 *)of_get_flat_dt_prop(node, "ibm,ppc-interrupt-server#s",
 				       &size);
 	cur_cpu_spec->cpu_features &= ~CPU_FTR_SMT;
 	if (prop && ((size / sizeof(u32)) > 1))
@@ -1072,26 +1062,26 @@ static int __init early_init_dt_scan_chosen(unsigned long node,
 		return 0;
 
 	/* get platform type */
-	prop = (u32 *)get_flat_dt_prop(node, "linux,platform", NULL);
+	prop = (u32 *)of_get_flat_dt_prop(node, "linux,platform", NULL);
 	if (prop == NULL)
 		return 0;
 	systemcfg->platform = *prop;
 
 	/* check if iommu is forced on or off */
-	if (get_flat_dt_prop(node, "linux,iommu-off", NULL) != NULL)
+	if (of_get_flat_dt_prop(node, "linux,iommu-off", NULL) != NULL)
 		iommu_is_off = 1;
-	if (get_flat_dt_prop(node, "linux,iommu-force-on", NULL) != NULL)
+	if (of_get_flat_dt_prop(node, "linux,iommu-force-on", NULL) != NULL)
 		iommu_force_on = 1;
 
- 	prop64 = (u64*)get_flat_dt_prop(node, "linux,memory-limit", NULL);
+ 	prop64 = (u64*)of_get_flat_dt_prop(node, "linux,memory-limit", NULL);
  	if (prop64)
  		memory_limit = *prop64;
 
- 	prop64 = (u64*)get_flat_dt_prop(node, "linux,tce-alloc-start", NULL);
+ 	prop64 = (u64*)of_get_flat_dt_prop(node, "linux,tce-alloc-start",NULL);
  	if (prop64)
  		tce_alloc_start = *prop64;
 
- 	prop64 = (u64*)get_flat_dt_prop(node, "linux,tce-alloc-end", NULL);
+ 	prop64 = (u64*)of_get_flat_dt_prop(node, "linux,tce-alloc-end", NULL);
  	if (prop64)
  		tce_alloc_end = *prop64;
 
@@ -1102,9 +1092,12 @@ static int __init early_init_dt_scan_chosen(unsigned long node,
 	{
 		u64 *basep, *entryp;
 
-		basep = (u64*)get_flat_dt_prop(node, "linux,rtas-base", NULL);
-		entryp = (u64*)get_flat_dt_prop(node, "linux,rtas-entry", NULL);
-		prop = (u32*)get_flat_dt_prop(node, "linux,rtas-size", NULL);
+		basep = (u64*)of_get_flat_dt_prop(node,
+						  "linux,rtas-base", NULL);
+		entryp = (u64*)of_get_flat_dt_prop(node,
+						   "linux,rtas-entry", NULL);
+		prop = (u32*)of_get_flat_dt_prop(node,
+						 "linux,rtas-size", NULL);
 		if (basep && entryp && prop) {
 			rtas.base = *basep;
 			rtas.entry = *entryp;
@@ -1125,11 +1118,11 @@ static int __init early_init_dt_scan_root(unsigned long node,
 	if (depth != 0)
 		return 0;
 
-	prop = (u32 *)get_flat_dt_prop(node, "#size-cells", NULL);
+	prop = (u32 *)of_get_flat_dt_prop(node, "#size-cells", NULL);
 	dt_root_size_cells = (prop == NULL) ? 1 : *prop;
 	DBG("dt_root_size_cells = %x\n", dt_root_size_cells);
 
-	prop = (u32 *)get_flat_dt_prop(node, "#address-cells", NULL);
+	prop = (u32 *)of_get_flat_dt_prop(node, "#address-cells", NULL);
 	dt_root_addr_cells = (prop == NULL) ? 2 : *prop;
 	DBG("dt_root_addr_cells = %x\n", dt_root_addr_cells);
 	
@@ -1161,7 +1154,7 @@ static unsigned long __init dt_mem_next_cell(int s, cell_t **cellp)
 static int __init early_init_dt_scan_memory(unsigned long node,
 					    const char *uname, int depth, void *data)
 {
-	char *type = get_flat_dt_prop(node, "device_type", NULL);
+	char *type = of_get_flat_dt_prop(node, "device_type", NULL);
 	cell_t *reg, *endp;
 	unsigned long l;
 
@@ -1169,7 +1162,7 @@ static int __init early_init_dt_scan_memory(unsigned long node,
 	if (type == NULL || strcmp(type, "memory") != 0)
 		return 0;
 
-	reg = (cell_t *)get_flat_dt_prop(node, "reg", &l);
+	reg = (cell_t *)of_get_flat_dt_prop(node, "reg", &l);
 	if (reg == NULL)
 		return 0;
 
@@ -1225,19 +1218,16 @@ void __init early_init_devtree(void *params)
 	/* Setup flat device-tree pointer */
 	initial_boot_params = params;
 
-	/* By default, hash size is not set */
-	ppc64_pft_size = 0;
-
 	/* Retrieve various information from the /chosen node of the
 	 * device-tree, including the platform type, initrd location and
 	 * size, TCE reserve, and more ...
 	 */
-	scan_flat_dt(early_init_dt_scan_chosen, NULL);
+	of_scan_flat_dt(early_init_dt_scan_chosen, NULL);
 
 	/* Scan memory nodes and rebuild LMBs */
 	lmb_init();
-	scan_flat_dt(early_init_dt_scan_root, NULL);
-	scan_flat_dt(early_init_dt_scan_memory, NULL);
+	of_scan_flat_dt(early_init_dt_scan_root, NULL);
+	of_scan_flat_dt(early_init_dt_scan_memory, NULL);
 	lmb_enforce_memory_limit(memory_limit);
 	lmb_analyze();
 	systemcfg->physicalMemorySize = lmb_phys_mem_size();
@@ -1253,26 +1243,8 @@ void __init early_init_devtree(void *params)
 	/* Retrieve hash table size from flattened tree plus other
 	 * CPU-related information (altivec support, boot CPU ID, ...)
 	 */
-	scan_flat_dt(early_init_dt_scan_cpus, NULL);
-
-	/* If hash size wasn't obtained above, we calculate it now based on
-	 * the total RAM size
-	 */
-	if (ppc64_pft_size == 0) {
-		unsigned long rnd_mem_size, pteg_count;
-
-		/* round mem_size up to next power of 2 */
-		rnd_mem_size = 1UL << __ilog2(systemcfg->physicalMemorySize);
-		if (rnd_mem_size < systemcfg->physicalMemorySize)
-			rnd_mem_size <<= 1;
-
-		/* # pages / 2 */
-		pteg_count = max(rnd_mem_size >> (12 + 1), 1UL << 11);
-
-		ppc64_pft_size = __ilog2(pteg_count << 7);
-	}
+	of_scan_flat_dt(early_init_dt_scan_cpus, NULL);
 
-	DBG("Hash pftSize: %x\n", (int)ppc64_pft_size);
 	DBG(" <- early_init_devtree()\n");
 }
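
The fallback removed above sized the hash table from the amount of RAM;
it presumably moves along with the rest of the hash-MMU setup rather
than disappearing. For reference, a standalone rendering of that
arithmetic: round memory up to a power of two, allow one PTEG per two
4k pages with a floor of 2^11 PTEGs, then take log2 of the table size
(each PTEG is 128 bytes):

#include <stdio.h>

static unsigned long ilog2_ul(unsigned long x)
{
	unsigned long r = 0;

	while (x >>= 1)
		r++;
	return r;
}

int main(void)
{
	unsigned long mem = 3ul << 30;		/* say, 3GB of RAM */
	unsigned long rnd = 1ul << ilog2_ul(mem);
	unsigned long pteg_count;

	if (rnd < mem)				/* round up to a power of 2 */
		rnd <<= 1;
	pteg_count = rnd >> (12 + 1);		/* # of 4k pages / 2 */
	if (pteg_count < (1ul << 11))
		pteg_count = 1ul << 11;
	printf("ppc64_pft_size = %lu\n", ilog2_ul(pteg_count << 7));
	return 0;
}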
 
diff --git a/include/asm-powerpc/cputable.h b/include/asm-powerpc/cputable.h
index c019501dacebad84b5e54b87f55b3dbf4c9ad1a1..79a0556a0ab8ef09f2009c0e7de0789c9e09f0e8 100644
--- a/include/asm-powerpc/cputable.h
+++ b/include/asm-powerpc/cputable.h
@@ -101,6 +101,7 @@ extern void do_cpu_ftr_fixups(unsigned long offset);
 #define CPU_FTR_COHERENT_ICACHE  	ASM_CONST(0x0000020000000000)
 #define CPU_FTR_LOCKLESS_TLBIE		ASM_CONST(0x0000040000000000)
 #define CPU_FTR_MMCRA_SIHV		ASM_CONST(0x0000080000000000)
+#define CPU_FTR_CI_LARGE_PAGE		ASM_CONST(0x0000100000000000)
 #else
 /* ensure on 32b processors the flags are available for compiling but
  * don't do anything */
@@ -116,6 +117,7 @@ extern void do_cpu_ftr_fixups(unsigned long offset);
 #define CPU_FTR_COHERENT_ICACHE  	ASM_CONST(0x0)
 #define CPU_FTR_LOCKLESS_TLBIE		ASM_CONST(0x0)
 #define CPU_FTR_MMCRA_SIHV		ASM_CONST(0x0)
+#define CPU_FTR_CI_LARGE_PAGE		ASM_CONST(0x0)
 #endif
 
 #ifndef __ASSEMBLY__
@@ -339,6 +341,7 @@ enum {
 #ifdef __powerpc64__
 	    CPU_FTRS_POWER3 | CPU_FTRS_RS64 | CPU_FTRS_POWER4 |
 	    CPU_FTRS_PPC970 | CPU_FTRS_POWER5 | CPU_FTRS_CELL |
+	    CPU_FTR_CI_LARGE_PAGE |
 #endif
 	    0,
 
diff --git a/include/asm-powerpc/iommu.h b/include/asm-powerpc/iommu.h
index 9d91bdd667ae6b4b3bf7709ee37bc3dd6b90999a..6a35e6570ccd4ad9dd3691078609a1a1daadc585 100644
--- a/include/asm-powerpc/iommu.h
+++ b/include/asm-powerpc/iommu.h
@@ -74,6 +74,11 @@ extern void iommu_devnode_init_pSeries(struct device_node *dn);
 
 /* Creates table for an individual device node */
 extern void iommu_devnode_init_iSeries(struct device_node *dn);
+/* Get table parameters from HV */
+extern void iommu_table_getparms_iSeries(unsigned long busno,
+					 unsigned char slotno,
+					 unsigned char virtbus,
+					 struct iommu_table* tbl);
 
 #endif /* CONFIG_PPC_ISERIES */
 
diff --git a/include/asm-powerpc/machdep.h b/include/asm-powerpc/machdep.h
index 629ca964b974740d8aac09eafd1f03efbf148b4a..fa03864d06eb0d69c528ffa4f25e110d102ca44c 100644
--- a/include/asm-powerpc/machdep.h
+++ b/include/asm-powerpc/machdep.h
@@ -47,20 +47,22 @@ struct machdep_calls {
 #ifdef CONFIG_PPC64
 	void            (*hpte_invalidate)(unsigned long slot,
 					   unsigned long va,
-					   int large,
+					   int psize,
 					   int local);
 	long		(*hpte_updatepp)(unsigned long slot, 
 					 unsigned long newpp, 
 					 unsigned long va,
-					 int large,
+					 int psize,
 					 int local);
 	void            (*hpte_updateboltedpp)(unsigned long newpp, 
-					       unsigned long ea);
+					       unsigned long ea,
+					       int psize);
 	long		(*hpte_insert)(unsigned long hpte_group,
 				       unsigned long va,
 				       unsigned long prpn,
+				       unsigned long rflags,
 				       unsigned long vflags,
-				       unsigned long rflags);
+				       int psize);
 	long		(*hpte_remove)(unsigned long hpte_group);
 	void		(*flush_hash_range)(unsigned long number, int local);
 
diff --git a/include/asm-powerpc/prom.h b/include/asm-powerpc/prom.h
index 3a0104fa0462a24802e85f8895cd4ed3994d6386..7587bf5f38c6eb71aeef5ab518c1563241d622f8 100644
--- a/include/asm-powerpc/prom.h
+++ b/include/asm-powerpc/prom.h
@@ -178,6 +178,14 @@ extern struct device_node *of_get_next_child(const struct device_node *node,
 extern struct device_node *of_node_get(struct device_node *node);
 extern void of_node_put(struct device_node *node);
 
+/* For scanning the flat device-tree at boot time */
+int __init of_scan_flat_dt(int (*it)(unsigned long node,
+				     const char *uname, int depth,
+				     void *data),
+			   void *data);
+void* __init of_get_flat_dt_prop(unsigned long node, const char *name,
+				 unsigned long *size);
+
 /* For updating the device tree at runtime */
 extern void of_attach_node(struct device_node *);
 extern void of_detach_node(const struct device_node *);
diff --git a/include/asm-powerpc/system.h b/include/asm-powerpc/system.h
index b5da0b851e02ee6c9e32f520389d90260b5c73c9..3536a5cd7a2d62fdcd3df5a58be8f4cafeb87714 100644
--- a/include/asm-powerpc/system.h
+++ b/include/asm-powerpc/system.h
@@ -289,7 +289,7 @@ __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new)
 
 #ifdef CONFIG_PPC64
 static __inline__ unsigned long
-__cmpxchg_u64(volatile long *p, unsigned long old, unsigned long new)
+__cmpxchg_u64(volatile unsigned long *p, unsigned long old, unsigned long new)
 {
 	unsigned long prev;
 
diff --git a/include/asm-powerpc/thread_info.h b/include/asm-powerpc/thread_info.h
index ab17db79f69d8474cb7b8915c974dbf71b60594e..e525f49bd1790609e291dfb2d80fd91de66ed61d 100644
--- a/include/asm-powerpc/thread_info.h
+++ b/include/asm-powerpc/thread_info.h
@@ -65,23 +65,27 @@ struct thread_info {
 
 /* thread information allocation */
 
-#ifdef CONFIG_DEBUG_STACK_USAGE
-#define THREAD_INFO_GFP		GFP_KERNEL | __GFP_ZERO
-#else
-#define THREAD_INFO_GFP		GFP_KERNEL
-#endif
-
 #if THREAD_SHIFT >= PAGE_SHIFT
 
 #define THREAD_ORDER	(THREAD_SHIFT - PAGE_SHIFT)
 
+#ifdef CONFIG_DEBUG_STACK_USAGE
 #define alloc_thread_info(tsk)	\
-	((struct thread_info *)__get_free_pages(THREAD_INFO_GFP, THREAD_ORDER))
+	((struct thread_info *)__get_free_pages(GFP_KERNEL | \
+		__GFP_ZERO, THREAD_ORDER))
+#else
+#define alloc_thread_info(tsk)	\
+	((struct thread_info *)__get_free_pages(GFP_KERNEL, THREAD_ORDER))
+#endif
 #define free_thread_info(ti)	free_pages((unsigned long)ti, THREAD_ORDER)
 
 #else /* THREAD_SHIFT < PAGE_SHIFT */
 
-#define alloc_thread_info(tsk)	kmalloc(THREAD_SIZE, THREAD_INFO_GFP)
+#ifdef CONFIG_DEBUG_STACK_USAGE
+#define alloc_thread_info(tsk)	kzalloc(THREAD_SIZE, GFP_KERNEL)
+#else
+#define alloc_thread_info(tsk)	kmalloc(THREAD_SIZE, GFP_KERNEL)
+#endif
 #define free_thread_info(ti)	kfree(ti)
 
 #endif /* THREAD_SHIFT < PAGE_SHIFT */
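
The reshuffled macros matter on 64k-page kernels, where THREAD_SHIFT can
drop below PAGE_SHIFT and a stack is better carved out of the slab than
rounded up to a whole page. A small sketch of which path gets taken (the
shift values are only examples):

#include <stdio.h>

int main(void)
{
	int thread_shift = 14;			/* e.g. 16k stacks */
	int page_shifts[] = { 12, 16 };		/* 4k and 64k kernels */
	int i;

	for (i = 0; i < 2; i++) {
		int ps = page_shifts[i];

		if (thread_shift >= ps)
			printf("PAGE_SHIFT %d: __get_free_pages order %d\n",
			       ps, thread_shift - ps);
		else
			printf("PAGE_SHIFT %d: kmalloc(%d bytes)\n",
			       ps, 1 << thread_shift);
	}
	return 0;
}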
diff --git a/include/asm-powerpc/tlbflush.h b/include/asm-powerpc/tlbflush.h
index ca3655672bbc890e9d268fa9418d0b9c880e23a8..a2998eee37bb4e9363e662cad602e600a0e5f38e 100644
--- a/include/asm-powerpc/tlbflush.h
+++ b/include/asm-powerpc/tlbflush.h
@@ -31,9 +31,9 @@ struct mm_struct;
 struct ppc64_tlb_batch {
 	unsigned long index;
 	struct mm_struct *mm;
-	pte_t pte[PPC64_TLB_BATCH_NR];
+	real_pte_t pte[PPC64_TLB_BATCH_NR];
 	unsigned long vaddr[PPC64_TLB_BATCH_NR];
-	unsigned int large;
+	unsigned int psize;
 };
 DECLARE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
 
@@ -48,8 +48,9 @@ static inline void flush_tlb_pending(void)
 	put_cpu_var(ppc64_tlb_batch);
 }
 
-extern void flush_hash_page(unsigned long va, pte_t pte, int local);
-void flush_hash_range(unsigned long number, int local);
+extern void flush_hash_page(unsigned long va, real_pte_t pte, int psize,
+			    int local);
+extern void flush_hash_range(unsigned long number, int local);
 
 #else /* CONFIG_PPC64 */
 
diff --git a/include/asm-ppc64/mmu.h b/include/asm-ppc64/mmu.h
index e0505acb77d9971dff1dc19ee3f364d09f193ec2..4c18a5cb69f59f493bdc091e1df6ec2641974f5c 100644
--- a/include/asm-ppc64/mmu.h
+++ b/include/asm-ppc64/mmu.h
@@ -48,13 +48,21 @@ extern char initial_stab[];
 
 /* Bits in the SLB VSID word */
 #define SLB_VSID_SHIFT		12
+#define SLB_VSID_B		ASM_CONST(0xc000000000000000)
+#define SLB_VSID_B_256M		ASM_CONST(0x0000000000000000)
+#define SLB_VSID_B_1T		ASM_CONST(0x4000000000000000)
 #define SLB_VSID_KS		ASM_CONST(0x0000000000000800)
 #define SLB_VSID_KP		ASM_CONST(0x0000000000000400)
 #define SLB_VSID_N		ASM_CONST(0x0000000000000200) /* no-execute */
-#define SLB_VSID_L		ASM_CONST(0x0000000000000100) /* largepage */
+#define SLB_VSID_L		ASM_CONST(0x0000000000000100)
 #define SLB_VSID_C		ASM_CONST(0x0000000000000080) /* class */
-#define SLB_VSID_LS		ASM_CONST(0x0000000000000070) /* size of largepage */
- 
+#define SLB_VSID_LP		ASM_CONST(0x0000000000000030)
+#define SLB_VSID_LP_00		ASM_CONST(0x0000000000000000)
+#define SLB_VSID_LP_01		ASM_CONST(0x0000000000000010)
+#define SLB_VSID_LP_10		ASM_CONST(0x0000000000000020)
+#define SLB_VSID_LP_11		ASM_CONST(0x0000000000000030)
+#define SLB_VSID_LLP		(SLB_VSID_L|SLB_VSID_LP)
+
 #define SLB_VSID_KERNEL		(SLB_VSID_KP)
 #define SLB_VSID_USER		(SLB_VSID_KP|SLB_VSID_KS|SLB_VSID_C)
 
@@ -69,6 +77,7 @@ extern char initial_stab[];
 #define HPTE_V_AVPN_SHIFT	7
 #define HPTE_V_AVPN		ASM_CONST(0xffffffffffffff80)
 #define HPTE_V_AVPN_VAL(x)	(((x) & HPTE_V_AVPN) >> HPTE_V_AVPN_SHIFT)
+#define HPTE_V_COMPARE(x,y)	(!(((x) ^ (y)) & HPTE_V_AVPN))
 #define HPTE_V_BOLTED		ASM_CONST(0x0000000000000010)
 #define HPTE_V_LOCK		ASM_CONST(0x0000000000000008)
 #define HPTE_V_LARGE		ASM_CONST(0x0000000000000004)
@@ -81,6 +90,7 @@ extern char initial_stab[];
 #define HPTE_R_RPN		ASM_CONST(0x3ffffffffffff000)
 #define HPTE_R_FLAGS		ASM_CONST(0x00000000000003ff)
 #define HPTE_R_PP		ASM_CONST(0x0000000000000003)
+#define HPTE_R_N		ASM_CONST(0x0000000000000004)
 
 /* Values for PP (assumes Ks=0, Kp=1) */
 /* pp0 will always be 0 for linux     */
@@ -99,100 +109,120 @@ typedef struct {
 extern hpte_t *htab_address;
 extern unsigned long htab_hash_mask;
 
-static inline unsigned long hpt_hash(unsigned long vpn, int large)
+/*
+ * Page size definition
+ *
+ *    shift : the "PAGE_SHIFT" value for that page size
+ *    sllp  : a bit mask with the value of SLB L || LP to be or'ed
+ *            directly into a slbmte "vsid" value
+ *    penc  : the HPTE encoding mask for the "LP" field
+ *
+ */
+struct mmu_psize_def
 {
-	unsigned long vsid;
-	unsigned long page;
-
-	if (large) {
-		vsid = vpn >> 4;
-		page = vpn & 0xf;
-	} else {
-		vsid = vpn >> 16;
-		page = vpn & 0xffff;
-	}
+	unsigned int	shift;	/* number of bits */
+	unsigned int	penc;	/* HPTE encoding */
+	unsigned int	tlbiel;	/* tlbiel supported for that page size */
+	unsigned long	avpnm;	/* bits to mask out in AVPN in the HPTE */
+	unsigned long	sllp;	/* SLB L||LP (exact mask to use in slbmte) */
+};
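
By way of illustration only, entries in this array could look like the
snippet below. The real table is filled in by the hash-MMU setup code
from firmware information; the 4K and 16M values here are merely the
classic POWER4-style encodings, not authoritative:

/* Illustrative only: plausible 4K and 16M entries */
static struct mmu_psize_def example_psize_defs[] = {
	[MMU_PAGE_4K] = {
		.shift	= 12,
		.sllp	= 0,		/* base page: L=0, LP=00 */
		.penc	= 0,
		.avpnm	= 0,
		.tlbiel	= 1,
	},
	[MMU_PAGE_16M] = {
		.shift	= 24,
		.sllp	= SLB_VSID_L,	/* L=1, LP=00 */
		.penc	= 0,
		.avpnm	= 0x1UL,
		.tlbiel	= 0,
	},
};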
 
-	return (vsid & 0x7fffffffffUL) ^ page;
-}
-
-static inline void __tlbie(unsigned long va, int large)
-{
-	/* clear top 16 bits, non SLS segment */
-	va &= ~(0xffffULL << 48);
-
-	if (large) {
-		va &= HPAGE_MASK;
-		asm volatile("tlbie %0,1" : : "r"(va) : "memory");
-	} else {
-		va &= PAGE_MASK;
-		asm volatile("tlbie %0,0" : : "r"(va) : "memory");
-	}
-}
+#endif /* __ASSEMBLY__ */
 
-static inline void tlbie(unsigned long va, int large)
-{
-	asm volatile("ptesync": : :"memory");
-	__tlbie(va, large);
-	asm volatile("eieio; tlbsync; ptesync": : :"memory");
-}
+/*
+ * The kernel uses the constants below to index into the page sizes array.
+ * Fixed constants are used for this purpose because they perform better
+ * in the low-level hash refill handlers.
+ *
+ * An unsupported page size has its "shift" field set to 0.
+ *
+ * Any new page size being implemented can get a new entry in here. Whether
+ * the kernel will use it or not is a different matter though. The actual
+ * page size used by hugetlbfs is not defined here and may be made variable.
+ */
 
-static inline void __tlbiel(unsigned long va)
-{
-	/* clear top 16 bits, non SLS segment */
-	va &= ~(0xffffULL << 48);
-	va &= PAGE_MASK;
-
-	/* 
-	 * Thanks to Alan Modra we are now able to use machine specific 
-	 * assembly instructions (like tlbiel) by using the gas -many flag.
-	 * However we have to support older toolchains so for the moment 
-	 * we hardwire it.
-	 */
-#if 0
-	asm volatile("tlbiel %0" : : "r"(va) : "memory");
-#else
-	asm volatile(".long 0x7c000224 | (%0 << 11)" : : "r"(va) : "memory");
-#endif
-}
+#define MMU_PAGE_4K		0	/* 4K */
+#define MMU_PAGE_64K		1	/* 64K */
+#define MMU_PAGE_64K_AP		2	/* 64K Admixed (in a 4K segment) */
+#define MMU_PAGE_1M		3	/* 1M */
+#define MMU_PAGE_16M		4	/* 16M */
+#define MMU_PAGE_16G		5	/* 16G */
+#define MMU_PAGE_COUNT		6
 
-static inline void tlbiel(unsigned long va)
-{
-	asm volatile("ptesync": : :"memory");
-	__tlbiel(va);
-	asm volatile("ptesync": : :"memory");
-}
+#ifndef __ASSEMBLY__
 
-static inline unsigned long slot2va(unsigned long hpte_v, unsigned long slot)
-{
-	unsigned long avpn = HPTE_V_AVPN_VAL(hpte_v);
-	unsigned long va;
+/*
+ * The current system page sizes
+ */
+extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
+extern int mmu_linear_psize;
+extern int mmu_virtual_psize;
 
-	va = avpn << 23;
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * The page size index of the huge pages for use by hugetlbfs
+ */
+extern int mmu_huge_psize;
 
-	if (! (hpte_v & HPTE_V_LARGE)) {
-		unsigned long vpi, pteg;
+#endif /* CONFIG_HUGETLB_PAGE */
 
-		pteg = slot / HPTES_PER_GROUP;
-		if (hpte_v & HPTE_V_SECONDARY)
-			pteg = ~pteg;
+/*
+ * This function sets the AVPN and L fields of the HPTE appropriately
+ * for the page size
+ */
+static inline unsigned long hpte_encode_v(unsigned long va, int psize)
+{
+	unsigned long v =
+	v = (va >> 23) & ~(mmu_psize_defs[psize].avpnm);
+	v <<= HPTE_V_AVPN_SHIFT;
+	if (psize != MMU_PAGE_4K)
+		v |= HPTE_V_LARGE;
+	return v;
+}
 
-		vpi = ((va >> 28) ^ pteg) & htab_hash_mask;
+/*
+ * This function sets the ARPN and LP fields of the HPTE appropriately
+ * for the page size. We assume the pa is already "clean", that is
+ * properly aligned for the requested page size
+ */
+static inline unsigned long hpte_encode_r(unsigned long pa, int psize)
+{
 
-		va |= vpi << PAGE_SHIFT;
+	/* A 4K page needs no special encoding */
+	if (psize == MMU_PAGE_4K)
+		return pa & HPTE_R_RPN;
+	else {
+		unsigned int penc = mmu_psize_defs[psize].penc;
+		unsigned int shift = mmu_psize_defs[psize].shift;
+		return (pa & ~((1ul << shift) - 1)) | (penc << 12);
 	}
-
-	return va;
 }
 
 /*
- * Handle a fault by adding an HPTE. If the address can't be determined
- * to be valid via Linux page tables, return 1. If handled return 0
+ * This hashes a virtual address for a 256MB segment only, for now
  */
-extern int __hash_page(unsigned long ea, unsigned long access,
-		       unsigned long vsid, pte_t *ptep, unsigned long trap,
-		       int local);
+
+static inline unsigned long hpt_hash(unsigned long va, unsigned int shift)
+{
+	return ((va >> 28) & 0x7fffffffffUL) ^ ((va & 0x0fffffffUL) >> shift);
+}
+
+extern int __hash_page_4K(unsigned long ea, unsigned long access,
+			  unsigned long vsid, pte_t *ptep, unsigned long trap,
+			  unsigned int local);
+extern int __hash_page_64K(unsigned long ea, unsigned long access,
+			   unsigned long vsid, pte_t *ptep, unsigned long trap,
+			   unsigned int local);
+struct mm_struct;
+extern int hash_huge_page(struct mm_struct *mm, unsigned long access,
+			  unsigned long ea, unsigned long vsid, int local);
 
 extern void htab_finish_init(void);
+extern int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
+			     unsigned long pstart, unsigned long mode,
+			     int psize);
 
 extern void hpte_init_native(void);
 extern void hpte_init_lpar(void);
@@ -200,17 +230,21 @@ extern void hpte_init_iSeries(void);
 
 extern long pSeries_lpar_hpte_insert(unsigned long hpte_group,
 				     unsigned long va, unsigned long prpn,
-				     unsigned long vflags,
-				     unsigned long rflags);
-extern long native_hpte_insert(unsigned long hpte_group, unsigned long va,
-			       unsigned long prpn,
-			       unsigned long vflags, unsigned long rflags);
+				     unsigned long rflags,
+				     unsigned long vflags, int psize);
+
+extern long native_hpte_insert(unsigned long hpte_group,
+			       unsigned long va, unsigned long prpn,
+			       unsigned long rflags,
+			       unsigned long vflags, int psize);
 
-extern long iSeries_hpte_bolt_or_insert(unsigned long hpte_group,
-		unsigned long va, unsigned long prpn,
-		unsigned long vflags, unsigned long rflags);
+extern long iSeries_hpte_insert(unsigned long hpte_group,
+				unsigned long va, unsigned long prpn,
+				unsigned long rflags,
+				unsigned long vflags, int psize);
 
 extern void stabs_alloc(void);
+extern void slb_initialize(void);
 
 #endif /* __ASSEMBLY__ */
 
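To make the new page-size-indexed hashing concrete, here is a small
standalone model of hpt_hash() (plain C, buildable anywhere; the 4K and
16M "shift" values below are illustrative assumptions, since the actual
mmu_psize_defs initialisation lives outside this excerpt):

    #include <stdio.h>

    /* Illustrative page size table: only "shift" matters for the hash */
    static const unsigned int psize_shift[] = {
            12,     /* assumed MMU_PAGE_4K  */
            24,     /* assumed MMU_PAGE_16M */
    };

    /* Same formula as the new hpt_hash(): the VSID part of the VA XORed
     * with the page index within the 256MB segment */
    static unsigned long hpt_hash(unsigned long va, unsigned int shift)
    {
            return ((va >> 28) & 0x7fffffffffUL) ^
                   ((va & 0x0fffffffUL) >> shift);
    }

    int main(void)
    {
            unsigned long va = 0x123456789000UL;

            printf("4K hash:  %#lx\n", hpt_hash(va, psize_shift[0]));
            printf("16M hash: %#lx\n", hpt_hash(va, psize_shift[1]));
            return 0;
    }
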
diff --git a/include/asm-ppc64/mmu_context.h b/include/asm-ppc64/mmu_context.h
index 820dd729b895aeecdd257b09888be3e170115c6e..4f512e9fa6b8fbb484fabc5b9e2d8c027d08e970 100644
--- a/include/asm-ppc64/mmu_context.h
+++ b/include/asm-ppc64/mmu_context.h
@@ -16,8 +16,16 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+/*
+ * When entering a kernel thread there is no valid user segment; mark
+ * paca->pgdir NULL so that SLB misses on user addresses will fault
+ */
+static inline void enter_lazy_tlb(struct mm_struct *mm,
+				  struct task_struct *tsk)
 {
+#ifdef CONFIG_PPC_64K_PAGES
+	get_paca()->pgdir = NULL;
+#endif /* CONFIG_PPC_64K_PAGES */
 }
 
 #define NO_CONTEXT	0
@@ -40,8 +48,13 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 		cpu_set(smp_processor_id(), next->cpu_vm_mask);
 
 	/* No need to flush userspace segments if the mm doesnt change */
+#ifdef CONFIG_PPC_64K_PAGES
+	if (prev == next && get_paca()->pgdir == next->pgd)
+		return;
+#else
 	if (prev == next)
 		return;
+#endif /* CONFIG_PPC_64K_PAGES */
 
 #ifdef CONFIG_ALTIVEC
 	if (cpu_has_feature(CPU_FTR_ALTIVEC))
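The effect of enter_lazy_tlb() NULLing paca->pgdir combines with the new
test above; a toy model of the invariant (simplified types, not kernel
code) may help:

    #include <stdbool.h>
    #include <stdio.h>

    struct mm_model   { void *pgd; };
    struct paca_model { void *pgdir; };

    /* Mirrors the CONFIG_PPC_64K_PAGES check in switch_mm(): skipping
     * the segment switch is only safe when the mm is unchanged AND the
     * paca still caches its pgdir (a lazy-tlb kernel thread has NULL) */
    static bool can_skip_switch(const struct paca_model *paca,
                                const struct mm_model *prev,
                                const struct mm_model *next)
    {
            return prev == next && paca->pgdir == next->pgd;
    }

    int main(void)
    {
            struct mm_model mm = { .pgd = &mm };
            struct paca_model paca = { .pgdir = mm.pgd };

            printf("same mm, cached pgdir -> skip=%d\n",
                   can_skip_switch(&paca, &mm, &mm));
            paca.pgdir = NULL;      /* as after enter_lazy_tlb() */
            printf("same mm, after lazy tlb -> skip=%d\n",
                   can_skip_switch(&paca, &mm, &mm));
            return 0;
    }
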
diff --git a/include/asm-ppc64/paca.h b/include/asm-ppc64/paca.h
index f68fe91debafe2cdc93979d52b5c0a6030b95dae..bccacd6aa93a4fa2448f7056ed04ee0ab861b241 100644
--- a/include/asm-ppc64/paca.h
+++ b/include/asm-ppc64/paca.h
@@ -72,10 +72,15 @@ struct paca_struct {
 	/*
 	 * Now, starting in cacheline 2, the exception save areas
 	 */
-	u64 exgen[8] __attribute__((aligned(0x80))); /* used for most interrupts/exceptions */
-	u64 exmc[8];		/* used for machine checks */
-	u64 exslb[8];		/* used for SLB/segment table misses
-				 * on the linear mapping */
+	/* used for most interrupts/exceptions */
+	u64 exgen[10] __attribute__((aligned(0x80)));
+	u64 exmc[10];		/* used for machine checks */
+	u64 exslb[10];		/* used for SLB/segment table misses
+				 * on the linear mapping */
+#ifdef CONFIG_PPC_64K_PAGES
+	pgd_t *pgdir;
+#endif /* CONFIG_PPC_64K_PAGES */
+
 	mm_context_t context;
 	u16 slb_cache[SLB_CACHE_ENTRIES];
 	u16 slb_cache_ptr;
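The save areas grow from 8 to 10 u64 slots each (presumably to make room
for additional saved registers in the reworked miss paths). A cut-down
mock, not the real struct, checks the resulting layout with offsetof():

    #include <stdio.h>
    #include <stddef.h>

    struct paca_mock {
            unsigned long exgen[10] __attribute__((aligned(0x80)));
            unsigned long exmc[10];
            unsigned long exslb[10];
            void *pgdir;            /* CONFIG_PPC_64K_PAGES only */
    };

    int main(void)
    {
            printf("exgen %zu, exmc %zu, exslb %zu, pgdir %zu\n",
                   offsetof(struct paca_mock, exgen),
                   offsetof(struct paca_mock, exmc),
                   offsetof(struct paca_mock, exslb),
                   offsetof(struct paca_mock, pgdir));
            return 0;
    }
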
diff --git a/include/asm-ppc64/page.h b/include/asm-ppc64/page.h
index d404431f0a9a6c88829fe10fe2a749048c1123a7..82ce187e5be83271bd0778e756af9caceb550200 100644
--- a/include/asm-ppc64/page.h
+++ b/include/asm-ppc64/page.h
@@ -13,32 +13,59 @@
 #include <linux/config.h>
 #include <asm/ppc_asm.h> /* for ASM_CONST */
 
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT	12
-#define PAGE_SIZE	(ASM_CONST(1) << PAGE_SHIFT)
-#define PAGE_MASK	(~(PAGE_SIZE-1))
+/*
+ * We support either 4k or 64k software page size. When using 64k pages
+ * however, whether we are really supporting 64k pages in HW or not is
+ * irrelevant to those definitions. We always define HW_PAGE_SHIFT to 12
+ * as the use of 64k pages remains Linux kernel specific: every notion of
+ * page number shared with the firmware, TCEs, iommu, etc... still assumes
+ * a page size of 4096.
+ */
+#ifdef CONFIG_PPC_64K_PAGES
+#define PAGE_SHIFT		16
+#else
+#define PAGE_SHIFT		12
+#endif
 
-#define SID_SHIFT       28
-#define SID_MASK        0xfffffffffUL
-#define ESID_MASK	0xfffffffff0000000UL
-#define GET_ESID(x)     (((x) >> SID_SHIFT) & SID_MASK)
+#define PAGE_SIZE		(ASM_CONST(1) << PAGE_SHIFT)
+#define PAGE_MASK		(~(PAGE_SIZE-1))
 
-#define HPAGE_SHIFT	24
-#define HPAGE_SIZE	((1UL) << HPAGE_SHIFT)
-#define HPAGE_MASK	(~(HPAGE_SIZE - 1))
+/* HW_PAGE_SHIFT is always 4k pages */
+#define HW_PAGE_SHIFT		12
+#define HW_PAGE_SIZE		(ASM_CONST(1) << HW_PAGE_SHIFT)
+#define HW_PAGE_MASK		(~(HW_PAGE_SIZE-1))
 
-#ifdef CONFIG_HUGETLB_PAGE
+/* PAGE_FACTOR is the number of bits of difference between PAGE_SHIFT and
+ * HW_PAGE_SHIFT, that is the shift needed to go from 4k hardware pages
+ * to the software page size
+ */
+#define PAGE_FACTOR		(PAGE_SHIFT - HW_PAGE_SHIFT)
+
+/* Segment size */
+#define SID_SHIFT       	28
+#define SID_MASK        	0xfffffffffUL
+#define ESID_MASK		0xfffffffff0000000UL
+#define GET_ESID(x)     	(((x) >> SID_SHIFT) & SID_MASK)
 
+/* Large page sizes */
+
+#ifndef __ASSEMBLY__
+extern unsigned int HPAGE_SHIFT;
+#define HPAGE_SIZE		((1UL) << HPAGE_SHIFT)
+#define HPAGE_MASK		(~(HPAGE_SIZE - 1))
 #define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
+#endif /* __ASSEMBLY__ */
+
+#ifdef CONFIG_HUGETLB_PAGE
 #define HTLB_AREA_SHIFT		40
 #define HTLB_AREA_SIZE		(1UL << HTLB_AREA_SHIFT)
 #define GET_HTLB_AREA(x)	((x) >> HTLB_AREA_SHIFT)
 
-#define LOW_ESID_MASK(addr, len)	(((1U << (GET_ESID(addr+len-1)+1)) \
-	   	                	- (1U << GET_ESID(addr))) & 0xffff)
-#define HTLB_AREA_MASK(addr, len)	(((1U << (GET_HTLB_AREA(addr+len-1)+1)) \
-	   	                	- (1U << GET_HTLB_AREA(addr))) & 0xffff)
+#define LOW_ESID_MASK(addr, len)    (((1U << (GET_ESID(addr+len-1)+1)) \
+	   	                      - (1U << GET_ESID(addr))) & 0xffff)
+#define HTLB_AREA_MASK(addr, len)   (((1U << (GET_HTLB_AREA(addr+len-1)+1)) \
+	   	                      - (1U << GET_HTLB_AREA(addr))) & 0xffff)
 
 #define ARCH_HAS_HUGEPAGE_ONLY_RANGE
 #define ARCH_HAS_PREPARE_HUGEPAGE_RANGE
@@ -114,7 +141,25 @@ static __inline__ void clear_page(void *addr)
 	: "ctr", "memory");
 }
 
-extern void copy_page(void *to, void *from);
+extern void copy_4K_page(void *to, void *from);
+
+#ifdef CONFIG_PPC_64K_PAGES
+static inline void copy_page(void *to, void *from)
+{
+	unsigned int i;
+	for (i = 0; i < (1 << PAGE_FACTOR); i++) {
+		copy_4K_page(to, from);
+		to += HW_PAGE_SIZE;
+		from += HW_PAGE_SIZE;
+	}
+}
+#else /* CONFIG_PPC_64K_PAGES */
+static inline void copy_page(void *to, void *from)
+{
+	copy_4K_page(to, from);
+}
+#endif /* CONFIG_PPC_64K_PAGES */
+
 struct page;
 extern void clear_user_page(void *page, unsigned long vaddr, struct page *pg);
 extern void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *p);
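The 64K copy_page() above is just sixteen consecutive 4K copies. A
userspace model (memcpy standing in for the assembly copy_4K_page) makes
the loop bound easy to verify:

    #include <string.h>

    #define MODEL_PAGE_SHIFT        16      /* CONFIG_PPC_64K_PAGES */
    #define MODEL_HW_PAGE_SIZE      4096

    static void copy_page_model(void *to, const void *from)
    {
            unsigned int i;

            /* 1 << (16 - 12) == 16 hardware pages per linux page */
            for (i = 0; i < (1u << (MODEL_PAGE_SHIFT - 12)); i++) {
                    memcpy(to, from, MODEL_HW_PAGE_SIZE);
                    to = (char *)to + MODEL_HW_PAGE_SIZE;
                    from = (const char *)from + MODEL_HW_PAGE_SIZE;
            }
    }
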
@@ -124,43 +169,75 @@ extern void copy_user_page(void *to, void *from, unsigned long vaddr, struct pag
  * These are used to make use of C type-checking.  
  * Entries in the pte table are 64b, while entries in the pgd & pmd are 32b.
  */
-typedef struct { unsigned long pte; } pte_t;
-typedef struct { unsigned long pmd; } pmd_t;
-typedef struct { unsigned long pud; } pud_t;
-typedef struct { unsigned long pgd; } pgd_t;
-typedef struct { unsigned long pgprot; } pgprot_t;
 
+/* PTE level */
+typedef struct { unsigned long pte; } pte_t;
 #define pte_val(x)	((x).pte)
-#define pmd_val(x)	((x).pmd)
-#define pud_val(x)	((x).pud)
-#define pgd_val(x)	((x).pgd)
-#define pgprot_val(x)	((x).pgprot)
-
 #define __pte(x)	((pte_t) { (x) })
+
+/* 64k pages additionally define a bigger "real PTE" type that gathers
+ * the "second half" part of the PTE for pseudo 64k pages
+ */
+#ifdef CONFIG_PPC_64K_PAGES
+typedef struct { pte_t pte; unsigned long hidx; } real_pte_t;
+#else
+typedef struct { pte_t pte; } real_pte_t;
+#endif
+
+/* PMD level */
+typedef struct { unsigned long pmd; } pmd_t;
+#define pmd_val(x)	((x).pmd)
 #define __pmd(x)	((pmd_t) { (x) })
+
+/* PUD level exists only on 4k pages */
+#ifndef CONFIG_PPC_64K_PAGES
+typedef struct { unsigned long pud; } pud_t;
+#define pud_val(x)	((x).pud)
 #define __pud(x)	((pud_t) { (x) })
+#endif
+
+/* PGD level */
+typedef struct { unsigned long pgd; } pgd_t;
+#define pgd_val(x)	((x).pgd)
 #define __pgd(x)	((pgd_t) { (x) })
+
+/* Page protection bits */
+typedef struct { unsigned long pgprot; } pgprot_t;
+#define pgprot_val(x)	((x).pgprot)
 #define __pgprot(x)	((pgprot_t) { (x) })
 
 #else
+
 /*
  * .. while these make it easier on the compiler
  */
-typedef unsigned long pte_t;
-typedef unsigned long pmd_t;
-typedef unsigned long pud_t;
-typedef unsigned long pgd_t;
-typedef unsigned long pgprot_t;
 
+typedef unsigned long pte_t;
 #define pte_val(x)	(x)
+#define __pte(x)	(x)
+
+#ifdef CONFIG_PPC_64K_PAGES
+typedef struct { pte_t pte; unsigned long hidx; } real_pte_t;
+#else
+typedef unsigned long real_pte_t;
+#endif
+
+typedef unsigned long pmd_t;
 #define pmd_val(x)	(x)
+#define __pmd(x)	(x)
+
+#ifndef CONFIG_PPC_64K_PAGES
+typedef unsigned long pud_t;
 #define pud_val(x)	(x)
+#define __pud(x)	(x)
+#endif
+
+typedef unsigned long pgd_t;
 #define pgd_val(x)	(x)
 #define pgprot_val(x)	(x)
 
-#define __pte(x)	(x)
-#define __pmd(x)	(x)
-#define __pud(x)	(x)
+typedef unsigned long pgprot_t;
 #define __pgd(x)	(x)
 #define __pgprot(x)	(x)
 
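PAGE_FACTOR is what converts between software and hardware page numbers
throughout this series; the arithmetic in isolation (standalone sketch
using the 64K configuration values, assuming 64-bit longs):

    #include <stdio.h>

    #define MODEL_PAGE_SHIFT        16      /* CONFIG_PPC_64K_PAGES */
    #define MODEL_HW_PAGE_SHIFT     12
    #define MODEL_PAGE_FACTOR       (MODEL_PAGE_SHIFT - MODEL_HW_PAGE_SHIFT)

    int main(void)
    {
            unsigned long linux_pfn = 0x1234;
            unsigned long hw_pfn = linux_pfn << MODEL_PAGE_FACTOR;

            /* one 64K linux page spans 16 consecutive 4K hardware pages */
            printf("linux pfn %#lx -> hw pfns %#lx..%#lx\n", linux_pfn,
                   hw_pfn, hw_pfn + (1UL << MODEL_PAGE_FACTOR) - 1);
            return 0;
    }
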
diff --git a/include/asm-ppc64/pgalloc.h b/include/asm-ppc64/pgalloc.h
index 26bc49c1108dfcd09cc30c6feb455e00e833e15f..98da0e4262bd2f02a6d07c3df5d2c68eb3f495f3 100644
--- a/include/asm-ppc64/pgalloc.h
+++ b/include/asm-ppc64/pgalloc.h
@@ -8,10 +8,16 @@
 
 extern kmem_cache_t *pgtable_cache[];
 
+#ifdef CONFIG_PPC_64K_PAGES
+#define PTE_CACHE_NUM	0
+#define PMD_CACHE_NUM	0
+#define PGD_CACHE_NUM	1
+#else
 #define PTE_CACHE_NUM	0
 #define PMD_CACHE_NUM	1
 #define PUD_CACHE_NUM	1
 #define PGD_CACHE_NUM	0
+#endif
 
 /*
  * This program is free software; you can redistribute it and/or
@@ -30,6 +36,8 @@ static inline void pgd_free(pgd_t *pgd)
 	kmem_cache_free(pgtable_cache[PGD_CACHE_NUM], pgd);
 }
 
+#ifndef CONFIG_PPC_64K_PAGES
+
 #define pgd_populate(MM, PGD, PUD)	pgd_set(PGD, PUD)
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
@@ -43,7 +51,30 @@ static inline void pud_free(pud_t *pud)
 	kmem_cache_free(pgtable_cache[PUD_CACHE_NUM], pud);
 }
 
-#define pud_populate(MM, PUD, PMD)	pud_set(PUD, PMD)
+static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+{
+	pud_set(pud, (unsigned long)pmd);
+}
+
+#define pmd_populate(mm, pmd, pte_page) \
+	pmd_populate_kernel(mm, pmd, page_address(pte_page))
+#define pmd_populate_kernel(mm, pmd, pte) pmd_set(pmd, (unsigned long)(pte))
+
+#else /* CONFIG_PPC_64K_PAGES */
+
+#define pud_populate(mm, pud, pmd)	pud_set(pud, (unsigned long)pmd)
+
+static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
+				       pte_t *pte)
+{
+	pmd_set(pmd, (unsigned long)pte);
+}
+
+#define pmd_populate(mm, pmd, pte_page) \
+	pmd_populate_kernel(mm, pmd, page_address(pte_page))
+
+#endif /* CONFIG_PPC_64K_PAGES */
 
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
@@ -56,17 +87,15 @@ static inline void pmd_free(pmd_t *pmd)
 	kmem_cache_free(pgtable_cache[PMD_CACHE_NUM], pmd);
 }
 
-#define pmd_populate_kernel(mm, pmd, pte) pmd_set(pmd, pte)
-#define pmd_populate(mm, pmd, pte_page) \
-	pmd_populate_kernel(mm, pmd, page_address(pte_page))
-
-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
+					  unsigned long address)
 {
 	return kmem_cache_alloc(pgtable_cache[PTE_CACHE_NUM],
 				GFP_KERNEL|__GFP_REPEAT);
 }
 
-static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
+static inline struct page *pte_alloc_one(struct mm_struct *mm,
+					 unsigned long address)
 {
 	return virt_to_page(pte_alloc_one_kernel(mm, address));
 }
@@ -103,7 +132,7 @@ static inline void pgtable_free(pgtable_free_t pgf)
 	kmem_cache_free(pgtable_cache[cachenum], p);
 }
 
-void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf);
+extern void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf);
 
 #define __pte_free_tlb(tlb, ptepage)	\
 	pgtable_free_tlb(tlb, pgtable_free_cache(page_address(ptepage), \
@@ -111,9 +140,11 @@ void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf);
 #define __pmd_free_tlb(tlb, pmd) 	\
 	pgtable_free_tlb(tlb, pgtable_free_cache(pmd, \
 		PMD_CACHE_NUM, PMD_TABLE_SIZE-1))
+#ifndef CONFIG_PPC_64K_PAGES
 #define __pud_free_tlb(tlb, pmd)	\
 	pgtable_free_tlb(tlb, pgtable_free_cache(pud, \
 		PUD_CACHE_NUM, PUD_TABLE_SIZE-1))
+#endif /* CONFIG_PPC_64K_PAGES */
 
 #define check_pgt_cache()	do { } while (0)
 
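Every populate helper above reduces to "store the address of the
next-level table into the entry"; a toy model of the idea (plain C,
simplified types, none of the kernel's masking):

    #include <stdio.h>

    typedef struct { unsigned long pmd; } pmd_model_t;

    static void pmd_set_model(pmd_model_t *pmdp, unsigned long val)
    {
            pmdp->pmd = val;
    }

    int main(void)
    {
            static unsigned long pte_table[512];    /* toy PTE page */
            pmd_model_t pmd = { 0 };

            /* what pmd_populate_kernel() boils down to */
            pmd_set_model(&pmd, (unsigned long)pte_table);
            printf("pmd entry -> %#lx\n", pmd.pmd);
            return 0;
    }
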
diff --git a/include/asm-ppc64/pgtable-4k.h b/include/asm-ppc64/pgtable-4k.h
new file mode 100644
index 0000000000000000000000000000000000000000..c883a274855878c20602cf246d45993f218ebffe
--- /dev/null
+++ b/include/asm-ppc64/pgtable-4k.h
@@ -0,0 +1,88 @@
+/*
+ * Entries per page directory level.  The PTE level must use a 64b record
+ * for each page table entry.  The PMD and PGD level use a 32b record for
+ * each entry by assuming that each entry is page aligned.
+ */
+#define PTE_INDEX_SIZE  9
+#define PMD_INDEX_SIZE  7
+#define PUD_INDEX_SIZE  7
+#define PGD_INDEX_SIZE  9
+
+#define PTE_TABLE_SIZE	(sizeof(pte_t) << PTE_INDEX_SIZE)
+#define PMD_TABLE_SIZE	(sizeof(pmd_t) << PMD_INDEX_SIZE)
+#define PUD_TABLE_SIZE	(sizeof(pud_t) << PUD_INDEX_SIZE)
+#define PGD_TABLE_SIZE	(sizeof(pgd_t) << PGD_INDEX_SIZE)
+
+#define PTRS_PER_PTE	(1 << PTE_INDEX_SIZE)
+#define PTRS_PER_PMD	(1 << PMD_INDEX_SIZE)
+#define PTRS_PER_PUD	(1 << PUD_INDEX_SIZE)
+#define PTRS_PER_PGD	(1 << PGD_INDEX_SIZE)
+
+/* PMD_SHIFT determines what a second-level page table entry can map */
+#define PMD_SHIFT	(PAGE_SHIFT + PTE_INDEX_SIZE)
+#define PMD_SIZE	(1UL << PMD_SHIFT)
+#define PMD_MASK	(~(PMD_SIZE-1))
+
+/* PUD_SHIFT determines what a third-level page table entry can map */
+#define PUD_SHIFT	(PMD_SHIFT + PMD_INDEX_SIZE)
+#define PUD_SIZE	(1UL << PUD_SHIFT)
+#define PUD_MASK	(~(PUD_SIZE-1))
+
+/* PGDIR_SHIFT determines what a fourth-level page table entry can map */
+#define PGDIR_SHIFT	(PUD_SHIFT + PUD_INDEX_SIZE)
+#define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
+#define PGDIR_MASK	(~(PGDIR_SIZE-1))
+
+/* PTE bits */
+#define _PAGE_SECONDARY 0x8000 /* software: HPTE is in secondary group */
+#define _PAGE_GROUP_IX  0x7000 /* software: HPTE index within group */
+#define _PAGE_F_SECOND  _PAGE_SECONDARY
+#define _PAGE_F_GIX     _PAGE_GROUP_IX
+
+/* PTE flags to conserve for HPTE identification */
+#define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_HASHPTE | \
+			 _PAGE_SECONDARY | _PAGE_GROUP_IX)
+
+/* PAGE_MASK gives the right answer below, but only by accident.
+ * It should be preserving the high 48 bits and then specifically
+ * preserving _PAGE_SECONDARY | _PAGE_GROUP_IX
+ */
+#define _PAGE_CHG_MASK	(PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | \
+                         _PAGE_HPTEFLAGS)
+
+/* Bits to mask out from a PMD to get to the PTE page */
+#define PMD_MASKED_BITS		0
+/* Bits to mask out from a PUD to get to the PMD page */
+#define PUD_MASKED_BITS		0
+/* Bits to mask out from a PGD to get to the PUD page */
+#define PGD_MASKED_BITS		0
+
+/* shift to put page number into pte */
+#define PTE_RPN_SHIFT	(17)
+
+#define __real_pte(e,p)		((real_pte_t)(e))
+#define __rpte_to_pte(r)	(r)
+#define __rpte_to_hidx(r,index)	(pte_val((r)) >> 12)
+
+#define pte_iterate_hashed_subpages(rpte, psize, va, index, shift)       \
+	do {							         \
+		index = 0;					         \
+		shift = mmu_psize_defs[psize].shift;		         \
+
+#define pte_iterate_hashed_end() } while(0)
+
+/*
+ * 4-level page tables related bits
+ */
+
+#define pgd_none(pgd)		(!pgd_val(pgd))
+#define pgd_bad(pgd)		(pgd_val(pgd) == 0)
+#define pgd_present(pgd)	(pgd_val(pgd) != 0)
+#define pgd_clear(pgdp)		(pgd_val(*(pgdp)) = 0)
+#define pgd_page(pgd)		(pgd_val(pgd) & ~PGD_MASKED_BITS)
+
+#define pud_offset(pgdp, addr)	\
+  (((pud_t *) pgd_page(*(pgdp))) + \
+    (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)))
+
+#define pud_ERROR(e) \
+	printk("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__, pud_val(e))
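The 4K index sizes compose as 12 + 9 + 7 + 7 + 9 = 44 bits of virtual
address space; a standalone check of the derived shifts (values copied
from the header above):

    #include <stdio.h>

    int main(void)
    {
            unsigned int page_shift = 12;
            unsigned int pte = 9, pmd = 7, pud = 7, pgd = 9;
            unsigned int pmd_shift = page_shift + pte;      /* 21: 2MB   */
            unsigned int pud_shift = pmd_shift + pmd;       /* 28: 256MB */
            unsigned int pgdir_shift = pud_shift + pud;     /* 35: 32GB  */

            printf("PMD %u, PUD %u, PGDIR %u, total %u bits\n",
                   pmd_shift, pud_shift, pgdir_shift, pgdir_shift + pgd);
            return 0;
    }
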
diff --git a/include/asm-ppc64/pgtable-64k.h b/include/asm-ppc64/pgtable-64k.h
new file mode 100644
index 0000000000000000000000000000000000000000..c5f437c86b3c7a67b264edea5b9de83dedf9001c
--- /dev/null
+++ b/include/asm-ppc64/pgtable-64k.h
@@ -0,0 +1,87 @@
+#include <asm-generic/pgtable-nopud.h>
+
+#define PTE_INDEX_SIZE  12
+#define PMD_INDEX_SIZE  12
+#define PUD_INDEX_SIZE	0
+#define PGD_INDEX_SIZE  4
+
+#define PTE_TABLE_SIZE	(sizeof(real_pte_t) << PTE_INDEX_SIZE)
+#define PMD_TABLE_SIZE	(sizeof(pmd_t) << PMD_INDEX_SIZE)
+#define PGD_TABLE_SIZE	(sizeof(pgd_t) << PGD_INDEX_SIZE)
+
+#define PTRS_PER_PTE	(1 << PTE_INDEX_SIZE)
+#define PTRS_PER_PMD	(1 << PMD_INDEX_SIZE)
+#define PTRS_PER_PGD	(1 << PGD_INDEX_SIZE)
+
+/* PMD_SHIFT determines what a second-level page table entry can map */
+#define PMD_SHIFT	(PAGE_SHIFT + PTE_INDEX_SIZE)
+#define PMD_SIZE	(1UL << PMD_SHIFT)
+#define PMD_MASK	(~(PMD_SIZE-1))
+
+/* PGDIR_SHIFT determines what a third-level page table entry can map */
+#define PGDIR_SHIFT	(PMD_SHIFT + PMD_INDEX_SIZE)
+#define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
+#define PGDIR_MASK	(~(PGDIR_SIZE-1))
+
+/* Additional PTE bits (don't change without checking asm in hash_low.S) */
+#define _PAGE_HPTE_SUB	0x0ffff000 /* combo only: sub pages HPTE bits */
+#define _PAGE_HPTE_SUB0	0x08000000 /* combo only: first sub page */
+#define _PAGE_COMBO	0x10000000 /* this is a combo 4k page */
+#define _PAGE_F_SECOND  0x00008000 /* full page: hidx bits */
+#define _PAGE_F_GIX     0x00007000 /* full page: hidx bits */
+
+/* PTE flags to conserve for HPTE identification */
+#define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_HASHPTE | _PAGE_HPTE_SUB |\
+                         _PAGE_COMBO)
+
+/* Shift to put page number into pte.
+ *
+ * That gives us a max RPN of 32 bits, which means a max of 48 bits
+ * of addressable physical space.
+ * We could get 3 more bits here by setting PTE_RPN_SHIFT to 29 but
+ * 32 makes PTEs more readable for debugging for now :)
+ */
+#define PTE_RPN_SHIFT	(32)
+#define PTE_RPN_MAX	(1UL << (64 - PTE_RPN_SHIFT))
+#define PTE_RPN_MASK	(~((1UL<<PTE_RPN_SHIFT)-1))
+
+/* _PAGE_CHG_MASK masks the bits that are to be preserved across
+ * pgprot changes
+ */
+#define _PAGE_CHG_MASK	(PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
+                         _PAGE_ACCESSED)
+
+/* Bits to mask out from a PMD to get to the PTE page */
+#define PMD_MASKED_BITS		0x1ff
+/* Bits to mask out from a PGD/PUD to get to the PMD page */
+#define PUD_MASKED_BITS		0x1ff
+
+#ifndef __ASSEMBLY__
+
+/* Manipulate "rpte" values */
+#define __real_pte(e,p) 	((real_pte_t) { \
+	(e), pte_val(*((p) + PTRS_PER_PTE)) })
+#define __rpte_to_hidx(r,index)	((pte_val((r).pte) & _PAGE_COMBO) ? \
+        (((r).hidx >> ((index)<<2)) & 0xf) : ((pte_val((r).pte) >> 12) & 0xf))
+#define __rpte_to_pte(r)	((r).pte)
+#define __rpte_sub_valid(rpte, index) \
+	(pte_val(rpte.pte) & (_PAGE_HPTE_SUB0 >> (index)))
+
+/* Trick: we set __end to va + 64k, which happens to work for a 16M
+ * page as well, since in that case we only want a single iteration
+ */
+#define pte_iterate_hashed_subpages(rpte, psize, va, index, shift)	    \
+        do {                                                                \
+                unsigned long __end = va + PAGE_SIZE;                       \
+                unsigned __split = (psize == MMU_PAGE_4K ||                 \
+				    psize == MMU_PAGE_64K_AP);              \
+                shift = mmu_psize_defs[psize].shift;                        \
+	        for (index = 0; va < __end; index++, va += (1 << shift)) {  \
+		        if (!__split || __rpte_sub_valid(rpte, index)) do { \
+
+#define pte_iterate_hashed_end() } while(0); } } while(0)
+
+#endif /* __ASSEMBLY__ */
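In the combo case a 64K linux page is backed by sixteen 4K hardware
pages, each with a 4-bit HPTE slot index packed into the hidx word; a
standalone model of the __rpte_to_hidx() shift/mask arithmetic:

    #include <stdio.h>

    /* Sub-page "index" owns bits [index*4, index*4 + 3] of hidx */
    static unsigned int rpte_to_hidx_model(unsigned long hidx,
                                           unsigned int index)
    {
            return (hidx >> (index << 2)) & 0xf;
    }

    int main(void)
    {
            unsigned long hidx = 0;
            unsigned int i;

            /* pack each of the 16 slots with a recognisable value */
            for (i = 0; i < 16; i++)
                    hidx |= (unsigned long)(i & 0xf) << (i << 2);

            printf("slot 5 -> %u, slot 15 -> %u\n",
                   rpte_to_hidx_model(hidx, 5),
                   rpte_to_hidx_model(hidx, 15));
            return 0;
    }
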
diff --git a/include/asm-ppc64/pgtable.h b/include/asm-ppc64/pgtable.h
index 8c3f574046b6cbb6310bb72cc4eaa20a464e69c0..fde93ec36abc5550e91724357ef1137debd4cbe6 100644
--- a/include/asm-ppc64/pgtable.h
+++ b/include/asm-ppc64/pgtable.h
@@ -15,40 +15,11 @@
 #include <asm/tlbflush.h>
 #endif /* __ASSEMBLY__ */
 
-/*
- * Entries per page directory level.  The PTE level must use a 64b record
- * for each page table entry.  The PMD and PGD level use a 32b record for 
- * each entry by assuming that each entry is page aligned.
- */
-#define PTE_INDEX_SIZE  9
-#define PMD_INDEX_SIZE  7
-#define PUD_INDEX_SIZE  7
-#define PGD_INDEX_SIZE  9
-
-#define PTE_TABLE_SIZE	(sizeof(pte_t) << PTE_INDEX_SIZE)
-#define PMD_TABLE_SIZE	(sizeof(pmd_t) << PMD_INDEX_SIZE)
-#define PUD_TABLE_SIZE	(sizeof(pud_t) << PUD_INDEX_SIZE)
-#define PGD_TABLE_SIZE	(sizeof(pgd_t) << PGD_INDEX_SIZE)
-
-#define PTRS_PER_PTE	(1 << PTE_INDEX_SIZE)
-#define PTRS_PER_PMD	(1 << PMD_INDEX_SIZE)
-#define PTRS_PER_PUD	(1 << PMD_INDEX_SIZE)
-#define PTRS_PER_PGD	(1 << PGD_INDEX_SIZE)
-
-/* PMD_SHIFT determines what a second-level page table entry can map */
-#define PMD_SHIFT	(PAGE_SHIFT + PTE_INDEX_SIZE)
-#define PMD_SIZE	(1UL << PMD_SHIFT)
-#define PMD_MASK	(~(PMD_SIZE-1))
-
-/* PUD_SHIFT determines what a third-level page table entry can map */
-#define PUD_SHIFT	(PMD_SHIFT + PMD_INDEX_SIZE)
-#define PUD_SIZE	(1UL << PUD_SHIFT)
-#define PUD_MASK	(~(PUD_SIZE-1))
-
-/* PGDIR_SHIFT determines what a fourth-level page table entry can map */
-#define PGDIR_SHIFT	(PUD_SHIFT + PUD_INDEX_SIZE)
-#define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
-#define PGDIR_MASK	(~(PGDIR_SIZE-1))
+#ifdef CONFIG_PPC_64K_PAGES
+#include <asm/pgtable-64k.h>
+#else
+#include <asm/pgtable-4k.h>
+#endif
 
 #define FIRST_USER_ADDRESS	0
 
@@ -75,8 +46,9 @@
 #define VMALLOC_END   (VMALLOC_START + VMALLOC_SIZE)
 
 /*
- * Bits in a linux-style PTE.  These match the bits in the
- * (hardware-defined) PowerPC PTE as closely as possible.
+ * Common bits in a linux-style PTE.  These match the bits in the
+ * (hardware-defined) PowerPC PTE as closely as possible. Additional
+ * bits may be defined in pgtable-*.h
  */
 #define _PAGE_PRESENT	0x0001 /* software: pte contains a translation */
 #define _PAGE_USER	0x0002 /* matches one of the PP bits */
@@ -91,15 +63,6 @@
 #define _PAGE_RW	0x0200 /* software: user write access allowed */
 #define _PAGE_HASHPTE	0x0400 /* software: pte has an associated HPTE */
 #define _PAGE_BUSY	0x0800 /* software: PTE & hash are busy */ 
-#define _PAGE_SECONDARY 0x8000 /* software: HPTE is in secondary group */
-#define _PAGE_GROUP_IX  0x7000 /* software: HPTE index within group */
-#define _PAGE_HUGE	0x10000 /* 16MB page */
-/* Bits 0x7000 identify the index within an HPT Group */
-#define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_HASHPTE | _PAGE_SECONDARY | _PAGE_GROUP_IX)
-/* PAGE_MASK gives the right answer below, but only by accident */
-/* It should be preserving the high 48 bits and then specifically */
-/* preserving _PAGE_SECONDARY | _PAGE_GROUP_IX */
-#define _PAGE_CHG_MASK	(PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_HPTEFLAGS)
 
 #define _PAGE_BASE	(_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_COHERENT)
 
@@ -122,10 +85,10 @@
 #define PAGE_AGP	__pgprot(_PAGE_BASE | _PAGE_WRENABLE | _PAGE_NO_CACHE)
 #define HAVE_PAGE_AGP
 
-/*
- * This bit in a hardware PTE indicates that the page is *not* executable.
- */
-#define HW_NO_EXEC	_PAGE_EXEC
+/* PTEIDX nibble */
+#define _PTEIDX_SECONDARY	0x8
+#define _PTEIDX_GROUP_IX	0x7
 
 /*
  * POWER4 and newer have per page execute protection, older chips can only
@@ -164,21 +127,10 @@ extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
 #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
 #endif /* __ASSEMBLY__ */
 
-/* shift to put page number into pte */
-#define PTE_SHIFT (17)
-
 #ifdef CONFIG_HUGETLB_PAGE
 
-#ifndef __ASSEMBLY__
-int hash_huge_page(struct mm_struct *mm, unsigned long access,
-		   unsigned long ea, unsigned long vsid, int local);
-#endif /* __ASSEMBLY__ */
-
 #define HAVE_ARCH_UNMAPPED_AREA
 #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
-#else
-
-#define hash_huge_page(mm,a,ea,vsid,local)	-1
 
 #endif
 
@@ -197,7 +149,7 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
 	pte_t pte;
 
 
-	pte_val(pte) = (pfn << PTE_SHIFT) | pgprot_val(pgprot);
+	pte_val(pte) = (pfn << PTE_RPN_SHIFT) | pgprot_val(pgprot);
 	return pte;
 }
 
@@ -209,30 +161,25 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
 
 /* pte_clear moved to later in this file */
 
-#define pte_pfn(x)		((unsigned long)((pte_val(x) >> PTE_SHIFT)))
+#define pte_pfn(x)		((unsigned long)((pte_val(x)>>PTE_RPN_SHIFT)))
 #define pte_page(x)		pfn_to_page(pte_pfn(x))
 
-#define pmd_set(pmdp, ptep) 	({BUG_ON((u64)ptep < KERNELBASE); pmd_val(*(pmdp)) = (unsigned long)(ptep);})
+#define pmd_set(pmdp, pmdval) 	(pmd_val(*(pmdp)) = (pmdval))
 #define pmd_none(pmd)		(!pmd_val(pmd))
 #define	pmd_bad(pmd)		(pmd_val(pmd) == 0)
 #define	pmd_present(pmd)	(pmd_val(pmd) != 0)
 #define	pmd_clear(pmdp)		(pmd_val(*(pmdp)) = 0)
-#define pmd_page_kernel(pmd)	(pmd_val(pmd))
+#define pmd_page_kernel(pmd)	(pmd_val(pmd) & ~PMD_MASKED_BITS)
 #define pmd_page(pmd)		virt_to_page(pmd_page_kernel(pmd))
 
-#define pud_set(pudp, pmdp)	(pud_val(*(pudp)) = (unsigned long)(pmdp))
+#define pud_set(pudp, pudval)	(pud_val(*(pudp)) = (pudval))
 #define pud_none(pud)		(!pud_val(pud))
 #define pud_bad(pud)		((pud_val(pud)) == 0)
 #define pud_present(pud)	(pud_val(pud) != 0)
 #define pud_clear(pudp)		(pud_val(*(pudp)) = 0)
-#define pud_page(pud)		(pud_val(pud))
+#define pud_page(pud)		(pud_val(pud) & ~PUD_MASKED_BITS)
 
 #define pgd_set(pgdp, pudp)	({pgd_val(*(pgdp)) = (unsigned long)(pudp);})
-#define pgd_none(pgd)		(!pgd_val(pgd))
-#define pgd_bad(pgd)		(pgd_val(pgd) == 0)
-#define pgd_present(pgd)	(pgd_val(pgd) != 0)
-#define pgd_clear(pgdp)		(pgd_val(*(pgdp)) = 0)
-#define pgd_page(pgd)		(pgd_val(pgd))
 
 /* 
  * Find an entry in a page-table-directory.  We combine the address region 
@@ -243,9 +190,6 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
 
 #define pgd_offset(mm, address)	 ((mm)->pgd + pgd_index(address))
 
-#define pud_offset(pgdp, addr)	\
-  (((pud_t *) pgd_page(*(pgdp))) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)))
-
 #define pmd_offset(pudp,addr) \
   (((pmd_t *) pud_page(*(pudp))) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)))
 
@@ -271,7 +215,6 @@ static inline int pte_exec(pte_t pte)  { return pte_val(pte) & _PAGE_EXEC;}
 static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY;}
 static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED;}
 static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE;}
-static inline int pte_huge(pte_t pte) { return pte_val(pte) & _PAGE_HUGE;}
 
 static inline void pte_uncache(pte_t pte) { pte_val(pte) |= _PAGE_NO_CACHE; }
 static inline void pte_cache(pte_t pte)   { pte_val(pte) &= ~_PAGE_NO_CACHE; }
@@ -286,7 +229,6 @@ static inline pte_t pte_mkclean(pte_t pte) {
 	pte_val(pte) &= ~(_PAGE_DIRTY); return pte; }
 static inline pte_t pte_mkold(pte_t pte) {
 	pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
-
 static inline pte_t pte_mkread(pte_t pte) {
 	pte_val(pte) |= _PAGE_USER; return pte; }
 static inline pte_t pte_mkexec(pte_t pte) {
@@ -298,7 +240,7 @@ static inline pte_t pte_mkdirty(pte_t pte) {
 static inline pte_t pte_mkyoung(pte_t pte) {
 	pte_val(pte) |= _PAGE_ACCESSED; return pte; }
 static inline pte_t pte_mkhuge(pte_t pte) {
-	pte_val(pte) |= _PAGE_HUGE; return pte; }
+	return pte; }
 
 /* Atomic PTE updates */
 static inline unsigned long pte_update(pte_t *p, unsigned long clr)
@@ -321,11 +263,13 @@ static inline unsigned long pte_update(pte_t *p, unsigned long clr)
 /* PTE updating functions, this function puts the PTE in the
  * batch, doesn't actually triggers the hash flush immediately,
  * you need to call flush_tlb_pending() to do that.
+ * Pass -1 for "normal" size (4K or 64K)
  */
-extern void hpte_update(struct mm_struct *mm, unsigned long addr, unsigned long pte,
-			int wrprot);
+extern void hpte_update(struct mm_struct *mm, unsigned long addr,
+			pte_t *ptep, unsigned long pte, int huge);
 
-static inline int __ptep_test_and_clear_young(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+static inline int __ptep_test_and_clear_young(struct mm_struct *mm,
+					      unsigned long addr, pte_t *ptep)
 {
 	unsigned long old;
 
@@ -333,7 +277,7 @@ static inline int __ptep_test_and_clear_young(struct mm_struct *mm, unsigned lon
 		return 0;
 	old = pte_update(ptep, _PAGE_ACCESSED);
 	if (old & _PAGE_HASHPTE) {
-		hpte_update(mm, addr, old, 0);
+		hpte_update(mm, addr, ptep, old, 0);
 		flush_tlb_pending();
 	}
 	return (old & _PAGE_ACCESSED) != 0;
@@ -351,7 +295,8 @@ static inline int __ptep_test_and_clear_young(struct mm_struct *mm, unsigned lon
  * moment we always flush but we need to fix hpte_update and test if the
  * optimisation is worth it.
  */
-static inline int __ptep_test_and_clear_dirty(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+static inline int __ptep_test_and_clear_dirty(struct mm_struct *mm,
+					      unsigned long addr, pte_t *ptep)
 {
 	unsigned long old;
 
@@ -359,7 +304,7 @@ static inline int __ptep_test_and_clear_dirty(struct mm_struct *mm, unsigned lon
 		return 0;
 	old = pte_update(ptep, _PAGE_DIRTY);
 	if (old & _PAGE_HASHPTE)
-		hpte_update(mm, addr, old, 0);
+		hpte_update(mm, addr, ptep, old, 0);
 	return (old & _PAGE_DIRTY) != 0;
 }
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
@@ -371,7 +316,8 @@ static inline int __ptep_test_and_clear_dirty(struct mm_struct *mm, unsigned lon
 })
 
 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
-static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
+				      pte_t *ptep)
 {
 	unsigned long old;
 
@@ -379,7 +325,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
        		return;
 	old = pte_update(ptep, _PAGE_RW);
 	if (old & _PAGE_HASHPTE)
-		hpte_update(mm, addr, old, 0);
+		hpte_update(mm, addr, ptep, old, 0);
 }
 
 /*
@@ -408,21 +354,23 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
 })
 
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
+				       unsigned long addr, pte_t *ptep)
 {
 	unsigned long old = pte_update(ptep, ~0UL);
 
 	if (old & _PAGE_HASHPTE)
-		hpte_update(mm, addr, old, 0);
+		hpte_update(mm, addr, ptep, old, 0);
 	return __pte(old);
 }
 
-static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t * ptep)
+static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
+			     pte_t * ptep)
 {
 	unsigned long old = pte_update(ptep, ~0UL);
 
 	if (old & _PAGE_HASHPTE)
-		hpte_update(mm, addr, old, 0);
+		hpte_update(mm, addr, ptep, old, 0);
 }
 
 /*
@@ -435,7 +383,14 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
 		pte_clear(mm, addr, ptep);
 		flush_tlb_pending();
 	}
-	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
+	pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
+
+#ifdef CONFIG_PPC_64K_PAGES
+	if (mmu_virtual_psize != MMU_PAGE_64K)
+		pte = __pte(pte_val(pte) | _PAGE_COMBO);
+#endif /* CONFIG_PPC_64K_PAGES */
+
+	*ptep = pte;
 }
 
 /* Set the dirty and/or accessed bits atomically in a linux PTE, this
@@ -482,8 +437,6 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 	printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
 #define pmd_ERROR(e) \
 	printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e))
-#define pud_ERROR(e) \
-	printk("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__, pud_val(e))
 #define pgd_ERROR(e) \
 	printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
 
@@ -509,12 +462,12 @@ extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t);
 /* Encode and de-code a swap entry */
 #define __swp_type(entry)	(((entry).val >> 1) & 0x3f)
 #define __swp_offset(entry)	((entry).val >> 8)
-#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
-#define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val(pte) >> PTE_SHIFT })
-#define __swp_entry_to_pte(x)	((pte_t) { (x).val << PTE_SHIFT })
-#define pte_to_pgoff(pte)	(pte_val(pte) >> PTE_SHIFT)
-#define pgoff_to_pte(off)	((pte_t) {((off) << PTE_SHIFT)|_PAGE_FILE})
-#define PTE_FILE_MAX_BITS	(BITS_PER_LONG - PTE_SHIFT)
+#define __swp_entry(type, offset) ((swp_entry_t){((type)<< 1)|((offset)<<8)})
+#define __pte_to_swp_entry(pte)	((swp_entry_t){pte_val(pte) >> PTE_RPN_SHIFT})
+#define __swp_entry_to_pte(x)	((pte_t) { (x).val << PTE_RPN_SHIFT })
+#define pte_to_pgoff(pte)	(pte_val(pte) >> PTE_RPN_SHIFT)
+#define pgoff_to_pte(off)	((pte_t) {((off) << PTE_RPN_SHIFT)|_PAGE_FILE})
+#define PTE_FILE_MAX_BITS	(BITS_PER_LONG - PTE_RPN_SHIFT)
 
 /*
  * kern_addr_valid is intended to indicate whether an address is a valid
@@ -532,29 +485,22 @@ void pgtable_cache_init(void);
 /*
  * find_linux_pte returns the address of a linux pte for a given 
  * effective address and directory.  If not found, it returns zero.
  */
 static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea)
 {
 	pgd_t *pg;
 	pud_t *pu;
 	pmd_t *pm;
 	pte_t *pt = NULL;
-	pte_t pte;
 
 	pg = pgdir + pgd_index(ea);
 	if (!pgd_none(*pg)) {
 		pu = pud_offset(pg, ea);
 		if (!pud_none(*pu)) {
 			pm = pmd_offset(pu, ea);
-			if (pmd_present(*pm)) {
+			if (pmd_present(*pm))
 				pt = pte_offset_kernel(pm, ea);
-				pte = *pt;
-				if (!pte_present(pte))
-					pt = NULL;
-			}
 		}
 	}
-
 	return pt;
 }
 
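The PTE_SHIFT to PTE_RPN_SHIFT rename threads through pfn_pte(),
pte_pfn() and the swap encodings above; a round-trip check with the 64K
value (PTE_RPN_SHIFT 32, assuming 64-bit longs):

    #include <stdio.h>

    #define MODEL_PTE_RPN_SHIFT     32      /* pgtable-64k.h value */

    int main(void)
    {
            unsigned long pfn = 0xabcd;
            unsigned long prot = 0x1;       /* e.g. _PAGE_PRESENT */
            /* what pfn_pte() computes */
            unsigned long pte = (pfn << MODEL_PTE_RPN_SHIFT) | prot;

            /* pte_pfn() recovers the pfn exactly */
            printf("pte %#lx -> pfn %#lx\n", pte,
                   pte >> MODEL_PTE_RPN_SHIFT);
            return 0;
    }
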
diff --git a/include/asm-ppc64/prom.h b/include/asm-ppc64/prom.h
index e8d0d2ab4c0f599e23c4253fbc0f42f304ef8bfe..bdb47174ff0eacde3a69089905bafa02196d3d5b 100644
--- a/include/asm-ppc64/prom.h
+++ b/include/asm-ppc64/prom.h
@@ -188,6 +188,14 @@ extern struct device_node *of_get_next_child(const struct device_node *node,
 extern struct device_node *of_node_get(struct device_node *node);
 extern void of_node_put(struct device_node *node);
 
+/* For scanning the flat device-tree at boot time */
+extern int __init of_scan_flat_dt(int (*it)(unsigned long node,
+					    const char *uname, int depth,
+					    void *data),
+				  void *data);
+extern void *__init of_get_flat_dt_prop(unsigned long node, const char *name,
+					unsigned long *size);
+
 /* For updating the device tree at runtime */
 extern void of_attach_node(struct device_node *);
 extern void of_detach_node(const struct device_node *);
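Typical use of the two flat device-tree hooks is an early-boot scan with
a callback; a hedged sketch (the property name and the recorded data are
made up for illustration, and the convention assumed here is that a
nonzero return from the callback stops the walk):

    /* kernel-context sketch, not standalone code */
    static int __init scan_example(unsigned long node, const char *uname,
                                   int depth, void *data)
    {
            unsigned long size;
            void *prop = of_get_flat_dt_prop(node, "example,prop", &size);

            if (prop == NULL)
                    return 0;       /* not here, keep walking the tree */
            *(void **)data = prop;
            return 1;               /* found it, stop the scan */
    }

    /* from an early boot path:
     *      void *result = NULL;
     *      of_scan_flat_dt(scan_example, &result);
     */
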
diff --git a/include/asm-ppc64/system.h b/include/asm-ppc64/system.h
index 99b8ca52f101ab7fea76b0c7420d4eb9ebdfd33f..0cdd66c9f4b7ea611a645646e8ccf3f75614ca86 100644
--- a/include/asm-ppc64/system.h
+++ b/include/asm-ppc64/system.h
@@ -248,7 +248,7 @@ __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new)
 }
 
 static __inline__ unsigned long
-__cmpxchg_u64(volatile long *p, unsigned long old, unsigned long new)
+__cmpxchg_u64(volatile unsigned long *p, unsigned long old, unsigned long new)
 {
 	unsigned long prev;
 
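The prototype fix simply makes the pointer type unsigned, matching the
callers; the contract itself is a plain 64-bit compare-and-exchange,
which can be modelled with a GCC builtin standing in for the ppc64
ldarx/stdcx. loop:

    /* Standalone model of the __cmpxchg_u64 contract: returns the
     * previous value, storing "new" only if *p equalled "old".
     * Not the kernel's assembly. */
    static unsigned long
    cmpxchg_u64_model(volatile unsigned long *p, unsigned long old,
                      unsigned long new)
    {
            return __sync_val_compare_and_swap(p, old, new);
    }
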
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c9b43360fd33a2347316e49f69b17553f37dd1fb..9a565808da3fac6cad64e7412aa2d12f1a75eb70 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -103,6 +103,9 @@ static int __init hugetlb_init(void)
 	unsigned long i;
 	struct page *page;
 
+	if (HPAGE_SHIFT == 0)
+		return 0;
+
 	for (i = 0; i < MAX_NUMNODES; ++i)
 		INIT_LIST_HEAD(&hugepage_freelists[i]);
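With HPAGE_SHIFT now a runtime variable, a value of 0 means boot-time
hash setup found no usable large page size, and the new guard keeps
hugetlb_init() from deriving sizes from a zero shift. A minimal model of
the same defensive pattern:

    #include <stdio.h>

    static unsigned int hpage_shift;        /* 0 until boot picks a size */

    static int hugetlb_init_model(void)
    {
            if (hpage_shift == 0)
                    return 0;       /* no large pages: nothing to set up */

            printf("huge page size: %lu bytes\n", 1UL << hpage_shift);
            return 0;
    }

    int main(void)
    {
            hugetlb_init_model();   /* silently skipped */
            hpage_shift = 24;       /* e.g. 16M, as a platform might set */
            hugetlb_init_model();
            return 0;
    }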