X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/1c79356b52d46aa6b508fb032f5ae709b1f2897b..8ad349bb6ed4a0be06e34c92be0d98b92e078db4:/osfmk/ppc/movc.s

diff --git a/osfmk/ppc/movc.s b/osfmk/ppc/movc.s
index 4d88e9cee..8284ff7bd 100644
--- a/osfmk/ppc/movc.s
+++ b/osfmk/ppc/movc.s
@@ -1,23 +1,31 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
  *
- * @APPLE_LICENSE_HEADER_START@
+ * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
  * 
- * The contents of this file constitute Original Code as defined in and
- * are subject to the Apple Public Source License Version 1.1 (the
- * "License").  You may not use this file except in compliance with the
- * License.  Please obtain a copy of the License at
- * http://www.apple.com/publicsource and read it before using this file.
- * 
- * This Original Code and all software distributed under the License are
- * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
- * License for the specific language governing rights and limitations
- * under the License.
- * 
- * @APPLE_LICENSE_HEADER_END@
+ * This file contains Original Code and/or Modifications of Original Code 
+ * as defined in and that are subject to the Apple Public Source License 
+ * Version 2.0 (the 'License'). You may not use this file except in 
+ * compliance with the License.  The rights granted to you under the 
+ * License may not be used to create, or enable the creation or 
+ * redistribution of, unlawful or unlicensed copies of an Apple operating 
+ * system, or to circumvent, violate, or enable the circumvention or 
+ * violation of, any terms of an Apple operating system software license 
+ * agreement.
+ *
+ * Please obtain a copy of the License at 
+ * http://www.opensource.apple.com/apsl/ and read it before using this 
+ * file.
+ *
+ * The Original Code and all software distributed under the License are 
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and 
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
  */
 /*
  * @OSF_COPYRIGHT@
@@ -29,594 +37,1268 @@
 #include <assym.s>
 #include <sys/errno.h>
 
+#define INSTRUMENT 0
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 /*
  * void pmap_zero_page(vm_offset_t pa)
  *
- * zero a page of physical memory.
+ * Zero a page of physical memory.  This routine runs in 32 or 64-bit mode,
+ * and handles 32 and 128-byte cache lines.
  */
 
-#if DEBUG
-	/* C debug stub in pmap.c calls this */
-ENTRY(pmap_zero_page_assembler, TAG_NO_FRAME_USED)
-#else
-ENTRY(pmap_zero_page, TAG_NO_FRAME_USED)
-#endif /* DEBUG */
-
-		mfmsr	r6								/* Get the MSR */
-		rlwinm	r7,	r6,	0,	MSR_DR_BIT+1,	MSR_DR_BIT-1	/* Turn off DR */
-		rlwinm	r7,r7,0,MSR_EE_BIT+1,MSR_EE_BIT-1	; Disable interruptions
-		li		r4,PPC_PGBYTES-CACHE_LINE_SIZE	/* Point to the end of the page */
-		mtmsr	r7								/* Set MSR to DR off */
-		isync									/* Ensure data translations are off */
 
+		.align	5
+		.globl	EXT(pmap_zero_page)
 
-.L_phys_zero_loop:	
-		subic.	r5,r4,CACHE_LINE_SIZE			/* Point to the next one */
-		dcbz	r4, r3							/* Clear the whole thing to 0s */
-		subi	r4,r5,CACHE_LINE_SIZE			/* Point to the next one */
-		dcbz	r5, r3							/* Clear the next to zeros */
-		bgt+	.L_phys_zero_loop				/* Keep going until we do the page... */
+LEXT(pmap_zero_page)
 
-		sync									/* Make sure they're all done */
-		li		r4,PPC_PGBYTES-CACHE_LINE_SIZE	/* Point to the end of the page */
+        mflr	r12								// save return address
+        bl		EXT(ml_set_physical_disabled)	// turn DR and EE off, SF on, get features in r10
+        mtlr	r12								// restore return address
+        andi.	r9,r10,pf32Byte+pf128Byte		// r9 <- cache line size
 
-.L_inst_inval_loop:	
-		subic.	r5,r4,CACHE_LINE_SIZE			/* Point to the next one */
-		icbi	r4, r3							/* Clear the whole thing to 0s */
-		subi	r4,r5,CACHE_LINE_SIZE			/* Point to the next one */
-		icbi	r5, r3							/* Clear the next to zeros */
-		bgt+	.L_inst_inval_loop				/* Keep going until we do the page... */
-
-		sync									/* Make sure they're all done */
+        subfic	r4,r9,PPC_PGBYTES				// r4 <- starting offset in page
+		
+		bt++	pf64Bitb,page0S4				// Go do the big guys...
+		
+		slwi	r3,r3,12						// get page address from page num
+		b		page_zero_1						// Jump to line aligned loop...
+
+        .align	5
+
+		nop
+		nop
+		nop
+		nop
+		nop
+		nop
+		nop
+		
+page0S4:
+		sldi	r3,r3,12						// get page address from page num
 
-		mtmsr	r6		/* Restore original translations */
-		isync			/* Ensure data translations are on */
+page_zero_1:									// loop zeroing cache lines
+        sub.	r5,r4,r9						// more to go?
+        dcbz128	r3,r4							// zero either 32 or 128 bytes
+        sub		r4,r5,r9						// generate next offset
+        dcbz128	r3,r5
+        bne--	page_zero_1
+        
+        b		EXT(ml_restore)					// restore MSR and do the isync
 
-		blr
 
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 /* void
  * phys_copy(src, dst, bytecount)
- *      vm_offset_t     src;
- *      vm_offset_t     dst;
+ *      addr64_t 	    src;
+ *      addr64_t 	    dst;
  *      int             bytecount
  *
  * This routine will copy bytecount bytes from physical address src to physical
- * address dst. 
+ * address dst.  It runs in 64-bit mode if necessary, but does not handle
+ * overlap or make any attempt to be optimal.  Length must be a signed word.
+ * Not performance critical.
  */
 
-ENTRY(phys_copy, TAG_NO_FRAME_USED)
-
-	/* Switch off data translations */
-	mfmsr	r6
-	rlwinm	r7,	r6,	0,	MSR_DR_BIT+1,	MSR_DR_BIT-1
-	rlwinm  r7,     r7,     0,      MSR_EE_BIT+1,   MSR_EE_BIT-1
-	mtmsr	r7
-	isync			/* Ensure data translations are off */
-
-	subi	r3,	r3,	4
-	subi	r4,	r4,	4
-
-	cmpwi	r5,	3
-	ble-	.L_phys_copy_bytes
-.L_phys_copy_loop:
-	lwz	r0,	4(r3)
-	addi	r3,	r3,	4
-	subi	r5,	r5,	4
-	stw	r0,	4(r4)
-	addi	r4,	r4,	4
-	cmpwi	r5,	3
-	bgt+	.L_phys_copy_loop
-
-	/* If no leftover bytes, we're done now */
-	cmpwi	r5,	0
-	beq+	.L_phys_copy_done
-	
-.L_phys_copy_bytes:
-	addi	r3,	r3,	3
-	addi	r4,	r4,	3
-.L_phys_copy_byte_loop:	
-	lbz	r0,	1(r3)
-	addi	r3,	r3,	1
-	subi	r5,	r5,	1
-	stb	r0,	1(r4)
-	addi	r4,	r4,	1
-	cmpwi	r5,	0
-	bne+	.L_phys_copy_byte_loop
-
-.L_phys_copy_done:
-	mtmsr	r6		/* Restore original translations */
-	isync			/* Ensure data translations are off */
-
-	blr
 
+		.align	5
+		.globl	EXT(phys_copy)
+
+LEXT(phys_copy)
+
+		rlwinm	r3,r3,0,1,0					; Duplicate high half of long long paddr into top of reg
+        mflr	r12								// get return address
+		rlwimi	r3,r4,0,0,31				; Combine bottom of long long to full 64-bits
+		rlwinm	r4,r5,0,1,0					; Duplicate high half of long long paddr into top of reg
+        bl		EXT(ml_set_physical_disabled)	// turn DR and EE off, SF on, get features in r10
+		rlwimi	r4,r6,0,0,31				; Combine bottom of long long to full 64-bits
+        mtlr	r12								// restore return address
+        subic.	r5,r7,4							// a word to copy?
+        b		phys_copy_2
+        
+		.align	5
+         
+phys_copy_1:									// loop copying words
+        subic.	r5,r5,4							// more to go?
+        lwz		r0,0(r3)
+        addi	r3,r3,4
+        stw		r0,0(r4)
+        addi	r4,r4,4
+phys_copy_2:
+        bge		phys_copy_1
+        addic.	r5,r5,4							// restore count
+        ble		phys_copy_4						// no more
+        
+        										// Loop is aligned here
+        
+phys_copy_3:									// loop copying bytes
+        subic.	r5,r5,1							// more to go?
+        lbz		r0,0(r3)
+        addi	r3,r3,1
+        stb		r0,0(r4)
+        addi	r4,r4,1
+        bgt		phys_copy_3
+phys_copy_4:        
+        b		EXT(ml_restore)					// restore MSR and do the isync
+
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 /* void
  * pmap_copy_page(src, dst)
- *      vm_offset_t     src;
- *      vm_offset_t     dst;
+ *      ppnum_t     src;
+ *      ppnum_t     dst;
  *
  * This routine will copy the physical page src to physical page dst
  * 
- * This routine assumes that the src and dst are page aligned and that the
- * destination is cached.
- *
- * We also must assume that noone will be executing within the destination
- * page.  We also assume that this will be used for paging
+ * This routine assumes that the src and dst are page numbers and that the
+ * destination is cached.  It runs on 32 and 64 bit processors, with and
+ * without altivec, and with 32 and 128 byte cache lines.
+ * We also must assume that no-one will be executing within the destination
+ * page, and that this will be used for paging.  Because this
+ * is a common routine, we have tuned loops for each processor class.
  *
  */
+#define	kSFSize	(FM_SIZE+160)
 
-#if DEBUG
-	/* if debug, we have a little piece of C around this
-	 * in pmap.c that gives some trace ability
-	 */
-ENTRY(pmap_copy_page_assembler, TAG_NO_FRAME_USED)
-#else
 ENTRY(pmap_copy_page, TAG_NO_FRAME_USED)
-#endif /* DEBUG */
 
-#if 0
-			mfpvr	r9							; Get the PVR
-			rlwinm	r9,r9,16,16,31				; Isolate the PPC processor
-			cmplwi	r9,PROCESSOR_VERSION_Max	; Do we have Altivec?
-			beq+	wegotaltivec				; Yeah...
-#endif
-		
-			mfmsr	r9							; Get the MSR
-			stwu	r1,-(FM_SIZE+32)(r1)		; Make a frame for us
-			rlwinm	r7,r9,0,MSR_EE_BIT+1,MSR_EE_BIT-1	; Disable interruptions
-			ori		r7,r7,lo16(MASK(MSR_FP))	; Turn on the FPU
-			mtmsr	r7							; Disable rupts and enable FPU
-			isync
-		
-			stfd	f0,FM_SIZE+0(r1)			; Save an FP register
-			rlwinm	r7,r7,0,MSR_DR_BIT+1,MSR_DR_BIT-1	; Clear the DDAT bit
-			stfd	f1,FM_SIZE+8(r1)			; Save an FP register
-			addi	r6,r3,PPC_PGBYTES			; Point to the start of the next page
-			stfd	f2,FM_SIZE+16(r1)			; Save an FP register
-			mr		r8,r4						; Save the destination
-			stfd	f3,FM_SIZE+24(r1)			; Save an FP register
-		
-			mtmsr	r7							; Set the new MSR
-			isync								; Ensure data translations are off
-
-			dcbt	br0, r3						/* Start in first input line */
-			li		r5,	CACHE_LINE_SIZE			/* Get the line size */
-
-.L_pmap_copy_page_loop:
-			dcbz	0, r4						/* Allocate a line for the output */
-			lfd		f0, 0(r3)					/* Get first 8 */
-			lfd		f1, 8(r3)					/* Get second 8 */
-			lfd		f2, 16(r3)					/* Get third 8 */
-			stfd	f0, 0(r4)					/* Put first 8 */
-			dcbt	r5, r3						/* Start next line coming in */
-			lfd		f3, 24(r3)					/* Get fourth 8 */
-			stfd	f1,	8(r4)					/* Put second 8 */
-			addi	r3,r3,CACHE_LINE_SIZE		/* Point to the next line in */
-			stfd	f2,	16(r4)					/* Put third 8 */
-			cmplw	cr0,r3,r6					/* See if we're finished yet */
-			stfd	f3,	24(r4)					/* Put fourth 8 */
-			dcbst	br0,r4						/* Force it out */
-			addi	r4,r4,CACHE_LINE_SIZE		/* Point to the next line out */
-			blt+	.L_pmap_copy_page_loop		/* Copy the whole page */
-			
-			sync								/* Make sure they're all done */
-			li		r4,PPC_PGBYTES-CACHE_LINE_SIZE	/* Point to the end of the page */
-
-invalinst:	
-			subic.	r5,r4,CACHE_LINE_SIZE		/* Point to the next one */
-			icbi	r4, r8						/* Trash the i-cache */
-			subi	r4,r5,CACHE_LINE_SIZE		/* Point to the next one */
-			icbi	r5, r8						/* Trash the i-cache */
-			bgt+	invalinst					/* Keep going until we do the page... */
-	
-			rlwimi	r7,r9,0,MSR_DR_BIT,MSR_DR_BIT	; Set DDAT if on
-			sync								; Make sure all invalidates done
-			
-			mtmsr	r7							; Set DDAT correctly
-			isync		
-			
-			lfd		f0,FM_SIZE+0(r1)			; Restore an FP register
-			lfd		f1,FM_SIZE+8(r1)			; Restore an FP register
-			lfd		f2,FM_SIZE+16(r1)			; Restore an FP register
-			lfd		f3,FM_SIZE+24(r1)			; Restore an FP register
-			
-			lwz		r1,0(r1)					; Pop up the stack
-	
-			mtmsr	r9							; Turn off FPU now and maybe rupts back on
-			isync								
-			blr
-		
-#if 0
-;
-;			This is not very optimal.  We just do it here for a test of 
-;			Altivec in the kernel.
-;
-wegotaltivec:
-			mfmsr	r9							; Get the MSR
-			lis		r8,hi16(0xC0000000)			; Make sure we keep the first 2 vector registers
-			rlwinm	r7,r9,0,MSR_EE_BIT+1,MSR_EE_BIT-1	; Disable interruptions
-			lis		r6,lo16(2*256+128)			; Specify 128 blocks of 2 vectors each
-			rlwinm	r7,r7,0,MSR_DR_BIT+1,MSR_DR_BIT-1	; Clear the DDAT bit
-			ori		r6,r6,32					; Set a 32-byte stride
-			mtsprg	256,r8						; Set VRSave
-			mtmsr	r7							; Disable rupts and turn xlate off
-			isync
-	
-			addi	r11,r3,4096					; Point to the next page
-			li		r10,16						; Get vector size
-
-avmovepg:	lvxl	v0,br0,r3					; Get first half of line
-			dcba	br0,r4						; Allocate output
-			lvxl	v1,r10,r3					; Get second half of line
-			stvxl	v0,br0,r4					; Save first half of line
-			addi	r3,r3,32					; Point to the next line
-			icbi	br0,r4						; Make the icache go away also
-			stvxl	v1,r10,r4					; Save second half of line
-			cmplw	r3,r11						; Have we reached the next page?
-			dcbst	br0,r4						; Make sure the line is on its way out
-			addi	r4,r4,32					; Point to the next line
-			blt+	avmovepg					; Move the next line...
-			
-			li		r8,0						; Clear this
-			sync								; Make sure all the memory stuff is done
-			mtsprg	256,r8						; Show we are not using VRs any more
-			mtmsr	r9							; Translation and interruptions back on
-			isync
-			blr
-#endif
+		lis		r2,hi16(MASK(MSR_VEC))			; Get the vector flag
+        mflr	r0								// get return
+ 		ori		r2,r2,lo16(MASK(MSR_FP))		; Add the FP flag
+		stw		r0,8(r1)						// save
+        stwu	r1,-kSFSize(r1)					// set up a stack frame for VRs or FPRs
+        mfmsr	r11								// save MSR at entry
+        mfsprg	r10,2							// get feature flags
+        andc	r11,r11,r2						// Clear out vec and fp
+        ori		r2,r2,lo16(MASK(MSR_EE))		// Get EE on also
+        andc	r2,r11,r2						// Clear out EE as well
+        mtcrf	0x02,r10						// we need to test pf64Bit
+        ori		r2,r2,MASK(MSR_FP)				// must enable FP for G3...
+        mtcrf	0x80,r10						// we need to test pfAltivec too
+        oris	r2,r2,hi16(MASK(MSR_VEC))		// enable altivec for G4 (ignored if G3)
+        mtmsr	r2								// turn EE off, FP and VEC on
+        isync
+        bt++	pf64Bitb,pmap_copy_64			// skip if 64-bit processor (only they take hint)
+ 		slwi	r3,r3,12						// get page address from page num
+		slwi	r4,r4,12						// get page address from page num
+        rlwinm	r12,r2,0,MSR_DR_BIT+1,MSR_DR_BIT-1	// get ready to turn off DR
+        bt		pfAltivecb,pmap_copy_g4			// altivec but not 64-bit means G4
+        
+        
+        // G3 -- copy using FPRs
+        
+        stfd	f0,FM_SIZE+0(r1)				// save the 4 FPRs we use to copy
+        stfd	f1,FM_SIZE+8(r1)
+        li		r5,PPC_PGBYTES/32				// count of cache lines in a page
+        stfd	f2,FM_SIZE+16(r1)
+        mtctr	r5
+        stfd	f3,FM_SIZE+24(r1)
+        mtmsr	r12								// turn off DR after saving FPRs on stack
+        isync
+        
+pmap_g3_copy_loop:								// loop over 32-byte cache lines
+        dcbz	0,r4							// avoid read of dest line
+        lfd		f0,0(r3)
+        lfd		f1,8(r3)
+        lfd		f2,16(r3)
+        lfd		f3,24(r3)
+        addi	r3,r3,32
+        stfd	f0,0(r4)
+        stfd	f1,8(r4)
+        stfd	f2,16(r4)
+        stfd	f3,24(r4)
+        dcbst	0,r4							// flush dest line to RAM
+        addi	r4,r4,32
+        bdnz	pmap_g3_copy_loop
+        
+        sync									// wait for stores to take
+        subi	r4,r4,PPC_PGBYTES				// restore ptr to destintation page
+        li		r6,PPC_PGBYTES-32				// point to last line in page
+pmap_g3_icache_flush:
+        subic.	r5,r6,32						// more to go?
+        icbi	r4,r6							// flush another line in icache
+        subi	r6,r5,32						// get offset to next line
+        icbi	r4,r5
+        bne		pmap_g3_icache_flush
+        
+        sync
+        mtmsr	r2								// turn DR back on
+        isync
+        lfd		f0,FM_SIZE+0(r1)				// restore the FPRs
+        lfd		f1,FM_SIZE+8(r1)
+        lfd		f2,FM_SIZE+16(r1)
+        lfd		f3,FM_SIZE+24(r1)        
+        
+        b		pmap_g4_restore					// restore MSR and done
+
+        
+        // G4 -- copy using VRs
+
+pmap_copy_g4:									// r2=(MSR-EE), r12=(r2-DR), r10=features, r11=old MSR
+        la		r9,FM_SIZE+16(r1)				// place where we save VRs to r9
+        li		r5,16							// load x-form offsets into r5-r9
+        li		r6,32							// another offset
+        stvx	v0,0,r9							// save some VRs so we can use to copy
+        li		r7,48							// another offset
+        stvx	v1,r5,r9
+        li		r0,PPC_PGBYTES/64				// we loop over 64-byte chunks
+        stvx	v2,r6,r9
+        mtctr	r0
+        li		r8,96							// get look-ahead for touch
+        stvx	v3,r7,r9
+        li		r9,128
+        mtmsr	r12								// now we've saved VRs on stack, turn off DR
+        isync									// wait for it to happen
+        b		pmap_g4_copy_loop
+        
+        .align	5								// align inner loops
+pmap_g4_copy_loop:								// loop over 64-byte chunks
+        dcbt	r3,r8							// touch 3 lines ahead
+        nop										// avoid a 17-word loop...
+        dcbt	r3,r9							// touch 4 lines ahead
+        nop										// more padding
+        dcba	0,r4							// avoid pre-fetch of 1st dest line
+        lvx		v0,0,r3							// offset 0
+        lvx		v1,r5,r3						// offset 16
+        lvx		v2,r6,r3						// offset 32
+        lvx		v3,r7,r3						// offset 48
+        addi	r3,r3,64
+        dcba	r6,r4							// avoid pre-fetch of 2nd line
+        stvx	v0,0,r4							// offset 0
+        stvx	v1,r5,r4						// offset 16
+        stvx	v2,r6,r4						// offset 32
+        stvx	v3,r7,r4						// offset 48
+        dcbf	0,r4							// push line 1
+        dcbf	r6,r4							// and line 2
+        addi	r4,r4,64
+        bdnz	pmap_g4_copy_loop
+
+        sync									// wait for stores to take
+        subi	r4,r4,PPC_PGBYTES				// restore ptr to destintation page
+        li		r8,PPC_PGBYTES-32				// point to last line in page
+pmap_g4_icache_flush:
+        subic.	r9,r8,32						// more to go?
+        icbi	r4,r8							// flush from icache
+        subi	r8,r9,32						// get offset to next line
+        icbi	r4,r9
+        bne		pmap_g4_icache_flush
+        
+        sync
+        mtmsr	r2								// turn DR back on
+        isync
+        la		r9,FM_SIZE+16(r1)				// get base of VR save area
+        lvx		v0,0,r9							// restore the VRs
+        lvx		v1,r5,r9
+        lvx		v2,r6,r9
+        lvx		v3,r7,r9        
+        
+pmap_g4_restore:								// r11=MSR
+        mtmsr	r11								// turn EE on, VEC and FR off
+        isync									// wait for it to happen
+        addi	r1,r1,kSFSize					// pop off our stack frame
+        lwz		r0,8(r1)						// restore return address
+        mtlr	r0
+        blr
+        
+        
+        // 64-bit/128-byte processor: copy using VRs
+        
+pmap_copy_64:									// r10=features, r11=old MSR
+ 		sldi	r3,r3,12						// get page address from page num
+		sldi	r4,r4,12						// get page address from page num
+		la		r9,FM_SIZE+16(r1)				// get base of VR save area
+        li		r5,16							// load x-form offsets into r5-r9
+        li		r6,32							// another offset
+        bf		pfAltivecb,pmap_novmx_copy		// altivec suppressed...
+        stvx	v0,0,r9							// save 8 VRs so we can copy wo bubbles
+        stvx	v1,r5,r9
+        li		r7,48							// another offset
+        li		r0,PPC_PGBYTES/128				// we loop over 128-byte chunks
+        stvx	v2,r6,r9
+        stvx	v3,r7,r9
+        addi	r9,r9,64						// advance base ptr so we can store another 4
+        mtctr	r0
+        li		r0,MASK(MSR_DR)					// get DR bit
+        stvx	v4,0,r9
+        stvx	v5,r5,r9
+        andc	r12,r2,r0						// turn off DR bit
+        li		r0,1							// get a 1 to slam into SF
+        stvx	v6,r6,r9
+        stvx	v7,r7,r9
+        rldimi	r12,r0,63,MSR_SF_BIT			// set SF bit (bit 0)
+        li		r8,-128							// offset so we can reach back one line
+        mtmsrd	r12								// now we've saved VRs, turn DR off and SF on
+        isync									// wait for it to happen
+        dcbt128	0,r3,1							// start a forward stream
+        b		pmap_64_copy_loop
+        
+        .align	5								// align inner loops
+pmap_64_copy_loop:								// loop over 128-byte chunks
+        dcbz128	0,r4							// avoid read of destination line
+        lvx		v0,0,r3							// offset 0
+        lvx		v1,r5,r3						// offset 16
+        lvx		v2,r6,r3						// offset 32
+        lvx		v3,r7,r3						// offset 48
+        addi	r3,r3,64						// don't have enough GPRs so add 64 2x
+        lvx		v4,0,r3							// offset 64
+        lvx		v5,r5,r3						// offset 80
+        lvx		v6,r6,r3						// offset 96
+        lvx		v7,r7,r3						// offset 112
+        addi	r3,r3,64
+        stvx	v0,0,r4							// offset 0
+        stvx	v1,r5,r4						// offset 16
+        stvx	v2,r6,r4						// offset 32
+        stvx	v3,r7,r4						// offset 48
+        addi	r4,r4,64
+        stvx	v4,0,r4							// offset 64
+        stvx	v5,r5,r4						// offset 80
+        stvx	v6,r6,r4						// offset 96
+        stvx	v7,r7,r4						// offset 112
+        addi	r4,r4,64
+        dcbf	r8,r4							// flush the line we just wrote
+        bdnz	pmap_64_copy_loop
+
+        sync									// wait for stores to take
+        subi	r4,r4,PPC_PGBYTES				// restore ptr to destintation page
+        li		r8,PPC_PGBYTES-128				// point to last line in page
+pmap_64_icache_flush:
+        subic.	r9,r8,128						// more to go?
+        icbi	r4,r8							// flush from icache
+        subi	r8,r9,128						// get offset to next line
+        icbi	r4,r9
+        bne		pmap_64_icache_flush
+        
+        sync
+        mtmsrd	r2								// turn DR back on, SF off
+        isync
+        la		r9,FM_SIZE+16(r1)				// get base address of VR save area on stack
+        lvx		v0,0,r9							// restore the VRs
+        lvx		v1,r5,r9
+        lvx		v2,r6,r9
+        lvx		v3,r7,r9
+        addi	r9,r9,64        
+        lvx		v4,0,r9
+        lvx		v5,r5,r9
+        lvx		v6,r6,r9
+        lvx		v7,r7,r9
+
+        b		pmap_g4_restore					// restore lower half of MSR and return
+
+ //
+ //		Copy on 64-bit without VMX
+ //
+
+pmap_novmx_copy:        
+		li		r0,PPC_PGBYTES/128				// we loop over 128-byte chunks
+		mtctr	r0
+		li		r0,MASK(MSR_DR)					// get DR bit
+		andc	r12,r2,r0						// turn off DR bit
+		li		r0,1							// get a 1 to slam into SF
+		rldimi	r12,r0,63,MSR_SF_BIT			// set SF bit (bit 0)
+		mtmsrd	r12								// now we've saved VRs, turn DR off and SF on
+		isync									// wait for it to happen
+		dcbt128	0,r3,1							// start a forward stream 
+       
+pmap_novmx_copy_loop:							// loop over 128-byte cache lines
+        dcbz128	0,r4							// avoid read of dest line
+        
+        ld		r0,0(r3)						// Load half a line
+        ld		r12,8(r3)
+        ld		r5,16(r3)
+        ld		r6,24(r3)
+        ld		r7,32(r3)
+        ld		r8,40(r3)
+        ld		r9,48(r3)
+        ld		r10,56(r3)
+        
+        std		r0,0(r4)						// Store half a line
+        std		r12,8(r4)
+        std		r5,16(r4)
+        std		r6,24(r4)
+        std		r7,32(r4)
+        std		r8,40(r4)
+        std		r9,48(r4)
+        std		r10,56(r4)
+        
+        ld		r0,64(r3)						// Load half a line
+        ld		r12,72(r3)
+        ld		r5,80(r3)
+        ld		r6,88(r3)
+        ld		r7,96(r3)
+        ld		r8,104(r3)
+        ld		r9,112(r3)
+        ld		r10,120(r3)
+        
+        addi	r3,r3,128
+ 
+        std		r0,64(r4)						// Store half a line
+        std		r12,72(r4)
+        std		r5,80(r4)
+        std		r6,88(r4)
+        std		r7,96(r4)
+        std		r8,104(r4)
+        std		r9,112(r4)
+        std		r10,120(r4)
+        
+        dcbf	0,r4							// flush the line we just wrote
+		addi	r4,r4,128
+        bdnz	pmap_novmx_copy_loop
+
+        sync									// wait for stores to take
+        subi	r4,r4,PPC_PGBYTES				// restore ptr to destintation page
+        li		r8,PPC_PGBYTES-128				// point to last line in page
+
+pmap_novmx_icache_flush:
+        subic.	r9,r8,128						// more to go?
+        icbi	r4,r8							// flush from icache
+        subi	r8,r9,128						// get offset to next line
+        icbi	r4,r9
+        bne		pmap_novmx_icache_flush
+        
+        sync
+        mtmsrd	r2								// turn DR back on, SF off
+        isync
+
+        b		pmap_g4_restore					// restore lower half of MSR and return
+
+
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>	
 		
-
-	
-
+// Stack frame format used by copyin, copyout, copyinstr and copyoutstr.
+// These routines all run both on 32 and 64-bit machines, though because they are called
+// by the BSD kernel they are always in 32-bit mode when entered.  The mapped ptr returned
+// by MapUserMemoryWindow will be 64 bits however on 64-bit machines.  Beware to avoid
+// using compare instructions on this ptr.  This mapped ptr is kept globally in r31, so there
+// is no need to store or load it, which are mode-dependent operations since it could be
+// 32 or 64 bits.
+
+#define	kkFrameSize	(FM_SIZE+32)
+
+#define	kkBufSize	(FM_SIZE+0)
+#define	kkCR3		(FM_SIZE+4)
+#define	kkSource	(FM_SIZE+8)
+#define	kkDest		(FM_SIZE+12)
+#define	kkCountPtr	(FM_SIZE+16)
+#define	kkR31Save	(FM_SIZE+20)
+#define	kkThrErrJmp	(FM_SIZE+24)
+ 
+ 
+// nonvolatile CR bits we use as flags in cr3
+
+#define	kk64bit		12
+#define	kkNull		13
+#define	kkIn		14
+#define	kkString	15
+#define	kkZero		15
+
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 /*
  * int
- * copyin(src, dst, count)
- *	vm_offset_t	src;
- *	vm_offset_t	dst;
- *	int		count;
+ * copyoutstr(src, dst, maxcount, count)
+ *	vm_offset_t	src;        // r3
+ *	addr64_t	dst;        // r4 and r5
+ *	vm_size_t	maxcount;   // r6
+ *	vm_size_t*	count;      // r7
  *
+ * Set *count to the number of bytes copied.
  */
 
-ENTRY2(copyin, copyinmsg, TAG_NO_FRAME_USED)
-
-/* Preamble allowing us to call a sub-function */
-		mflr	r0
-		stw		r0,FM_LR_SAVE(r1)
-		stwu	r1,-(FM_SIZE+16)(r1)
-		
-		mfmsr	r0								/* Get the MSR */
-		rlwinm	r6,r0,0,MSR_EE_BIT+1,MSR_EE_BIT-1	/* Clear 'rupts */
-		mtmsr	r6								/* Disable 'rupts */
-
-		mfsprg	r6,0							/* Get the per_proc */
-		lwz		r6,PP_CPU_DATA(r6)
-		cmpli	cr0,r5,0
-		lwz		r10,CPU_ACTIVE_THREAD(r6)
-		mtmsr	r0								/* Set 'rupts back */
-		ble-	cr0,.L_copyinout_trivial
-
-/* we know we have a valid copyin to do now */
-/* Set up thread_recover in case we hit an illegal address */
-		
-		lwz		r8,THREAD_TOP_ACT(r10)
-		lis		r11,hi16(.L_copyinout_error)
-		lwz		r8,ACT_VMMAP(r8)
-		ori		r11,r11,lo16(.L_copyinout_error)
-		add		r9,r3,r5						/* Get the end of the source */
-		lwz		r8,VMMAP_PMAP(r8)				; Get the pmap
-		rlwinm	r12,r3,6,26,29					; Get index to the segment slot
-		subi	r9,r9,1							/* Make sure we don't go too far */
-		add		r8,r8,r12						; Start indexing to the segment value
-		stw		r11,THREAD_RECOVER(r10)
-		xor		r9,r9,r3						/* Smoosh 'em together */
-		lwz		r8,PMAP_SEGS(r8)				; Get the source SR value
-		rlwinm.	r9,r9,0,1,3						/* Top nybble equal? */
-		mtsr	SR_COPYIN,r8					; Set the SR
-		isync
-#if 0
-		lis		r0,HIGH_ADDR(EXT(dbgRegsCall))	/* (TEST/DEBUG) */	
-		ori		r0,r0,LOW_ADDR(EXT(dbgRegsCall))	/* (TEST/DEBUG) */	
-		sc										/* (TEST/DEBUG) */
-#endif
-	
-/* For optimization, we check if the copyin lies on a segment
- * boundary. If it doesn't, we can use a simple copy. If it
- * does, we split it into two separate copies in some C code.
- */
-	
-		bne-	.L_call_copyin_multiple			/* Nope, we went past the segment boundary... */
+ENTRY(copyoutstr, TAG_NO_FRAME_USED)
+        mfcr	r2,0x10                         // save caller's cr3, which we use for flags
+        mr      r10,r4                          // move high word of 64-bit user address to r10
+        li		r0,0
+        crset	kkString						// flag as a string op
+        mr      r11,r5                          // move low word of 64-bit user address to r11
+        stw		r0,0(r7)						// initialize #bytes moved
+        crclr	kkIn							// flag as copyout
+        b		copyJoin
 
-		rlwinm	r3,r3,0,4,31
-		oris	r3,r3,(SR_COPYIN_NUM << (28-16))	/* Set the copyin segment as the source */
-	
-		bl		EXT(bcopy)
-		
-/* Now that copyin is done, we don't need a recovery point */
-		mfmsr	r7								/* Get the MSR */
-		rlwinm	r6,r7,0,MSR_EE_BIT+1,MSR_EE_BIT-1	/* Clear 'rupts */
-		mtmsr	r6								/* Disable 'rupts */
 
-		mfsprg	r6,0							/* Get the per_proc */
-		
-		lwz		r6,PP_CPU_DATA(r6)
-		addi	r1,r1,FM_SIZE+16
-		lwz		r10,CPU_ACTIVE_THREAD(r6)
-		mtmsr	r7								; Restore interrupts
-		li		r3,0
-		lwz		r0,FM_LR_SAVE(r1)
-		stw		r3,THREAD_RECOVER(r10)			/* Clear recovery */
-		mtlr	r0
-		blr
-	
-/* we get here via the exception handler if an illegal
- * user memory reference was made.
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+/*
+ * int
+ * copyinstr(src, dst, maxcount, count)
+ *	addr64_t	src;        // r3 and r4
+ *	vm_offset_t	dst;        // r5
+ *	vm_size_t	maxcount;   // r6
+ *	vm_size_t*	count;      // r7
+ *
+ * Set *count to the number of bytes copied
+ * If dst == NULL, don't copy, just count bytes.
+ * Only currently called from klcopyinstr. 
  */
-.L_copyinout_error:
-
-/* Now that copyin is done, we don't need a recovery point */
-	
-		mfmsr	r7								/* Get the MSR */
-		rlwinm	r6,r7,0,MSR_EE_BIT+1,MSR_EE_BIT-1	/* Clear 'rupts */
-		mtmsr	r6								/* Disable 'rupts */
-
-		mfsprg	r6,0							/* Get the per_proc */
-		
-		lwz		r6,PP_CPU_DATA(r6)
-		addi	r1,r1,FM_SIZE+16
-		lwz		r10,CPU_ACTIVE_THREAD(r6)
-		mtmsr	r7								; Restore interrupts
-		li		r4,0
-		lwz		r0,FM_LR_SAVE(r1)
-		stw		r4,THREAD_RECOVER(r10)			/* Clear recovery */
-		mtlr	r0
-		li		r3,EFAULT							; Indicate error (EFAULT) 
-		blr
-
-.L_copyinout_trivial:
-	/* The copyin/out was for either 0 bytes or a negative
-	 * number of bytes, return an appropriate value (0 == SUCCESS).
-	 * cr0 still contains result of comparison of len with 0.
-	 */
-	li	r3,	0
-	beq+	cr0,	.L_copyinout_negative
-	li	r3,	1
-.L_copyinout_negative:
-
-	/* unwind the stack */
-	addi	r1,	r1,	FM_SIZE+16
-	lwz	r0,	FM_LR_SAVE(r1)
-	mtlr	r0
-
-	blr
-
-.L_call_copyin_multiple:
-
-	/* unwind the stack */
-	addi	r1,	r1,	FM_SIZE+16
-	lwz	r0,	FM_LR_SAVE(r1)
-	mtlr	r0
-
-	b	EXT(copyin_multiple)				/* not a call - a jump! */
 
+ENTRY(copyinstr, TAG_NO_FRAME_USED)
+        mfcr	r2,0x10                         // save caller's cr3, which we use for flags
+        cmplwi	r5,0							// dst==NULL?
+        mr      r10,r3                          // move high word of 64-bit user address to r10
+        li		r0,0
+        crset	kkString						// flag as a string op
+        mr      r11,r4                          // move low word of 64-bit user address to r11
+        crmove	kkNull,cr0_eq					// remember if (dst==NULL)
+        stw		r0,0(r7)						// initialize #bytes moved
+        crset	kkIn							// flag as copyin (rather than copyout)
+        b		copyJoin1						// skip over the "crclr kkNull"
+
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 /*
  * int
  * copyout(src, dst, count)
- *	vm_offset_t	src;
- *	vm_offset_t	dst;
- *	int		count;
- *
+ *	vm_offset_t	src;        // r3
+ *	addr64_t	dst;        // r4 and r5
+ *	size_t		count;      // r6
  */
 
-ENTRY2(copyout, copyoutmsg, TAG_NO_FRAME_USED)
-
-/* Preamble allowing us to call a sub-function */
-
-		mflr	r0
-		stw		r0,FM_LR_SAVE(r1)
-		stwu	r1,-(FM_SIZE+16)(r1)
-		
-#if 0
-		stw		r3,FM_SIZE+0(r1)				/* (TEST/DEBUG) */
-		stw		r4,FM_SIZE+4(r1)				/* (TEST/DEBUG) */
-		stw		r5,FM_SIZE+8(r1)				/* (TEST/DEBUG) */
-		mr		r6,r0							/* (TEST/DEBUG) */
-		
-		bl		EXT(tracecopyout)				/* (TEST/DEBUG) */
-		
-		lwz		r3,FM_SIZE+0(r1)				/* (TEST/DEBUG) */
-		lwz		r4,FM_SIZE+4(r1)				/* (TEST/DEBUG) */
-		lwz		r5,FM_SIZE+8(r1)				/* (TEST/DEBUG) */
-#endif
-	
-		mfmsr	r7								/* Get the MSR */
-		rlwinm	r6,r7,0,MSR_EE_BIT+1,MSR_EE_BIT-1	/* Clear 'rupts */
-		mtmsr	r6								/* Disable 'rupts */
-
-		mfsprg	r6,0							/* Get the per_proc */
-		
-		lwz		r6,PP_CPU_DATA(r6)
-		cmpli	cr0,r5,0
-		lwz		r10,CPU_ACTIVE_THREAD(r6)
-		mtmsr	r7								/* Restore 'rupts */
-		ble-	cr0,.L_copyinout_trivial
-/* we know we have a valid copyout to do now */
-/* Set up thread_recover in case we hit an illegal address */
-		
-
-		lwz		r8,THREAD_TOP_ACT(r10)
-		lis		r11,HIGH_ADDR(.L_copyinout_error)
-		lwz		r8,ACT_VMMAP(r8)
-		rlwinm	r12,r4,6,26,29					; Get index to the segment slot
-		ori		r11,r11,LOW_ADDR(.L_copyinout_error)
-		add		r9,r4,r5						/* Get the end of the destination */
-		lwz		r8,VMMAP_PMAP(r8)
-		subi	r9,r9,1							/* Make sure we don't go too far */
-		add		r8,r8,r12						; Start indexing to the segment value
-		stw		r11,THREAD_RECOVER(r10)
-		xor		r9,r9,r4						/* Smoosh 'em together */
-		lwz		r8,PMAP_SEGS(r8)				; Get the source SR value
-		rlwinm.	r9,r9,0,1,3						/* Top nybble equal? */
-		mtsr	SR_COPYIN,r8
-		isync
-	
-	
-/* For optimisation, we check if the copyout lies on a segment
- * boundary. If it doesn't, we can use a simple copy. If it
- * does, we split it into two separate copies in some C code.
+			.align	5
+			.globl	EXT(copyout)
+			.globl	EXT(copyoutmsg)
+
+LEXT(copyout)
+LEXT(copyoutmsg)
+
+#if INSTRUMENT
+        mfspr	r12,pmc1						; INSTRUMENT - saveinstr[12] - Take stamp at copyout
+        stw		r12,0x6100+(12*16)+0x0(0)		; INSTRUMENT - Save it
+        mfspr	r12,pmc2						; INSTRUMENT - Get stamp
+        stw		r12,0x6100+(12*16)+0x4(0)		; INSTRUMENT - Save it
+        mfspr	r12,pmc3						; INSTRUMENT - Get stamp
+        stw		r12,0x6100+(12*16)+0x8(0)		; INSTRUMENT - Save it
+        mfspr	r12,pmc4						; INSTRUMENT - Get stamp
+        stw		r12,0x6100+(12*16)+0xC(0)		; INSTRUMENT - Save it
+#endif			
+        mfcr	r2,0x10                         // save caller's cr3, which we use for flags
+        mr      r10,r4                          // move high word of 64-bit user address to r10
+        crclr	kkString						// not a string version
+        mr      r11,r5                          // move low word of 64-bit user address to r11
+        crclr	kkIn							// flag as copyout
+        b		copyJoin
+        
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+/*
+ * int
+ * copyin(src, dst, count)
+ *	addr64_t	src;        // r3 and r4
+ *	vm_offset_t	dst;        // r5
+ *	size_t		count;      // r6
  */
-	
-		bne-	.L_call_copyout_multiple		/* Nope, we went past the segment boundary... */
 
-		rlwinm	r4,r4,0,4,31
-		oris	r4,r4,(SR_COPYIN_NUM << (28-16))	/* Set the copyin segment as the source */
-	
-		bl	EXT(bcopy)
-		
-/* Now that copyout is done, we don't need a recovery point */
-		mfmsr	r7								/* Get the MSR */
-		rlwinm	r6,r7,0,MSR_EE_BIT+1,MSR_EE_BIT-1	/* Clear 'rupts */
-		mtmsr	r6								/* Disable 'rupts */
 
-		mfsprg	r6,0							/* Get the per_proc */
-		
-		lwz		r6,PP_CPU_DATA(r6)
-		addi	r1,r1,FM_SIZE+16
-		lwz		r10,CPU_ACTIVE_THREAD(r6)
-		mtmsr	r7								; Restore interrupts
-		li		r3,0
-		lwz		r0,FM_LR_SAVE(r1)
-		stw		r3,THREAD_RECOVER(r10)			/* Clear recovery */
+			.align	5
+			.globl	EXT(copyin)
+			.globl	EXT(copyinmsg)
+
+LEXT(copyin)
+LEXT(copyinmsg)
+
+        mfcr	r2,0x10                         // save caller's cr3, which we use for flags
+        mr      r10,r3                          // move high word of 64-bit user address to r10
+        crclr	kkString						// not a string version
+        mr      r11,r4                          // move low word of 64-bit user address to r11
+        crset	kkIn							// flag as copyin
+        
+        
+// Common code to handle setup for all the copy variants:
+//		r2 = caller's cr3
+//      r3 = source if copyout
+//      r5 = dest if copyin
+//      r6 = buffer length or count
+//      r7 = count output ptr (if kkString set)
+//	   r10 = high word of 64-bit user-space address (source if copyin, dest if copyout)
+//	   r11 = low word of 64-bit user-space address
+//     cr3 = kkIn, kkString, kkNull flags
+
+copyJoin:
+        crclr	kkNull							// (dst==NULL) convention not used with this call
+copyJoin1:										// enter from copyinstr with kkNull set
+		mflr	r0								// get return address
+        cmplwi	r6,0							// buffer length 0?
+        lis		r9,0x1000						// r9 <- 0x10000000 (256MB)
+		stw		r0,FM_LR_SAVE(r1)				// save return
+        cmplw	cr1,r6,r9						// buffer length > 256MB ?
+        mfsprg	r8,2							// get the features
+        beq--	copyinout_0						// 0 length is degenerate case
+		stwu	r1,-kkFrameSize(r1)				// set up stack frame
+        stw		r2,kkCR3(r1)                    // save caller's cr3, which we use for flags
+        mtcrf	0x02,r8							// move pf64Bit to cr6
+        stw		r3,kkSource(r1)					// save args across MapUserMemoryWindow
+        stw		r5,kkDest(r1)
+        stw		r6,kkBufSize(r1)
+        crmove	kk64bit,pf64Bitb				// remember if this is a 64-bit processor
+        stw		r7,kkCountPtr(r1)
+        stw		r31,kkR31Save(r1)				// we use r31 globally for mapped user ptr
+        li		r31,0							// no mapped ptr yet
+        
+        
+// Handle buffer length > 256MB.  This is an error (ENAMETOOLONG) on copyin and copyout.
+// The string ops are passed -1 lengths by some BSD callers, so for them we silently clamp
+// the buffer length to 256MB.  This isn't an issue if the string is less than 256MB
+// (as most are!), but if they are >256MB we eventually return ENAMETOOLONG.  This restriction
+// is due to MapUserMemoryWindow; we don't want to consume more than two segments for
+// the mapping. 
+
+        ble++	cr1,copyin0						// skip if buffer length <= 256MB
+        bf		kkString,copyinout_too_big		// error if not string op
+        mr		r6,r9							// silently clamp buffer length to 256MB
+        stw		r9,kkBufSize(r1)				// update saved copy too
+
+
+// Set up thread_recover in case we hit an illegal address.
+
+copyin0:
+		mfsprg  r8,1							// Get the current thread 
+		lis		r2,hi16(copyinout_error)
+		ori		r2,r2,lo16(copyinout_error)
+		lwz		r4,THREAD_RECOVER(r8)
+		lwz		r3,ACT_VMMAP(r8)				// r3 <- vm_map virtual address
+		stw		r2,THREAD_RECOVER(r8)
+		stw		r4,kkThrErrJmp(r1)
+
+
+// Map user segment into kernel map, turn on 64-bit mode.  At this point:
+//		r3 = vm map
+//		r6 = buffer length
+// r10/r11 = 64-bit user-space ptr (source if copyin, dest if copyout)
+//
+// When we call MapUserMemoryWindow, we pass:
+//      r3 = vm map ptr
+//   r4/r5 = 64-bit user space address as an addr64_t
+        
+        mr      r4,r10                          // copy user ptr into r4/r5
+        mr      r5,r11
+#if INSTRUMENT
+        mfspr	r12,pmc1						; INSTRUMENT - saveinstr[13] - Take stamp before mapuseraddressspace
+        stw		r12,0x6100+(13*16)+0x0(0)		; INSTRUMENT - Save it
+        mfspr	r12,pmc2						; INSTRUMENT - Get stamp
+        stw		r12,0x6100+(13*16)+0x4(0)		; INSTRUMENT - Save it
+        mfspr	r12,pmc3						; INSTRUMENT - Get stamp
+        stw		r12,0x6100+(13*16)+0x8(0)		; INSTRUMENT - Save it
+        mfspr	r12,pmc4						; INSTRUMENT - Get stamp
+        stw		r12,0x6100+(13*16)+0xC(0)		; INSTRUMENT - Save it
+#endif			
+        bl		EXT(MapUserMemoryWindow)		// get r3/r4 <- 64-bit address in kernel map of user operand
+#if INSTRUMENT
+        mfspr	r12,pmc1						; INSTRUMENT - saveinstr[14] - Take stamp after mapuseraddressspace
+        stw		r12,0x6100+(14*16)+0x0(0)		; INSTRUMENT - Save it
+        mfspr	r12,pmc2						; INSTRUMENT - Get stamp
+        stw		r12,0x6100+(14*16)+0x4(0)		; INSTRUMENT - Save it
+        mfspr	r12,pmc3						; INSTRUMENT - Get stamp
+        stw		r12,0x6100+(14*16)+0x8(0)		; INSTRUMENT - Save it
+        mfspr	r12,pmc4						; INSTRUMENT - Get stamp
+        stw		r12,0x6100+(14*16)+0xC(0)		; INSTRUMENT - Save it
+#endif			
+        mr		r31,r4							// r31 <- mapped ptr into user space (may be 64-bit)
+        bf--	kk64bit,copyin1					// skip if a 32-bit processor
+ 
+ 		rldimi	r31,r3,32,0						// slam high-order bits into mapped ptr
+        mfmsr	r4								// if 64-bit, turn on SF so we can use returned ptr
+        li		r0,1
+        rldimi	r4,r0,63,MSR_SF_BIT				// light bit 0
+        mtmsrd	r4								// turn on 64-bit mode
+        isync									// wait for mode to change
+        
+        
+// Load r3-r5, substituting mapped ptr as appropriate.
+
+copyin1:
+        lwz		r5,kkBufSize(r1)				// restore length to copy
+        bf		kkIn,copyin2					// skip if copyout
+        lwz		r4,kkDest(r1)					// copyin: dest is kernel ptr
+        mr		r3,r31							// source is mapped ptr
+        b		copyin3
+copyin2:										// handle copyout
+        lwz		r3,kkSource(r1)					// source is kernel buffer (r3 at entry)
+        mr		r4,r31							// dest is mapped ptr into user space
+        
+        
+// Finally, all set up to copy:
+//		r3 = source ptr (mapped if copyin)
+//		r4 = dest ptr (mapped if copyout)
+//		r5 = length
+//	   r31 = mapped ptr returned by MapUserMemoryWindow
+//	   cr3 = kkIn, kkString, kk64bit, and kkNull flags
+
+copyin3:
+        bt		kkString,copyString				// handle copyinstr and copyoutstr
+        bl		EXT(bcopy)						// copyin and copyout: let bcopy do the work
+        li		r3,0							// return success
+        
+        
+// Main exit point for copyin, copyout, copyinstr, and copyoutstr.  Also reached
+// from error recovery if we get a DSI accessing user space.  Clear recovery ptr, 
+// and pop off frame.
+//		r3 = 0, EFAULT, or ENAMETOOLONG
+
+copyinx: 
+        lwz		r2,kkCR3(r1)                    // get callers cr3
+		mfsprg  r6,1							// Get the current thread 
+        bf--	kk64bit,copyinx1				// skip if 32-bit processor
+        mfmsr	r12
+        rldicl	r12,r12,0,MSR_SF_BIT+1			// if 64-bit processor, turn 64-bit mode off
+        mtmsrd	r12								// turn SF off
+        isync									// wait for the mode to change
+copyinx1:
+		lwz		r0,FM_LR_SAVE+kkFrameSize(r1)   // get return address
+        lwz		r31,kkR31Save(r1)				// restore callers r31
+        lwz		r4,kkThrErrJmp(r1)				// load saved thread recover
+        addi	r1,r1,kkFrameSize				// pop off our stack frame
 		mtlr	r0
+		stw		r4,THREAD_RECOVER(r6)			// restore thread recover
+        mtcrf	0x10,r2							// restore cr3
 		blr
 
-.L_call_copyout_multiple:
-	/* unwind the stack */
-	addi	r1,	r1,	FM_SIZE+16
-	lwz	r0,	FM_LR_SAVE(r1)
-	mtlr	r0
 
-	b	EXT(copyout_multiple)					/* not a call - a jump! */
+/* We get here via the exception handler if an illegal
+ * user memory reference was made.  This error handler is used by
+ * copyin, copyout, copyinstr, and copyoutstr.  Registers are as
+ * they were at point of fault, so for example cr3 flags are valid.
+ */
 
+copyinout_error:
+        li		r3,EFAULT						// return error
+        b		copyinx
+
+copyinout_0:									// degenerate case: 0-length copy
+		mtcrf	0x10,r2							// restore cr3
+        li		r3,0							// return success
+        blr
+        
+copyinout_too_big:								// degenerate case
+        mtcrf	0x10,r2							// restore cr3
+        lwz		r1,0(r1)						// pop off stack frame
+        li		r3,ENAMETOOLONG
+        blr
+        
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+// Handle copyinstr and copyoutstr.  At this point the stack frame is set up,
+// the recovery ptr is set, the user's buffer is mapped, we're in 64-bit mode
+// if necessary, and:
+//		r3 = source ptr, mapped if copyinstr
+//		r4 = dest ptr, mapped if copyoutstr
+//		r5 = buffer length
+//	   r31 = mapped ptr returned by MapUserMemoryWindow
+//     cr3 = kkIn, kkString, kkNull, and kk64bit flags
+// We do word copies unless the buffer is very short, then use a byte copy loop
+// for the leftovers if necessary.  The crossover at which the word loop becomes
+// faster is about seven bytes, counting the zero.
+//
+// We first must word-align the source ptr, in order to avoid taking a spurious
+// page fault.
+
+copyString:
+        cmplwi	cr1,r5,15						// is buffer very short?
+        mr      r12,r3                          // remember ptr to 1st source byte
+        mtctr	r5								// assuming short, set up loop count for bytes
+        blt--   cr1,copyinstr8					// too short for word loop
+        rlwinm  r2,r3,0,0x3                     // get byte offset of 1st byte within word
+        rlwinm  r9,r3,3,0x18                    // get bit offset of 1st byte within word
+        li      r7,-1
+        sub     r3,r3,r2                        // word-align source address
+        add     r6,r5,r2                        // get length starting at byte 0 in word
+        srw     r7,r7,r9                        // get mask for bytes in first word
+        srwi	r0,r6,2							// get #words in buffer
+        lwz     r5,0(r3)                        // get aligned word with first source byte
+        lis		r10,hi16(0xFEFEFEFF)			// load magic constants into r10 and r11
+        lis		r11,hi16(0x80808080)
+        mtctr	r0								// set up word loop count
+        addi    r3,r3,4                         // advance past the source word
+        ori		r10,r10,lo16(0xFEFEFEFF)
+        ori		r11,r11,lo16(0x80808080)
+        orc     r8,r5,r7                        // map bytes preceeding first source byte into 0xFF
+        bt--	kkNull,copyinstr5enter          // enter loop that just counts
+        
+// Special case 1st word, which has been 0xFF filled on left.  Note that we use
+// "and.", even though we execute both in 32 and 64-bit mode.  This is OK.
+
+        slw     r5,r5,r9                        // left justify payload bytes
+        add		r9,r10,r8						// r9 =  data + 0xFEFEFEFF
+        andc	r7,r11,r8						// r7 = ~data & 0x80808080
+		subfic  r0,r2,4							// get r0 <- #payload bytes in 1st word
+        and.    r7,r9,r7						// if r7==0, then all bytes in r8 are nonzero
+        stw     r5,0(r4)                        // copy payload bytes to dest buffer
+        add		r4,r4,r0						// then point to next byte in dest buffer
+        bdnzt   cr0_eq,copyinstr6               // use loop that copies if 0 not found
+        
+        b		copyinstr7                      // 0 found (buffer can't be full)
+        
+        
+// Word loop(s).  They do a word-parallel search for 0s, using the following
+// inobvious but very efficient test:
+//		y =  data + 0xFEFEFEFF
+//		z = ~data & 0x80808080
+// If (y & z)==0, then all bytes in dataword are nonzero.  There are two copies
+// of this loop, one that just counts and another that copies.
+//		r3 = ptr to next word of source (word aligned)
+//		r4 = ptr to next byte in buffer
+//      r6 = original buffer length (adjusted to be word origin)
+//     r10 = 0xFEFEFEFE
+//     r11 = 0x80808080
+//     r12 = ptr to 1st source byte (used to determine string length)
+
+        .align	5								// align inner loops for speed
+copyinstr5:										// version that counts but does not copy
+        lwz     r8,0(r3)						// get next word of source
+        addi    r3,r3,4                         // advance past it
+copyinstr5enter:
+        add		r9,r10,r8						// r9 =  data + 0xFEFEFEFF
+        andc	r7,r11,r8						// r7 = ~data & 0x80808080
+        and.    r7,r9,r7                        // r7 = r9 & r7 ("." ok even in 64-bit mode)
+        bdnzt   cr0_eq,copyinstr5				// if r7==0, then all bytes in r8 are nonzero
+
+        b		copyinstr7
+
+        .align	5								// align inner loops for speed
+copyinstr6:										// version that counts and copies
+        lwz     r8,0(r3)						// get next word of source
+        addi    r3,r3,4                         // advance past it
+        addi	r4,r4,4							// increment dest ptr while we wait for data
+        add		r9,r10,r8						// r9 =  data + 0xFEFEFEFF
+        andc	r7,r11,r8						// r7 = ~data & 0x80808080
+        and.    r7,r9,r7                        // r7 = r9 & r7 ("." ok even in 64-bit mode)
+        stw		r8,-4(r4)						// pack all 4 bytes into buffer
+        bdnzt	cr0_eq,copyinstr6				// if r7==0, then all bytes are nonzero
+
+
+// Either 0 found or buffer filled.  The above algorithm has mapped nonzero bytes to 0
+// and 0 bytes to 0x80 with one exception: 0x01 bytes preceeding the first 0 are also
+// mapped to 0x80.  We must mask out these false hits before searching for an 0x80 byte.
+//		r3 = word aligned ptr to next word of source (ie, r8==mem(r3-4))
+//      r6 = original buffer length (adjusted to be word origin)
+//      r7 = computed vector of 0x00 and 0x80 bytes
+//      r8 = original source word, coming from -4(r3), possibly padded with 0xFFs on left if 1st word
+//     r12 = ptr to 1st source byte (used to determine string length)
+//     cr0 = beq set iff 0 not found
+
+copyinstr7:
+        rlwinm	r2,r8,7,0,31					// move 0x01 bits to 0x80 position
+		rlwinm  r6,r6,0,0x3						// mask down to partial byte count in last word
+        andc	r7,r7,r2						// turn off false hits from 0x0100 worst case
+        crnot	kkZero,cr0_eq					// 0 found iff cr0_eq is off
+        srwi    r7,r7,8                         // we want to count the 0 as a byte xferred
+		cmpwi   r6,0							// any bytes left over in last word?
+        cntlzw	r7,r7							// now we can find the 0 byte (ie, the 0x80)
+        subi    r3,r3,4                         // back up r3 to point to 1st byte in r8
+        srwi	r7,r7,3							// convert 8,16,24,32 to 1,2,3,4
+        add     r3,r3,r7                        // now r3 points one past 0 byte, or at 1st byte not xferred
+        bt++	kkZero,copyinstr10				// 0 found, so done
+        
+        beq		copyinstr10						// r6==0, so buffer truly full
+        mtctr	r6								// 0 not found, loop over r6 bytes
+        b		copyinstr8						// enter byte loop for last 1-3 leftover bytes
+        
+
+// Byte loop.  This is used for very small buffers and for the odd bytes left over
+// after searching and copying words at a time.
+//      r3 = ptr to next byte of source
+//      r4 = ptr to next dest byte
+//     r12 = ptr to first byte of source
+//     ctr = count of bytes to check
+    
+        .align	5								// align inner loops for speed
+copyinstr8:										// loop over bytes of source
+        lbz		r0,0(r3)						// get next byte of source
+        addi	r3,r3,1
+        addi	r4,r4,1							// increment dest addr whether we store or not
+        cmpwi	r0,0							// the 0?
+        bt--	kkNull,copyinstr9				// don't store if copyinstr with NULL ptr
+        stb		r0,-1(r4)
+copyinstr9:
+        bdnzf	cr0_eq,copyinstr8				// loop if byte not 0 and more room in buffer
+        
+        crmove	kkZero,cr0_eq					// remember if 0 found or buffer filled
+
+        
+// Buffer filled or 0 found.  Unwind and return.
+//      r3 = ptr to 1st source byte not transferred
+//     r12 = ptr to 1st source byte
+//     r31 = mapped ptr returned by MapUserMemoryWindow
+//     cr3 = kkZero set iff 0 found
+
+copyinstr10:
+        lwz		r9,kkCountPtr(r1)				// get ptr to place to store count of bytes moved
+        sub     r2,r3,r12                       // compute #bytes copied (including the 0)
+        li		r3,0							// assume success return status
+        stw		r2,0(r9)						// store #bytes moved
+        bt++	kkZero,copyinx					// we did find the 0 so return 0
+        li		r3,ENAMETOOLONG					// buffer filled
+        b		copyinx							// join main exit routine
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 /*
- * boolean_t
- * copyinstr(src, dst, count, maxcount)
- *	vm_offset_t	src;
- *	vm_offset_t	dst;
- *	vm_size_t	maxcount; 
- *	vm_size_t*	count;
+ * int
+ * copypv(source, sink, size, which)
+ *	addr64_t	src;        // r3 and r4
+ *	addr64_t	dst;        // r5 and r6
+ *	size_t		size;		// r7
+ *	int			which;		// r8
  *
- * Set *count to the number of bytes copied
- * 
- * If dst == NULL, don't copy, just count bytes.
- * Only currently called from klcopyinstr. 
+ * Operand size bytes are copied from operand src into operand dst. The source and
+ * destination operand addresses are given as addr64_t, and may designate starting
+ * locations in physical or virtual memory in any combination except where both are
+ * virtual. Virtual memory locations may be in either the kernel or the current thread's
+ * address space. Operand size may be up to 256MB.
+ *
+ * Operation is controlled by operand which, which offers these options:
+ *		cppvPsrc : source operand is (1) physical or (0) virtual
+ *		cppvPsnk : destination operand is (1) physical or (0) virtual
+ *		cppvKmap : virtual operand is in (1) kernel or (0) current thread
+ *		cppvFsnk : (1) flush destination before and after transfer
+ *		cppvFsrc : (1) flush source before and after transfer
+ *		cppvNoModSnk : (1) don't set source operand's changed bit(s)
+ *		cppvNoRefSrc : (1) don't set destination operand's referenced bit(s)
+ *
+ * Implementation is now split into this new 64-bit path and the old path, hw_copypv_32().
+ * This section describes the operation of the new 64-bit path.
+ *
+ * The 64-bit path utilizes the more capacious 64-bit kernel address space to create a
+ * window in the kernel address space into all of physical RAM plus the I/O hole. Since
+ * the window's mappings specify the proper access policies for the underlying memory,
+ * the new path does not have to flush caches to avoid a cache paradox, so cppvFsnk
+ * and cppvFsrc are ignored. Physical operand adresses are relocated into the physical
+ * memory window, and are accessed with data relocation on. Virtual addresses are either
+ * within the kernel, or are mapped into the kernel address space through the user memory
+ * window. Because accesses to a virtual operand are performed with data relocation on,
+ * the new path does not have to translate the address, disable/enable interrupts, lock
+ * the mapping, or update referenced and changed bits.
+ *
+ * The IBM 970 (a.k.a. G5) processor treats real-mode accesses as guarded, so there is
+ * a substantial performance penalty for copypv operating in real mode. Utilizing the
+ * new 64-bit path, transfer performance increases >100% on the G5.
+ *
+ * The attentive reader may notice that mtmsrd ops are not followed by isync ops as 
+ * might be expected. The 970 follows PowerPC architecture version 2.01, which defines
+ * mtmsrd with L=0 as a context synchronizing op, so a following isync is no longer
+ * required.
+ *
+ * To keep things exciting, we develop 64-bit values in non-volatiles, but we also need
+ * to call 32-bit functions, which would lead to the high-order 32 bits of our values
+ * getting clobbered unless we do something special. So, we preserve our 64-bit non-volatiles
+ * in our own stack frame across calls to 32-bit functions.
+ *		
  */
 
-ENTRY(copyinstr, TAG_NO_FRAME_USED)
-
-/* Preamble allowing us to call a sub-function */
-		mflr	r0
-		stw		r0,FM_LR_SAVE(r1)
-		stwu	r1,-(FM_SIZE+16)(r1)
-
-#if 0
-		stw		r3,FM_SIZE+0(r1)				/* (TEST/DEBUG) */
-		stw		r4,FM_SIZE+4(r1)				/* (TEST/DEBUG) */
-		stw		r5,FM_SIZE+8(r1)				/* (TEST/DEBUG) */
-		stw		r6,FM_SIZE+12(r1)				/* (TEST/DEBUG) */
-		mr		r7,r0							/* (TEST/DEBUG) */
+// Map operand which bits into non-volatile CR2 and CR3 bits.
+#define whichAlign	((3+1)*4)
+#define whichMask	0x007F0000
+#define pvPsnk		(cppvPsnkb - whichAlign)
+#define pvPsrc		(cppvPsrcb - whichAlign)
+#define pvFsnk		(cppvFsnkb - whichAlign)
+#define pvFsrc		(cppvFsrcb - whichAlign)
+#define pvNoModSnk	(cppvNoModSnkb - whichAlign)
+#define pvNoRefSrc	(cppvNoRefSrcb - whichAlign)
+#define pvKmap		(cppvKmapb - whichAlign)
+#define pvNoCache	cr2_lt
+
+		.align	5
+		.globl	EXT(copypv)
+
+LEXT(copypv)
+        mfsprg	r10,2							// get feature flags
+        mtcrf	0x02,r10						// we need to test pf64Bit
+        bt++	pf64Bitb,copypv_64				// skip if 64-bit processor (only they take hint)
+        
+        b		EXT(hw_copypv_32)				// carry on with 32-bit copypv
+
+// Push a 32-bit ABI-compliant stack frame and preserve all non-volatiles that we'll clobber.        
+copypv_64:
+		mfsprg	r9,1							// get current thread
+		stwu	r1,-(FM_ALIGN((31-26+11)*4)+FM_SIZE)(r1)
+												// allocate stack frame and link it
+		mflr	r0								// get return address
+		mfcr	r10								// get cr2 and cr3
+		lwz		r12,THREAD_RECOVER(r9)			// get error callback
+		stw		r26,FM_ARG0+0x00(r1)			// save non-volatile r26
+		stw		r27,FM_ARG0+0x04(r1)			// save non-volatile r27
+		stw		r28,FM_ARG0+0x08(r1)			// save non-volatile r28
+		stw		r29,FM_ARG0+0x0C(r1)			// save non-volatile r29
+		stw		r30,FM_ARG0+0x10(r1)			// save non-volatile r30
+		stw		r31,FM_ARG0+0x14(r1)			// save non-volatile r31
+		stw		r12,FM_ARG0+0x20(r1)			// save error callback
+		stw		r0,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_LR_SAVE)(r1)
+												// save return address
+		stw		r10,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_CR_SAVE)(r1)
+												// save non-volatile cr2 and cr3
+
+// Non-volatile register usage in this routine is:
+//	r26: saved msr image
+//	r27: current pmap_t / virtual source address
+//	r28: destination virtual address
+//	r29: source address
+//	r30: destination address
+//	r31: byte count to copy
+//	cr2/3: parameter 'which' bits
+
+		rlwinm	r8,r8,whichAlign,whichMask		// align and mask which bits
+		mr		r31,r7							// copy size to somewhere non-volatile
+		mtcrf	0x20,r8							// insert which bits into cr2 and cr3
+		mtcrf	0x10,r8							// insert which bits into cr2 and cr3
+		rlwinm	r29,r3,0,1,0					// form source address high-order bits
+		rlwinm	r30,r5,0,1,0					// form destination address high-order bits
+		rlwimi	r29,r4,0,0,31					// form source address low-order bits
+		rlwimi	r30,r6,0,0,31					// form destination address low-order bits
+		crand	cr7_lt,pvPsnk,pvPsrc			// are both operand addresses physical?
+		cntlzw	r0,r31							// count leading zeroes in byte count
+		cror	cr7_eq,pvPsnk,pvPsrc			// cr7_eq <- source or destination is physical
+		bf--	cr7_eq,copypv_einval			// both operands may not be virtual
+		cmplwi	r0,4							// byte count greater than or equal 256M (2**28)?
+		blt--	copypv_einval					// byte count too big, give EINVAL
+		cmplwi	r31,0							// byte count zero?
+		beq--	copypv_zero						// early out
+		bt		cr7_lt,copypv_phys				// both operand addresses are physical
+		mr		r28,r30							// assume destination is virtual
+		bf		pvPsnk,copypv_dv				// is destination virtual?
+		mr		r28,r29							// no, so source must be virtual
+copypv_dv:
+		lis		r27,ha16(EXT(kernel_pmap))		// get kernel's pmap_t *, high-order
+		lwz		r27,lo16(EXT(kernel_pmap))(r27) // get kernel's pmap_t
+		bt		pvKmap,copypv_kern				// virtual address in kernel map?
+		lwz		r3,ACT_VMMAP(r9)				// get user's vm_map *
+		rldicl	r4,r28,32,32					// r4, r5 <- addr64_t virtual address 
+		rldicl	r5,r28,0,32
+		std		r29,FM_ARG0+0x30(r1)			// preserve 64-bit r29 across 32-bit call
+		std		r30,FM_ARG0+0x38(r1)			// preserve 64-bit r30 across 32-bit call
+		bl		EXT(MapUserMemoryWindow)		// map slice of user space into kernel space
+		ld		r29,FM_ARG0+0x30(r1)			// restore 64-bit r29
+		ld		r30,FM_ARG0+0x38(r1)			// restore 64-bit r30
+		rlwinm	r28,r3,0,1,0					// convert relocated addr64_t virtual address 
+		rlwimi	r28,r4,0,0,31					//  into a single 64-bit scalar
+copypv_kern:
+
+// Since we'll be accessing the virtual operand with data-relocation on, we won't need to 
+// update the referenced and changed bits manually after the copy. So, force the appropriate
+// flag bit on for the virtual operand.
+		crorc	pvNoModSnk,pvNoModSnk,pvPsnk	// for virtual dest, let hardware do ref/chg bits
+		crorc	pvNoRefSrc,pvNoRefSrc,pvPsrc	// for virtual source, let hardware do ref bit
 		
-		bl		EXT(tracecopystr)				/* (TEST/DEBUG) */
+// We'll be finding a mapping and looking at, so we need to disable 'rupts.
+		lis		r0,hi16(MASK(MSR_VEC))			// get vector mask
+		ori		r0,r0,lo16(MASK(MSR_FP))		// insert fp mask
+		mfmsr	r26								// save current msr
+		andc	r26,r26,r0						// turn off VEC and FP in saved copy
+		ori		r0,r0,lo16(MASK(MSR_EE))		// add EE to our mask
+		andc	r0,r26,r0						// disable EE in our new msr image
+		mtmsrd	r0								// introduce new msr image
+
+// We're now holding the virtual operand's pmap_t in r27 and its virtual address in r28. We now
+// try to find a mapping corresponding to this address in order to determine whether the address
+// is cacheable. If we don't find a mapping, we can safely assume that the operand is cacheable
+// (a non-cacheable operand must be a block mapping, which will always exist); otherwise, we
+// examine the mapping's caching-inhibited bit.
+		mr		r3,r27							// r3 <- pmap_t pmap
+		rldicl	r4,r28,32,32					// r4, r5 <- addr64_t va
+		rldicl	r5,r28,0,32
+		la		r6,FM_ARG0+0x18(r1)				// r6 <- addr64_t *nextva
+		li		r7,1							// r7 <- int full, search nested mappings
+		std		r26,FM_ARG0+0x28(r1)			// preserve 64-bit r26 across 32-bit calls
+		std		r28,FM_ARG0+0x30(r1)			// preserve 64-bit r28 across 32-bit calls
+		std		r29,FM_ARG0+0x38(r1)			// preserve 64-bit r29 across 32-bit calls
+		std		r30,FM_ARG0+0x40(r1)			// preserve 64-bit r30 across 32-bit calls
+		bl		EXT(mapping_find)				// find mapping for virtual operand
+		mr.		r3,r3							// did we find it?
+		beq		copypv_nomapping				// nope, so we'll assume it's cacheable
+		lwz		r4,mpVAddr+4(r3)				// get low half of virtual addr for hw flags
+		rlwinm.	r4,r4,0,mpIb-32,mpIb-32			// caching-inhibited bit set?
+		crnot	pvNoCache,cr0_eq				// if it is, use bcopy_nc
+		bl		EXT(mapping_drop_busy)			// drop busy on the mapping
+copypv_nomapping:
+		ld		r26,FM_ARG0+0x28(r1)			// restore 64-bit r26
+		ld		r28,FM_ARG0+0x30(r1)			// restore 64-bit r28
+		ld		r29,FM_ARG0+0x38(r1)			// restore 64-bit r29
+		ld		r30,FM_ARG0+0x40(r1)			// restore 64-bit r30
+		mtmsrd	r26								// restore msr to it's previous state
+
+// Set both the source and destination virtual addresses to the virtual operand's address --
+// we'll overlay one of them with the physical operand's address.
+		mr		r27,r28							// make virtual operand BOTH source AND destination
+
+// Now we're ready to relocate the physical operand address(es) into the physical memory window.
+// Recall that we've mapped physical memory (including the I/O hole) into the kernel's address
+// space somewhere at or over the 2**32 line. If one or both of the operands are in the I/O hole,
+// we'll set the pvNoCache flag, forcing use of non-caching bcopy_nc() to do the copy.
+copypv_phys:
+		ld		r6,lgPMWvaddr(0)				// get physical memory window virtual address
+		bf		pvPsnk,copypv_dstvirt			// is destination address virtual?
+		cntlzd	r4,r30							// count leading zeros in destination address
+		cmplwi	r4,32							// if it's 32, then it's in the I/O hole (2**30 to 2**31-1)
+		cror	pvNoCache,cr0_eq,pvNoCache		// use bcopy_nc for I/O hole locations		
+		add		r28,r30,r6						// relocate physical destination into physical window
+copypv_dstvirt:
+		bf		pvPsrc,copypv_srcvirt			// is source address virtual?
+		cntlzd	r4,r29							// count leading zeros in source address
+		cmplwi	r4,32							// if it's 32, then it's in the I/O hole (2**30 to 2**31-1)
+		cror	pvNoCache,cr0_eq,pvNoCache		// use bcopy_nc for I/O hole locations		
+		add		r27,r29,r6						// relocate physical source into physical window
+copypv_srcvirt:
+
+// Once the copy is under way (bcopy or bcopy_nc), we will want to get control if anything
+// funny happens during the copy. So, we set a pointer to our error handler in the per-thread
+// control block.
+		mfsprg	r8,1							// get current threads stuff
+		lis		r3,hi16(copypv_error)			// get our error callback's address, high
+		ori		r3,r3,lo16(copypv_error)		// get our error callback's address, low
+		stw		r3,THREAD_RECOVER(r8)			// set our error callback
 		
-		lwz		r3,FM_SIZE+0(r1)				/* (TEST/DEBUG) */
-		lwz		r4,FM_SIZE+4(r1)				/* (TEST/DEBUG) */
-		lwz		r5,FM_SIZE+8(r1)				/* (TEST/DEBUG) */
-		stw		r6,FM_SIZE+12(r1)				/* (TEST/DEBUG) */
+// Since our physical operand(s) are relocated at or above the 2**32 line, we must enter
+// 64-bit mode.
+		li		r0,1							// get a handy one bit
+		mfmsr	r3								// get current msr
+		rldimi	r3,r0,63,MSR_SF_BIT				// set SF bit on in our msr copy
+		mtmsrd	r3								// enter 64-bit mode
+
+// If requested, flush data cache
+// Note that we don't flush, the code is being saved "just in case".
+#if 0
+		bf		pvFsrc,copypv_nfs				// do we flush the source?
+		rldicl	r3,r27,32,32					// r3, r4 <- addr64_t source virtual address
+		rldicl	r4,r27,0,32
+		mr		r5,r31							// r5 <- count (in bytes)
+		li		r6,0							// r6 <- boolean phys (false, not physical)
+		bl		EXT(flush_dcache)				// flush the source operand
+copypv_nfs:
+		bf		pvFsnk,copypv_nfdx				// do we flush the destination?
+		rldicl	r3,r28,32,32					// r3, r4 <- addr64_t destination virtual address
+		rldicl	r4,r28,0,32
+		mr		r5,r31							// r5 <- count (in bytes)
+		li		r6,0							// r6 <- boolean phys (false, not physical)
+		bl		EXT(flush_dcache)				// flush the destination operand
+copypv_nfdx:
 #endif
-				
-		mfmsr	r0								/* Get the MSR */
-		rlwinm	r7,r0,0,MSR_EE_BIT+1,MSR_EE_BIT-1	/* Clear 'rupts */
-		mtmsr	r7								/* Disable 'rupts */
-
-		mfsprg	r7,0							/* Get the per_proc */
-		lwz		r7,PP_CPU_DATA(r7)
-		cmpli	cr0,r5,0
-		lwz		r10,CPU_ACTIVE_THREAD(r7)
-		mtmsr	r0								/* Restore 'rupts */
-		ble-	cr0,.L_copyinout_trivial
-
-/* we know we have a valid copyin to do now */
-/* Set up thread_recover in case we hit an illegal address */
-		
-		li		r0,0							
-		lwz		r8,THREAD_TOP_ACT(r10)
-		stw		r0,0(r6)						/* Clear result length */
-		lis		r11,HIGH_ADDR(.L_copyinout_error)
-		lwz		r8,ACT_VMMAP(r8)				; Get the map for this activation
-		rlwinm	r12,r3,6,26,29					; Get index to the segment slot
-		lwz		r8,VMMAP_PMAP(r8)
-		ori		r11,r11,LOW_ADDR(.L_copyinout_error)
-		add		r8,r8,r12						; Start indexing to the segment value
-		stw		r11,THREAD_RECOVER(r10)
-		rlwinm	r3,r3,0,4,31
-		lwz		r7,PMAP_SEGS(r8)				; Get the source SR value
-		oris	r3,r3,(SR_COPYIN_NUM << (28-16))	/* Set the copyin segment as the source */
-
-/* Copy byte by byte for now - TODO NMGS speed this up with
- * some clever (but fairly standard) logic for word copies.
- * We don't use a copyinstr_multiple since copyinstr is called
- * with INT_MAX in the linux server. Eugh.
- */
 
-		li		r9,0							/* Clear byte counter */
-
-/* If the destination is NULL, don't do writes,
- * just count bytes. We set CR7 outside the loop to save time
- */
-		cmpwi	cr7,r4,0						/* Is the destination null? */
-		
-nxtseg:	mtsr	SR_COPYIN,r7					/* Set the source SR */
-		isync
+// Call bcopy or bcopy_nc to perform the copy.
+		mr		r3,r27							// r3 <- source virtual address
+		mr		r4,r28							// r4 <- destination virtual address
+		mr		r5,r31							// r5 <- bytes to copy
+		bt		pvNoCache,copypv_nc				// take non-caching route
+		bl		EXT(bcopy)						// call bcopy to do the copying
+		b		copypv_copydone
+copypv_nc:
+		bl		EXT(bcopy_nc)					// call bcopy_nc to do the copying
+copypv_copydone:
+
+// If requested, flush data cache
+// Note that we don't flush, the code is being saved "just in case".
+#if 0
+		bf		pvFsrc,copypv_nfsx				// do we flush the source?
+		rldicl	r3,r27,32,32					// r3, r4 <- addr64_t source virtual address
+		rldicl	r4,r27,0,32
+		mr		r5,r31							// r5 <- count (in bytes)
+		li		r6,0							// r6 <- boolean phys (false, not physical)
+		bl		EXT(flush_dcache)				// flush the source operand
+copypv_nfsx:
+		bf		pvFsnk,copypv_nfd				// do we flush the destination?
+		rldicl	r3,r28,32,32					// r3, r4 <- addr64_t destination virtual address
+		rldicl	r4,r28,0,32
+		mr		r5,r31							// r5 <- count (in bytes)
+		li		r6,0							// r6 <- boolean phys (false, not physical)
+		bl		EXT(flush_dcache)				// flush the destination operand
+copypv_nfd:
+#endif
 
-.L_copyinstr_loop:
-		lbz		r0,0(r3)						/* Get the source */
-		addic.	r5,r5,-1						/* Have we gone far enough? */
-		addi	r3,r3,1							/* Bump source pointer */
+// Leave 64-bit mode.
+		mfmsr	r3								// get current msr
+		rldicl	r3,r3,0,MSR_SF_BIT+1			// clear SF bit in our copy
+		mtmsrd	r3								// leave 64-bit mode
+
+// If requested, set ref/chg on source/dest physical operand(s). It is possible that copy is
+// from/to a RAM disk situated outside of mapped physical RAM, so we check each page by calling
+// mapping_phys_lookup() before we try to set its ref/chg bits; otherwise, we might panic.
+// Note that this code is page-size sensitive, so it should probably be a part of our low-level
+// code in hw_vm.s.
+		bt		pvNoModSnk,copypv_nomod			// skip destination update if not requested
+		std		r29,FM_ARG0+0x30(r1)			// preserve 64-bit r29 across 32-bit calls
+		li		r26,1							// r26 <- 4K-page count						
+		mr		r27,r31							// r27 <- byte count
+		rlwinm	r3,r30,0,20,31					// does destination cross a page boundary?
+		subfic	r3,r3,4096						//
+		cmplw	r3,r27							// 
+		blt		copypv_modnox					// skip if not crossing case
+		subf	r27,r3,r27						// r27 <- byte count less initial fragment
+		addi	r26,r26,1						// increment page count
+copypv_modnox:
+		srdi	r3,r27,12						// pages to update (not including crosser)
+		add		r26,r26,r3						// add in crosser
+		srdi	r27,r30,12						// r27 <- destination page number
+copypv_modloop:
+		mr		r3,r27							// r3 <- destination page number				
+		la		r4,FM_ARG0+0x18(r1)				// r4 <- unsigned int *pindex
+		bl		EXT(mapping_phys_lookup)		// see if page is really there
+		mr.		r3,r3							// is it?
+		beq--	copypv_modend					// nope, break out of modify loop
+		mr		r3,r27							// r3 <- destination page number
+		bl		EXT(mapping_set_mod)			// set page changed status
+		subi	r26,r26,1						// decrement page count
+		cmpwi	r26,0							// done yet?
+		bgt		copypv_modloop					// nope, iterate
+copypv_modend:
+		ld		r29,FM_ARG0+0x30(r1)			// restore 64-bit r29
+copypv_nomod:
+		bt		pvNoRefSrc,copypv_done			// skip source update if not requested
+copypv_debugref:
+		li		r26,1							// r26 <- 4K-page count						
+		mr		r27,r31							// r27 <- byte count
+		rlwinm	r3,r29,0,20,31					// does source cross a page boundary?
+		subfic	r3,r3,4096						//
+		cmplw	r3,r27							// 
+		blt		copypv_refnox					// skip if not crossing case
+		subf	r27,r3,r27						// r27 <- byte count less initial fragment
+		addi	r26,r26,1						// increment page count
+copypv_refnox:
+		srdi	r3,r27,12						// pages to update (not including crosser)
+		add		r26,r26,r3						// add in crosser
+		srdi	r27,r29,12						// r27 <- source page number
+copypv_refloop:
+		mr		r3,r27							// r3 <- source page number
+		la		r4,FM_ARG0+0x18(r1)				// r4 <- unsigned int *pindex
+		bl		EXT(mapping_phys_lookup)		// see if page is really there
+		mr.		r3,r3							// is it?
+		beq--	copypv_done						// nope, break out of modify loop
+		mr		r3,r27							// r3 <- source  page number
+		bl		EXT(mapping_set_ref)			// set page referenced status
+		subi	r26,r26,1						// decrement page count
+		cmpwi	r26,0							// done yet?
+		bgt		copypv_refloop					// nope, iterate
 		
-		cmpwi	cr1,r0,0						/* Did we hit a null? */
-
-		beq		cr7,.L_copyinstr_no_store		/* If we are just counting, skip the store... */
-	
-		stb		r0,0(r4)						/* Move to sink */
-		addi	r4,r4,1							/* Advance sink pointer */
-
-.L_copyinstr_no_store:
-
-		addi	r9,r9,1							/* Count the character */
-		beq-	cr1,.L_copyinstr_done			/* We're done if we did a null... */
-		beq-	cr0,L_copyinstr_toobig			/* Also if we maxed the count... */
-	
-/* Check to see if the copyin pointer has moved out of the
- * copyin segment, if it has we must remap.
- */
-
-		rlwinm.	r0,r3,0,4,31					/* Did we wrap around to 0? */
-		bne+	cr0,.L_copyinstr_loop			/* Nope... */
-
-		lwz		r7,PMAP_SEGS+4(r8)				; Get the next source SR value
-		addi	r8,r8,4							; Point to the next segment
-		oris	r3,r0,(SR_COPYIN_NUM << (28-16))	/* Reset the segment number */
-		b		nxtseg							/* Keep going... */
-	
-L_copyinstr_toobig:
-		li		r3,ENAMETOOLONG
-		b		L_copyinstr_return
-.L_copyinstr_done:
-		li		r3,0							/* Normal return */
-L_copyinstr_return:
-		li		r4,0							/* to clear thread_recover */
-		stw		r9,0(r6)						/* Set how many bytes we did */
-		stw		r4,THREAD_RECOVER(r10)			/* Clear recovery exit */
-
-		addi	r1,	r1,	FM_SIZE+16
-		lwz		r0,	FM_LR_SAVE(r1)
-		mtlr	r0
-		blr
+// Return, indicating success.
+copypv_done:
+copypv_zero:
+		li		r3,0							// our efforts were crowned with success
+
+// Pop frame, restore caller's non-volatiles, clear recovery routine pointer.
+copypv_return:
+		mfsprg	r9,1							// get current threads stuff
+		lwz		r0,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_LR_SAVE)(r1)
+												// get return address
+		lwz		r4,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_CR_SAVE)(r1)
+												// get non-volatile cr2 and cr3
+		lwz		r26,FM_ARG0+0x00(r1)			// restore non-volatile r26
+		lwz		r27,FM_ARG0+0x04(r1)			// restore non-volatile r27
+		mtlr	r0								// restore return address
+		lwz		r28,FM_ARG0+0x08(r1)			// restore non-volatile r28
+		mtcrf	0x20,r4							// restore non-volatile cr2
+		mtcrf	0x10,r4							// restore non-volatile cr3
+		lwz		r11,FM_ARG0+0x20(r1)			// save error callback
+		lwz		r29,FM_ARG0+0x0C(r1)			// restore non-volatile r29
+		lwz		r30,FM_ARG0+0x10(r1)			// restore non-volatile r30
+		lwz		r31,FM_ARG0+0x14(r1)			// restore non-volatile r31
+		stw		r11,THREAD_RECOVER(r9)			// restore our error callback
+		lwz		r1,0(r1)						// release stack frame
+												
+		blr										// y'all come back now
+
+// Invalid argument handler.
+copypv_einval:
+		li		r3,EINVAL						// invalid argument
+		b		copypv_return					// return
+
+// Error encountered during bcopy or bcopy_nc.		
+copypv_error:
+		mfmsr	r3								// get current msr
+		rldicl	r3,r3,0,MSR_SF_BIT+1			// clear SF bit in our copy
+		mtmsrd	r3								// leave 64-bit mode
+		li		r3,EFAULT						// it was all his fault
+		b		copypv_return					// return