X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/d7e50217d7adf6e52786a38bcaa4cd698cb9a79e..2d21ac55c334faf3a56e5634905ed6987fc787d4:/osfmk/ppc/movc.s

diff --git a/osfmk/ppc/movc.s b/osfmk/ppc/movc.s
index 6dee8f87a..1e111ec0c 100644
--- a/osfmk/ppc/movc.s
+++ b/osfmk/ppc/movc.s
@@ -1,16 +1,19 @@
 /*
  * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
  *
- * @APPLE_LICENSE_HEADER_START@
- *
- * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this
- * file.
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
  *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
@@ -20,7 +23,7 @@
  * Please see the License for the specific language governing rights and
  * limitations under the License.
  *
- * @APPLE_LICENSE_HEADER_END@
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 /*
  * @OSF_COPYRIGHT@
@@ -464,7 +467,7 @@ pmap_novmx_icache_flush:
 // Stack frame format used by copyin, copyout, copyinstr and copyoutstr.
 // These routines all run both on 32 and 64-bit machines, though because they are called
 // by the BSD kernel they are always in 32-bit mode when entered.  The mapped ptr returned
-// by MapUserAddressSpace will be 64 bits however on 64-bit machines.  Beware to avoid
+// by MapUserMemoryWindow will be 64 bits however on 64-bit machines.  Beware to avoid
 // using compare instructions on this ptr.  This mapped ptr is kept globally in r31, so there
 // is no need to store or load it, which are mode-dependent operations since it could be
 // 32 or 64 bits.
@@ -472,11 +475,12 @@ pmap_novmx_icache_flush:
 #define kkFrameSize     (FM_SIZE+32)
 
 #define kkBufSize       (FM_SIZE+0)
-#define kkCR            (FM_SIZE+4)
+#define kkCR3           (FM_SIZE+4)
 #define kkSource        (FM_SIZE+8)
 #define kkDest          (FM_SIZE+12)
 #define kkCountPtr      (FM_SIZE+16)
 #define kkR31Save       (FM_SIZE+20)
+#define kkThrErrJmp     (FM_SIZE+24)
 
 
 // nonvolatile CR bits we use as flags in cr3
@@ -492,20 +496,21 @@ pmap_novmx_icache_flush:
 /*
  * int
  * copyoutstr(src, dst, maxcount, count)
- *      vm_offset_t     src;
- *      vm_offset_t     dst;
- *      vm_size_t       maxcount;
- *      vm_size_t*      count;
+ *      vm_offset_t     src;        // r3
+ *      addr64_t        dst;        // r4 and r5
+ *      vm_size_t       maxcount;   // r6
+ *      vm_size_t*      count;      // r7
  *
  * Set *count to the number of bytes copied.
 */
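For orientation, here is a rough C-level sketch of how a BSD caller typically drives this family of routines. The names and buffer size are illustrative only; the return conventions (0, EFAULT, ENAMETOOLONG) and the *count semantics are the ones documented in the comments above and below.

    size_t done = 0;
    char kpath[1024];
    int err;

    /* String copy from user space: scans for the terminating NUL,
       moving at most sizeof(kpath) bytes.  On success, done counts
       the bytes moved, including the NUL. */
    err = copyinstr(user_path, kpath, sizeof(kpath), &done);
    if (err == ENAMETOOLONG) {
        /* buffer filled before a NUL was found */
    } else if (err == EFAULT) {
        /* user address was unmapped or protected */
    }

    /* Fixed-length copy back out to user space: no NUL scanning. */
    err = copyout(kpath, user_path, done);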
 
 ENTRY(copyoutstr, TAG_NO_FRAME_USED)
-        mfcr    r2                          // we use nonvolatile cr3
+        mfcr    r2,0x10                     // save caller's cr3, which we use for flags
+        mr      r10,r4                      // move high word of 64-bit user address to r10
         li      r0,0
         crset   kkString                    // flag as a string op
-        mr      r10,r4                      // for copyout, dest ptr (r4) is in user space
-        stw     r0,0(r6)                    // initialize #bytes moved
+        mr      r11,r5                      // move low word of 64-bit user address to r11
+        stw     r0,0(r7)                    // initialize #bytes moved
         crclr   kkIn                        // flag as copyout
         b       copyJoin
@@ -514,10 +519,10 @@ ENTRY(copyoutstr, TAG_NO_FRAME_USED)
 /*
  * int
  * copyinstr(src, dst, maxcount, count)
- *      vm_offset_t     src;
- *      vm_offset_t     dst;
- *      vm_size_t       maxcount;
- *      vm_size_t*      count;
+ *      addr64_t        src;        // r3 and r4
+ *      vm_offset_t     dst;        // r5
+ *      vm_size_t       maxcount;   // r6
+ *      vm_size_t*      count;      // r7
 *
 * Set *count to the number of bytes copied
 * If dst == NULL, don't copy, just count bytes.
 *
@@ -525,13 +530,14 @@ ENTRY(copyoutstr, TAG_NO_FRAME_USED)
 */
 
 ENTRY(copyinstr, TAG_NO_FRAME_USED)
-        mfcr    r2                          // we use nonvolatile cr3
-        cmplwi  r4,0                        // dst==NULL?
+        mfcr    r2,0x10                     // save caller's cr3, which we use for flags
+        cmplwi  r5,0                        // dst==NULL?
+        mr      r10,r3                      // move high word of 64-bit user address to r10
         li      r0,0
         crset   kkString                    // flag as a string op
-        mr      r10,r3                      // for copyin, source ptr (r3) is in user space
+        mr      r11,r4                      // move low word of 64-bit user address to r11
         crmove  kkNull,cr0_eq               // remember if (dst==NULL)
-        stw     r0,0(r6)                    // initialize #bytes moved
+        stw     r0,0(r7)                    // initialize #bytes moved
         crset   kkIn                        // flag as copyin (rather than copyout)
         b       copyJoin1                   // skip over the "crclr kkNull"
@@ -540,9 +546,9 @@ ENTRY(copyinstr, TAG_NO_FRAME_USED)
 /*
  * int
  * copyout(src, dst, count)
- *      vm_offset_t     src;
- *      vm_offset_t     dst;
- *      size_t          count;
+ *      vm_offset_t     src;        // r3
+ *      addr64_t        dst;        // r4 and r5
+ *      size_t          count;      // r6
 */
 
        .align  5
@@ -553,18 +559,19 @@ LEXT(copyout)
 LEXT(copyoutmsg)
 
 #if INSTRUMENT
-        mfspr   r12,pmc1                    ; INSTRUMENT - saveinstr[12] - Take stamp at copyout
-        stw     r12,0x6100+(12*16)+0x0(0)   ; INSTRUMENT - Save it
-        mfspr   r12,pmc2                    ; INSTRUMENT - Get stamp
-        stw     r12,0x6100+(12*16)+0x4(0)   ; INSTRUMENT - Save it
-        mfspr   r12,pmc3                    ; INSTRUMENT - Get stamp
-        stw     r12,0x6100+(12*16)+0x8(0)   ; INSTRUMENT - Save it
-        mfspr   r12,pmc4                    ; INSTRUMENT - Get stamp
-        stw     r12,0x6100+(12*16)+0xC(0)   ; INSTRUMENT - Save it
+        mfspr   r12,pmc1                    ; INSTRUMENT - saveinstr[12] - Take stamp at copyout
+        stw     r12,0x6100+(12*16)+0x0(0)   ; INSTRUMENT - Save it
+        mfspr   r12,pmc2                    ; INSTRUMENT - Get stamp
+        stw     r12,0x6100+(12*16)+0x4(0)   ; INSTRUMENT - Save it
+        mfspr   r12,pmc3                    ; INSTRUMENT - Get stamp
+        stw     r12,0x6100+(12*16)+0x8(0)   ; INSTRUMENT - Save it
+        mfspr   r12,pmc4                    ; INSTRUMENT - Get stamp
+        stw     r12,0x6100+(12*16)+0xC(0)   ; INSTRUMENT - Save it
 #endif
-        mfcr    r2                          // save caller's CR
+        mfcr    r2,0x10                     // save caller's cr3, which we use for flags
+        mr      r10,r4                      // move high word of 64-bit user address to r10
         crclr   kkString                    // not a string version
-        mr      r10,r4                      // dest (r4) is user-space ptr
+        mr      r11,r5                      // move low word of 64-bit user address to r11
         crclr   kkIn                        // flag as copyout
         b       copyJoin
@@ -573,9 +580,9 @@ LEXT(copyoutmsg)
 /*
  * int
  * copyin(src, dst, count)
- *      vm_offset_t     src;
- *      vm_offset_t     dst;
- *      size_t          count;
+ *      addr64_t        src;        // r3 and r4
+ *      vm_offset_t     dst;        // r5
+ *      size_t          count;      // r6
 */
 
@@ -586,36 +593,41 @@ LEXT(copyoutmsg)
 
 LEXT(copyin)
 LEXT(copyinmsg)
 
-        mfcr    r2                          // save caller's CR
+        mfcr    r2,0x10                     // save caller's cr3, which we use for flags
+        mr      r10,r3                      // move high word of 64-bit user address to r10
         crclr   kkString                    // not a string version
-        mr      r10,r3                      // source (r3) is user-space ptr in copyin
+        mr      r11,r4                      // move low word of 64-bit user address to r11
         crset   kkIn                        // flag as copyin
 
 
 // Common code to handle setup for all the copy variants:
-//      r2 = caller's CR, since we use cr3
-//      r3-r6 = parameters
-//      r10 = user-space ptr (r3 if copyin, r4 if copyout)
+//      r2 = caller's cr3
+//      r3 = source if copyout
+//      r5 = dest if copyin
+//      r6 = buffer length or count
+//      r7 = count output ptr (if kkString set)
+//      r10 = high word of 64-bit user-space address (source if copyin, dest if copyout)
+//      r11 = low word of 64-bit user-space address
 //      cr3 = kkIn, kkString, kkNull flags
 
 copyJoin:
         crclr   kkNull                      // (dst==NULL) convention not used with this call
 copyJoin1:                                  // enter from copyinstr with kkNull set
         mflr    r0                          // get return address
-        cmplwi  r5,0                        // buffer length 0?
+        cmplwi  r6,0                        // buffer length 0?
         lis     r9,0x1000                   // r9 <- 0x10000000 (256MB)
         stw     r0,FM_LR_SAVE(r1)           // save return
-        cmplw   cr1,r5,r9                   // buffer length > 256MB ?
+        cmplw   cr1,r6,r9                   // buffer length > 256MB ?
         mfsprg  r8,2                        // get the features
         beq--   copyinout_0                 // 0 length is degenerate case
         stwu    r1,-kkFrameSize(r1)         // set up stack frame
-        stw     r2,kkCR(r1)                 // save caller's CR since we use cr3
+        stw     r2,kkCR3(r1)                // save caller's cr3, which we use for flags
         mtcrf   0x02,r8                     // move pf64Bit to cr6
-        stw     r3,kkSource(r1)             // save args across MapUserAddressSpace
-        stw     r4,kkDest(r1)
-        stw     r5,kkBufSize(r1)
+        stw     r3,kkSource(r1)             // save args across MapUserMemoryWindow
+        stw     r5,kkDest(r1)
+        stw     r6,kkBufSize(r1)
         crmove  kk64bit,pf64Bitb            // remember if this is a 64-bit processor
-        stw     r6,kkCountPtr(r1)
+        stw     r7,kkCountPtr(r1)
         stw     r31,kkR31Save(r1)           // we use r31 globally for mapped user ptr
         li      r31,0                       // no mapped ptr yet
 
 
@@ -624,58 +636,60 @@ copyJoin1:                                 // enter from copyinstr with kkNull set
 // The string ops are passed -1 lengths by some BSD callers, so for them we silently clamp
 // the buffer length to 256MB.  This isn't an issue if the string is less than 256MB
 // (as most are!), but if they are >256MB we eventually return ENAMETOOLONG.  This restriction
-// is due to MapUserAddressSpace; we don't want to consume more than two segments for
+// is due to MapUserMemoryWindow; we don't want to consume more than two segments for
 // the mapping.
 
         ble++   cr1,copyin0                 // skip if buffer length <= 256MB
         bf      kkString,copyinout_too_big  // error if not string op
-        mr      r5,r9                       // silently clamp buffer length to 256MB
+        mr      r6,r9                       // silently clamp buffer length to 256MB
         stw     r9,kkBufSize(r1)            // update saved copy too
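In C terms, the length screening above amounts to the following sketch (MAX_WINDOW stands for the 256MB limit that MapUserMemoryWindow imposes; the zero-length and too-big error paths, copyinout_0 and copyinout_too_big, live outside this hunk):

    if (len == 0)
        return (0);                 /* degenerate case: nothing to move  */
    if (len > MAX_WINDOW) {         /* request bigger than 256MB?        */
        if (!is_string_op)
            return (error);         /* plain copyin/copyout: reject      */
        len = MAX_WINDOW;           /* string op: clamp silently; an     */
    }                               /* over-long string will surface as  */
                                    /* ENAMETOOLONG later                */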
 
 // Set up thread_recover in case we hit an illegal address.
 
 copyin0:
-        mfsprg  r8,1                        /* Get the current act */
+        mfsprg  r8,1                        // Get the current thread
         lis     r2,hi16(copyinout_error)
-        lwz     r7,ACT_THREAD(r8)
         ori     r2,r2,lo16(copyinout_error)
+        lwz     r4,THREAD_RECOVER(r8)
         lwz     r3,ACT_VMMAP(r8)            // r3 <- vm_map virtual address
-        stw     r2,THREAD_RECOVER(r7)
+        stw     r2,THREAD_RECOVER(r8)
+        stw     r4,kkThrErrJmp(r1)
 
 
-// Map user segment into kernel map, turn on 64-bit mode.
+// Map user segment into kernel map, turn on 64-bit mode.  At this point:
 //      r3 = vm map
-//      r5 = buffer length
-//      r10 = user space ptr (r3 if copyin, r4 if copyout)
+//      r6 = buffer length
+//      r10/r11 = 64-bit user-space ptr (source if copyin, dest if copyout)
+//
+// When we call MapUserMemoryWindow, we pass:
+//      r3 = vm map ptr
+//      r4/r5 = 64-bit user space address as an addr64_t
 
-        mr      r6,r5                       // Set length to map
-        li      r4,0                        // Note: we only do this 32-bit for now
-        mr      r5,r10                      // arg2 <- user space ptr
+        mr      r4,r10                      // copy user ptr into r4/r5
+        mr      r5,r11
 #if INSTRUMENT
-        mfspr   r12,pmc1                    ; INSTRUMENT - saveinstr[13] - Take stamp before mapuseraddressspace
-        stw     r12,0x6100+(13*16)+0x0(0)   ; INSTRUMENT - Save it
-        mfspr   r12,pmc2                    ; INSTRUMENT - Get stamp
-        stw     r12,0x6100+(13*16)+0x4(0)   ; INSTRUMENT - Save it
-        mfspr   r12,pmc3                    ; INSTRUMENT - Get stamp
-        stw     r12,0x6100+(13*16)+0x8(0)   ; INSTRUMENT - Save it
-        mfspr   r12,pmc4                    ; INSTRUMENT - Get stamp
-        stw     r12,0x6100+(13*16)+0xC(0)   ; INSTRUMENT - Save it
+        mfspr   r12,pmc1                    ; INSTRUMENT - saveinstr[13] - Take stamp before mapuseraddressspace
+        stw     r12,0x6100+(13*16)+0x0(0)   ; INSTRUMENT - Save it
+        mfspr   r12,pmc2                    ; INSTRUMENT - Get stamp
+        stw     r12,0x6100+(13*16)+0x4(0)   ; INSTRUMENT - Save it
+        mfspr   r12,pmc3                    ; INSTRUMENT - Get stamp
+        stw     r12,0x6100+(13*16)+0x8(0)   ; INSTRUMENT - Save it
+        mfspr   r12,pmc4                    ; INSTRUMENT - Get stamp
+        stw     r12,0x6100+(13*16)+0xC(0)   ; INSTRUMENT - Save it
 #endif
-        bl      EXT(MapUserAddressSpace)    // set r3 <- address in kernel map of user operand
+        bl      EXT(MapUserMemoryWindow)    // get r3/r4 <- 64-bit address in kernel map of user operand
 #if INSTRUMENT
-        mfspr   r12,pmc1                    ; INSTRUMENT - saveinstr[14] - Take stamp after mapuseraddressspace
-        stw     r12,0x6100+(14*16)+0x0(0)   ; INSTRUMENT - Save it
-        mfspr   r12,pmc2                    ; INSTRUMENT - Get stamp
-        stw     r12,0x6100+(14*16)+0x4(0)   ; INSTRUMENT - Save it
-        mfspr   r12,pmc3                    ; INSTRUMENT - Get stamp
-        stw     r12,0x6100+(14*16)+0x8(0)   ; INSTRUMENT - Save it
-        mfspr   r12,pmc4                    ; INSTRUMENT - Get stamp
-        stw     r12,0x6100+(14*16)+0xC(0)   ; INSTRUMENT - Save it
+        mfspr   r12,pmc1                    ; INSTRUMENT - saveinstr[14] - Take stamp after mapuseraddressspace
+        stw     r12,0x6100+(14*16)+0x0(0)   ; INSTRUMENT - Save it
+        mfspr   r12,pmc2                    ; INSTRUMENT - Get stamp
+        stw     r12,0x6100+(14*16)+0x4(0)   ; INSTRUMENT - Save it
+        mfspr   r12,pmc3                    ; INSTRUMENT - Get stamp
+        stw     r12,0x6100+(14*16)+0x8(0)   ; INSTRUMENT - Save it
+        mfspr   r12,pmc4                    ; INSTRUMENT - Get stamp
+        stw     r12,0x6100+(14*16)+0xC(0)   ; INSTRUMENT - Save it
 #endif
-        or.     r0,r3,r4                    // Did we fail the mapping?
         mr      r31,r4                      // r31 <- mapped ptr into user space (may be 64-bit)
-        beq--   copyinout_error             // was 0, so there was an error making the mapping
         bf--    kk64bit,copyin1             // skip if a 32-bit processor
 
         rldimi  r31,r3,32,0                 // slam high-order bits into mapped ptr
@@ -691,7 +705,7 @@ copyin0:
 copyin1:
         lwz     r5,kkBufSize(r1)            // restore length to copy
         bf      kkIn,copyin2                // skip if copyout
-        lwz     r4,kkDest(r1)               // copyin: source is mapped, dest is r4 at entry
+        lwz     r4,kkDest(r1)               // copyin: dest is kernel ptr
         mr      r3,r31                      // source is mapped ptr
         b       copyin3
 copyin2:                                    // handle copyout
@@ -703,7 +717,7 @@ copyin2:                                   // handle copyout
 //      r3 = source ptr (mapped if copyin)
 //      r4 = dest ptr (mapped if copyout)
 //      r5 = length
-//      r31 = mapped ptr returned by MapUserAddressSpace
+//      r31 = mapped ptr returned by MapUserMemoryWindow
 //      cr3 = kkIn, kkString, kk64bit, and kkNull flags
 
 copyin3:
@@ -714,29 +728,24 @@ copyin3:
 
 // Main exit point for copyin, copyout, copyinstr, and copyoutstr.  Also reached
 // from error recovery if we get a DSI accessing user space.  Clear recovery ptr,
-// and pop off frame.  Note that we have kept
-// the mapped ptr into user space in r31, as a reg64_t type (ie, a 64-bit ptr on
-// 64-bit machines.)  We must unpack r31 into an addr64_t in (r3,r4) before passing
-// it to ReleaseUserAddressSpace.
+// and pop off frame.
 //      r3 = 0, EFAULT, or ENAMETOOLONG
 
 copyinx:
-        lwz     r2,kkCR(r1)                 // get caller's cr3
-        mfsprg  r6,1                        // Get the current act
-        lwz     r10,ACT_THREAD(r6)
-
+        lwz     r2,kkCR3(r1)                // get caller's cr3
+        mfsprg  r6,1                        // Get the current thread
         bf--    kk64bit,copyinx1            // skip if 32-bit processor
         mfmsr   r12
         rldicl  r12,r12,0,MSR_SF_BIT+1      // if 64-bit processor, turn 64-bit mode off
-        mtmsrd  r12                         // turn SF off and EE back on
+        mtmsrd  r12                         // turn SF off
         isync                               // wait for the mode to change
 copyinx1:
+        lwz     r0,FM_LR_SAVE+kkFrameSize(r1)   // get return address
         lwz     r31,kkR31Save(r1)           // restore caller's r31
+        lwz     r4,kkThrErrJmp(r1)          // load saved thread recover
         addi    r1,r1,kkFrameSize           // pop off our stack frame
-        lwz     r0,FM_LR_SAVE(r1)
-        li      r4,0
-        stw     r4,THREAD_RECOVER(r10)      // Clear recovery
         mtlr    r0
+        stw     r4,THREAD_RECOVER(r6)       // restore thread recover
         mtcrf   0x10,r2                     // restore cr3
         blr
@@ -770,55 +779,85 @@ copyinout_too_big:                         // degenerate case
 //      r3 = source ptr, mapped if copyinstr
 //      r4 = dest ptr, mapped if copyoutstr
 //      r5 = buffer length
-//      r31 = mapped ptr returned by MapUserAddressSpace
+//      r31 = mapped ptr returned by MapUserMemoryWindow
 //      cr3 = kkIn, kkString, kkNull, and kk64bit flags
 
 // We do word copies unless the buffer is very short, then use a byte copy loop
-// for the leftovers if necessary.
+// for the leftovers if necessary.  The crossover at which the word loop becomes
+// faster is about seven bytes, counting the zero.
+//
+// We first must word-align the source ptr, in order to avoid taking a spurious
+// page fault.
 
 copyString:
-        li      r12,0                       // Set header bytes count to zero
-        cmplwi  cr1,r5,20                   // is buffer very short?
+        cmplwi  cr1,r5,15                   // is buffer very short?
+        mr      r12,r3                      // remember ptr to 1st source byte
         mtctr   r5                          // assuming short, set up loop count for bytes
-        blt     cr1,copyinstr8              // too short for word loop
-        andi.   r12,r3,0x3                  // is source ptr word aligned?
-        bne     copyinstr11                 // bytes loop
-copyinstr1:
-        srwi    r6,r5,2                     // get #words in buffer
-        mtctr   r6                          // set up word loop count
+        blt--   cr1,copyinstr8              // too short for word loop
+        rlwinm  r2,r3,0,0x3                 // get byte offset of 1st byte within word
+        rlwinm  r9,r3,3,0x18                // get bit offset of 1st byte within word
+        li      r7,-1
+        sub     r3,r3,r2                    // word-align source address
+        add     r6,r5,r2                    // get length starting at byte 0 in word
+        srw     r7,r7,r9                    // get mask for bytes in first word
+        srwi    r0,r6,2                     // get #words in buffer
+        lwz     r5,0(r3)                    // get aligned word with first source byte
         lis     r10,hi16(0xFEFEFEFF)        // load magic constants into r10 and r11
         lis     r11,hi16(0x80808080)
+        mtctr   r0                          // set up word loop count
+        addi    r3,r3,4                     // advance past the source word
         ori     r10,r10,lo16(0xFEFEFEFF)
         ori     r11,r11,lo16(0x80808080)
-        bf      kkNull,copyinstr6           // enter loop that copies
-        b       copyinstr5                  // use loop that just counts
+        orc     r8,r5,r7                    // map bytes preceding first source byte into 0xFF
+        bt--    kkNull,copyinstr5enter      // enter loop that just counts
+
+// Special case 1st word, which has been 0xFF filled on left.  Note that we use
+// "and.", even though we execute both in 32 and 64-bit mode.  This is OK.
+
+        slw     r5,r5,r9                    // left justify payload bytes
+        add     r9,r10,r8                   // r9 =  data + 0xFEFEFEFF
+        andc    r7,r11,r8                   // r7 = ~data & 0x80808080
+        subfic  r0,r2,4                     // get r0 <- #payload bytes in 1st word
+        and.    r7,r9,r7                    // if r7==0, then all bytes in r8 are nonzero
+        stw     r5,0(r4)                    // copy payload bytes to dest buffer
+        add     r4,r4,r0                    // then point to next byte in dest buffer
+        bdnzt   cr0_eq,copyinstr6           // use loop that copies if 0 not found
+
+        b       copyinstr7                  // 0 found (buffer can't be full)
 
 
 // Word loop(s).  They do a word-parallel search for 0s, using the following
 // inobvious but very efficient test:
 //      y =  data + 0xFEFEFEFF
 //      z = ~data & 0x80808080
-// If (y & z)==0, then all bytes in dataword are nonzero.  We need two copies of
-// this loop, since if we test kkNull in the loop then it becomes 9 words long.
+// If (y & z)==0, then all bytes in dataword are nonzero.  There are two copies
+// of this loop, one that just counts and another that copies.
+//      r3 = ptr to next word of source (word aligned)
+//      r4 = ptr to next byte in buffer
+//      r6 = original buffer length (adjusted to be word origin)
+//      r10 = 0xFEFEFEFF
+//      r11 = 0x80808080
+//      r12 = ptr to 1st source byte (used to determine string length)
 
         .align  5                           // align inner loops for speed
 copyinstr5:                                 // version that counts but does not copy
-        lwz     r8,0(r3)                    // get next word of source
-        addi    r3,r3,4                     // increment source ptr
+        lwz     r8,0(r3)                    // get next word of source
+        addi    r3,r3,4                     // advance past it
+copyinstr5enter:
         add     r9,r10,r8                   // r9 =  data + 0xFEFEFEFF
         andc    r7,r11,r8                   // r7 = ~data & 0x80808080
-        and.    r7,r9,r7                    // r7 = r9 & r7
-        bdnzt   cr0_eq,copyinstr5           // if r7==0, then all bytes are nonzero
+        and.    r7,r9,r7                    // r7 = r9 & r7 ("." ok even in 64-bit mode)
+        bdnzt   cr0_eq,copyinstr5           // if r7==0, then all bytes in r8 are nonzero
 
         b       copyinstr7
 
         .align  5                           // align inner loops for speed
 copyinstr6:                                 // version that counts and copies
-        lwz     r8,0(r3)                    // get next word of source
-        addi    r3,r3,4                     // increment source ptr
+        lwz     r8,0(r3)                    // get next word of source
+        addi    r3,r3,4                     // advance past it
         addi    r4,r4,4                     // increment dest ptr while we wait for data
         add     r9,r10,r8                   // r9 =  data + 0xFEFEFEFF
         andc    r7,r11,r8                   // r7 = ~data & 0x80808080
-        and.    r7,r9,r7                    // r7 = r9 & r7
+        and.    r7,r9,r7                    // r7 = r9 & r7 ("." ok even in 64-bit mode)
         stw     r8,-4(r4)                   // pack all 4 bytes into buffer
         bdnzt   cr0_eq,copyinstr6           // if r7==0, then all bytes are nonzero
 
 
@@ -826,19 +865,24 @@ copyinstr6:                                // version that counts and copies
 // Either 0 found or buffer filled.  The above algorithm has mapped nonzero bytes to 0
 // and 0 bytes to 0x80 with one exception: 0x01 bytes preceding the first 0 are also
 // mapped to 0x80.  We must mask out these false hits before searching for a 0x80 byte.
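The test is worth restating in C. Adding 0xFEFEFEFF is the same as subtracting 0x01010101, so y & z is the classic "does this word contain a zero byte" detector; the one wrinkle is that a 0x01 byte ahead of the first 0 can leave a false 0x80 marker, which the rlwinm/andc pair below removes. A self-contained sketch, assuming 32-bit words in big-endian byte order as on PowerPC (__builtin_clz stands in for cntlzw):

    #include <stdint.h>

    /* Return the number of bytes in word x before its first zero byte,
     * or 4 if x contains no zero byte. */
    static int bytes_before_zero(uint32_t x)
    {
        uint32_t y = x + 0xFEFEFEFFu;     /* i.e. x - 0x01010101        */
        uint32_t z = ~x & 0x80808080u;    /* bytes with high bit clear  */
        uint32_t hits = y & z;            /* 0x80 marks candidate zeros */

        if (hits == 0)
            return 4;                     /* all four bytes nonzero     */

        hits &= ~(x << 7);                /* clear false hits from 0x01 */
                                          /* bytes (the 0x0100 case)    */
        return __builtin_clz(hits) >> 3;  /* first surviving 0x80 mark, */
                                          /* counted from the MSB end   */
    }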
+//      r3 = word aligned ptr to next word of source (ie, r8==mem(r3-4))
+//      r6 = original buffer length (adjusted to be word origin)
+//      r7 = computed vector of 0x00 and 0x80 bytes
+//      r8 = original source word, coming from -4(r3), possibly padded with 0xFFs on left if 1st word
+//      r12 = ptr to 1st source byte (used to determine string length)
+//      cr0 = beq set iff 0 not found
 
 copyinstr7:
-        crnot   kkZero,cr0_eq               // 0 found iff cr0_eq is off
-        mfctr   r6                          // get #words remaining in buffer
         rlwinm  r2,r8,7,0,31                // move 0x01 bits to 0x80 position
-        slwi    r6,r6,2                     // convert to #bytes remaining
+        rlwinm  r6,r6,0,0x3                 // mask down to partial byte count in last word
         andc    r7,r7,r2                    // turn off false hits from 0x0100 worst case
-        rlwimi  r6,r5,0,30,31               // add in odd bytes leftover in buffer
-        srwi    r7,r7,8                     // we want to count the 0 as a byte xferred
-        addi    r6,r6,4                     // don't count last word xferred (yet)
+        crnot   kkZero,cr0_eq               // 0 found iff cr0_eq is off
+        srwi    r7,r7,8                     // we want to count the 0 as a byte xferred
+        cmpwi   r6,0                        // any bytes left over in last word?
         cntlzw  r7,r7                       // now we can find the 0 byte (ie, the 0x80)
+        subi    r3,r3,4                     // back up r3 to point to 1st byte in r8
         srwi    r7,r7,3                     // convert 8,16,24,32 to 1,2,3,4
-        sub.    r6,r6,r7                    // account for nonzero bytes in last word
+        add     r3,r3,r7                    // now r3 points one past 0 byte, or at 1st byte not xferred
 
         bt++    kkZero,copyinstr10          // 0 found, so done
         beq     copyinstr10                 // r6==0, so buffer truly full
@@ -848,6 +892,10 @@ copyinstr7:
 
 // Byte loop.  This is used for very small buffers and for the odd bytes left over
 // after searching and copying words at a time.
+//      r3 = ptr to next byte of source
+//      r4 = ptr to next dest byte
+//      r12 = ptr to first byte of source
+//      ctr = count of bytes to check
 
         .align  5                           // align inner loops for speed
 copyinstr8:                                 // loop over bytes of source
@@ -855,53 +903,400 @@ copyinstr8:                               // loop over bytes of source
         lbz     r0,0(r3)                    // get next byte of source
         addi    r3,r3,1
         addi    r4,r4,1                     // increment dest addr whether we store or not
         cmpwi   r0,0                        // the 0?
-        bt--    kkNull,copyinstr9           // don't store (was copyinstr with NULL ptr)
+        bt--    kkNull,copyinstr9           // don't store if copyinstr with NULL ptr
         stb     r0,-1(r4)
 copyinstr9:
         bdnzf   cr0_eq,copyinstr8           // loop if byte not 0 and more room in buffer
 
-        mfctr   r6                          // get #bytes left in buffer
         crmove  kkZero,cr0_eq               // remember if 0 found or buffer filled
 
 
 // Buffer filled or 0 found.  Unwind and return.
-//      r5 = kkBufSize, ie buffer length
-//      r6 = untransferred bytes remaining in buffer
-//      r31 = mapped ptr returned by MapUserAddressSpace
-//      cr3 = kkZero set iff 0 found
+//      r3 = ptr to 1st source byte not transferred
+//      r12 = ptr to 1st source byte
+//      r31 = mapped ptr returned by MapUserMemoryWindow
+//      cr3 = kkZero set iff 0 found
 
 copyinstr10:
         lwz     r9,kkCountPtr(r1)           // get ptr to place to store count of bytes moved
-        sub     r2,r5,r6                    // get #bytes we moved, counting the 0 iff any
-        add     r2,r2,r12                   // add the header bytes count
-        li      r3,0                        // assume 0 return status
+        sub     r2,r3,r12                   // compute #bytes copied (including the 0)
+        li      r3,0                        // assume success return status
         stw     r2,0(r9)                    // store #bytes moved
         bt++    kkZero,copyinx              // we did find the 0 so return 0
 
         li      r3,ENAMETOOLONG             // buffer filled
         b       copyinx                     // join main exit routine
 
 
-// Byte loop.  This is used on the header bytes for unaligned source
-
-        .align  5                           // align inner loops for speed
-copyinstr11:
-        li      r10,4                       // load word size
-        sub     r12,r10,r12                 // set the header bytes count
-        mtctr   r12                         // set up bytes loop count
-copyinstr12:                                // loop over bytes of source
-        lbz     r0,0(r3)                    // get next byte of source
-        addi    r3,r3,1
-        addi    r4,r4,1                     // increment dest addr whether we store or not
-        cmpwi   r0,0                        // the 0?
-        bt--    kkNull,copyinstr13          // don't store (was copyinstr with NULL ptr)
-        stb     r0,-1(r4)
-copyinstr13:
-        bdnzf   cr0_eq,copyinstr12          // loop if byte not 0 and more room in buffer
-        sub     r5,r5,r12                   // substract the bytes copied
-        bne     cr0_eq,copyinstr1           // branch to word loop
-
-        mr      r5,r12                      // Get the header bytes count
-        li      r12,0                       // Clear the header bytes count
-        mfctr   r6                          // get #bytes left in buffer
-        crmove  kkZero,cr0_eq               // remember if 0 found or buffer filled
-        b       copyinstr10
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+/*
+ * int
+ * copypv(source, sink, size, which)
+ *      addr64_t        src;        // r3 and r4
+ *      addr64_t        dst;        // r5 and r6
+ *      size_t          size;       // r7
+ *      int             which;      // r8
+ *
+ * Operand size bytes are copied from operand src into operand dst.  The source and
+ * destination operand addresses are given as addr64_t, and may designate starting
+ * locations in physical or virtual memory in any combination except where both are
+ * virtual.  Virtual memory locations may be in either the kernel or the current thread's
+ * address space.  Operand size may be up to 256MB.
+ *
+ * Operation is controlled by operand which, which offers these options:
+ *      cppvPsrc : source operand is (1) physical or (0) virtual
+ *      cppvPsnk : destination operand is (1) physical or (0) virtual
+ *      cppvKmap : virtual operand is in (1) kernel or (0) current thread
+ *      cppvFsnk : (1) flush destination before and after transfer
+ *      cppvFsrc : (1) flush source before and after transfer
+ *      cppvNoModSnk : (1) don't set destination operand's changed bit(s)
+ *      cppvNoRefSrc : (1) don't set source operand's referenced bit(s)
+ *
+ * Implementation is now split into this new 64-bit path and the old path, hw_copypv_32().
+ * This section describes the operation of the new 64-bit path.
+ *
+ * The 64-bit path utilizes the more capacious 64-bit kernel address space to create a
+ * window in the kernel address space into all of physical RAM plus the I/O hole.  Since
+ * the window's mappings specify the proper access policies for the underlying memory,
+ * the new path does not have to flush caches to avoid a cache paradox, so cppvFsnk
+ * and cppvFsrc are ignored.  Physical operand addresses are relocated into the physical
+ * memory window, and are accessed with data relocation on.  Virtual addresses are either
+ * within the kernel, or are mapped into the kernel address space through the user memory
+ * window.  Because accesses to a virtual operand are performed with data relocation on,
+ * the new path does not have to translate the address, disable/enable interrupts, lock
+ * the mapping, or update referenced and changed bits.
+ *
+ * The IBM 970 (a.k.a. G5) processor treats real-mode accesses as guarded, so there is
+ * a substantial performance penalty for copypv operating in real mode.  Utilizing the
+ * new 64-bit path, transfer performance increases >100% on the G5.
+ *
+ * The attentive reader may notice that mtmsrd ops are not followed by isync ops as
+ * might be expected.  The 970 follows PowerPC architecture version 2.01, which defines
+ * mtmsrd with L=0 as a context synchronizing op, so a following isync is no longer
+ * required.
+ *
+ * To keep things exciting, we develop 64-bit values in non-volatiles, but we also need
+ * to call 32-bit functions, which would lead to the high-order 32 bits of our values
+ * getting clobbered unless we do something special.  So, we preserve our 64-bit non-volatiles
+ * in our own stack frame across calls to 32-bit functions.
+ *
+ */
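By way of illustration, a hypothetical caller copying from a physical page into a kernel-virtual buffer might invoke it as follows (the names and values are made up; the cppv* flag constants and the 0/EINVAL/EFAULT return convention are the ones described above):

    /* Source is physical, sink is kernel virtual. */
    int ret = copypv(src_paddr,                  /* addr64_t source   */
                     (addr64_t)(uintptr_t)kbuf,  /* addr64_t sink     */
                     len,                        /* size, under 256MB */
                     cppvPsrc | cppvKmap);
    if (ret == EINVAL) {
        /* both operands virtual, or size too large */
    } else if (ret == EFAULT) {
        /* fault taken during the copy */
    }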
+
+// Map operand which bits into non-volatile CR2 and CR3 bits.
+#define whichAlign   ((3+1)*4)
+#define whichMask    0x007F0000
+#define pvPsnk       (cppvPsnkb - whichAlign)
+#define pvPsrc       (cppvPsrcb - whichAlign)
+#define pvFsnk       (cppvFsnkb - whichAlign)
+#define pvFsrc       (cppvFsrcb - whichAlign)
+#define pvNoModSnk   (cppvNoModSnkb - whichAlign)
+#define pvNoRefSrc   (cppvNoRefSrcb - whichAlign)
+#define pvKmap       (cppvKmapb - whichAlign)
+#define pvNoCache    cr2_lt
+
+        .align  5
+        .globl  EXT(copypv)
+
+LEXT(copypv)
+        mfsprg  r10,2                       // get feature flags
+        mtcrf   0x02,r10                    // we need to test pf64Bit
+        bt++    pf64Bitb,copypv_64          // skip if 64-bit processor (only they take hint)
+
+        b       EXT(hw_copypv_32)           // carry on with 32-bit copypv
+
+// Push a 32-bit ABI-compliant stack frame and preserve all non-volatiles that we'll clobber.
+copypv_64:
+        mfsprg  r9,1                        // get current thread
+        stwu    r1,-(FM_ALIGN((31-26+11)*4)+FM_SIZE)(r1)
+                                            // allocate stack frame and link it
+        mflr    r0                          // get return address
+        mfcr    r10                         // get cr2 and cr3
+        lwz     r12,THREAD_RECOVER(r9)      // get error callback
+        stw     r26,FM_ARG0+0x00(r1)        // save non-volatile r26
+        stw     r27,FM_ARG0+0x04(r1)        // save non-volatile r27
+        stw     r28,FM_ARG0+0x08(r1)        // save non-volatile r28
+        stw     r29,FM_ARG0+0x0C(r1)        // save non-volatile r29
+        stw     r30,FM_ARG0+0x10(r1)        // save non-volatile r30
+        stw     r31,FM_ARG0+0x14(r1)        // save non-volatile r31
+        stw     r12,FM_ARG0+0x20(r1)        // save error callback
+        stw     r0,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_LR_SAVE)(r1)
+                                            // save return address
+        stw     r10,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_CR_SAVE)(r1)
+                                            // save non-volatile cr2 and cr3
+
+// Non-volatile register usage in this routine is:
+//      r26: saved msr image
+//      r27: current pmap_t / virtual source address
+//      r28: destination virtual address
+//      r29: source address
+//      r30: destination address
+//      r31: byte count to copy
+//      cr2/3: parameter 'which' bits
+
+        rlwinm  r8,r8,whichAlign,whichMask  // align and mask which bits
+        mr      r31,r7                      // copy size to somewhere non-volatile
+        mtcrf   0x20,r8                     // insert which bits into cr2 and cr3
+        mtcrf   0x10,r8                     // insert which bits into cr2 and cr3
+        rlwinm  r29,r3,0,1,0                // form source address high-order bits
+        rlwinm  r30,r5,0,1,0                // form destination address high-order bits
+        rlwimi  r29,r4,0,0,31               // form source address low-order bits
+        rlwimi  r30,r6,0,0,31               // form destination address low-order bits
+        crand   cr7_lt,pvPsnk,pvPsrc        // are both operand addresses physical?
+        cntlzw  r0,r31                      // count leading zeroes in byte count
+        cror    cr7_eq,pvPsnk,pvPsrc        // cr7_eq <- source or destination is physical
+        bf--    cr7_eq,copypv_einval        // both operands may not be virtual
+        cmplwi  r0,4                        // byte count greater than or equal to 256MB (2**28)?
+        blt--   copypv_einval               // byte count too big, give EINVAL
+        cmplwi  r31,0                       // byte count zero?
+        beq--   copypv_zero                 // early out
+        bt      cr7_lt,copypv_phys          // both operand addresses are physical
+        mr      r28,r30                     // assume destination is virtual
+        bf      pvPsnk,copypv_dv            // is destination virtual?
+        mr      r28,r29                     // no, so source must be virtual
+copypv_dv:
+        lis     r27,ha16(EXT(kernel_pmap))  // get kernel's pmap_t *, high-order
+        lwz     r27,lo16(EXT(kernel_pmap))(r27) // get kernel's pmap_t
+        bt      pvKmap,copypv_kern          // virtual address in kernel map?
+        lwz     r3,ACT_VMMAP(r9)            // get user's vm_map *
+        rldicl  r4,r28,32,32                // r4, r5 <- addr64_t virtual address
+        rldicl  r5,r28,0,32
+        std     r29,FM_ARG0+0x30(r1)        // preserve 64-bit r29 across 32-bit call
+        std     r30,FM_ARG0+0x38(r1)        // preserve 64-bit r30 across 32-bit call
+        bl      EXT(MapUserMemoryWindow)    // map slice of user space into kernel space
+        ld      r29,FM_ARG0+0x30(r1)        // restore 64-bit r29
+        ld      r30,FM_ARG0+0x38(r1)        // restore 64-bit r30
+        rlwinm  r28,r3,0,1,0                // convert relocated addr64_t virtual address
+        rlwimi  r28,r4,0,0,31               //  into a single 64-bit scalar
+copypv_kern:
+
+// Since we'll be accessing the virtual operand with data-relocation on, we won't need to
+// update the referenced and changed bits manually after the copy.  So, force the appropriate
+// flag bit on for the virtual operand.
+        crorc   pvNoModSnk,pvNoModSnk,pvPsnk // for virtual dest, let hardware do ref/chg bits
+        crorc   pvNoRefSrc,pvNoRefSrc,pvPsrc // for virtual source, let hardware do ref bit
+
+// We'll be finding a mapping and looking at it, so we need to disable 'rupts.
+        lis     r0,hi16(MASK(MSR_VEC))      // get vector mask
+        ori     r0,r0,lo16(MASK(MSR_FP))    // insert fp mask
+        mfmsr   r26                         // save current msr
+        andc    r26,r26,r0                  // turn off VEC and FP in saved copy
+        ori     r0,r0,lo16(MASK(MSR_EE))    // add EE to our mask
+        andc    r0,r26,r0                   // disable EE in our new msr image
+        mtmsrd  r0                          // introduce new msr image
+
+// We're now holding the virtual operand's pmap_t in r27 and its virtual address in r28.  We now
+// try to find a mapping corresponding to this address in order to determine whether the address
+// is cacheable.  If we don't find a mapping, we can safely assume that the operand is cacheable
+// (a non-cacheable operand must be a block mapping, which will always exist); otherwise, we
+// examine the mapping's caching-inhibited bit.
+        mr      r3,r27                      // r3 <- pmap_t pmap
+        rldicl  r4,r28,32,32                // r4, r5 <- addr64_t va
+        rldicl  r5,r28,0,32
+        la      r6,FM_ARG0+0x18(r1)         // r6 <- addr64_t *nextva
+        li      r7,1                        // r7 <- int full, search nested mappings
+        std     r26,FM_ARG0+0x28(r1)        // preserve 64-bit r26 across 32-bit calls
+        std     r28,FM_ARG0+0x30(r1)        // preserve 64-bit r28 across 32-bit calls
+        std     r29,FM_ARG0+0x38(r1)        // preserve 64-bit r29 across 32-bit calls
+        std     r30,FM_ARG0+0x40(r1)        // preserve 64-bit r30 across 32-bit calls
+        bl      EXT(mapping_find)           // find mapping for virtual operand
+        mr.     r3,r3                       // did we find it?
+        beq     copypv_nomapping            // nope, so we'll assume it's cacheable
+        lwz     r4,mpVAddr+4(r3)            // get low half of virtual addr for hw flags
+        rlwinm. r4,r4,0,mpIb-32,mpIb-32     // caching-inhibited bit set?
+        crnot   pvNoCache,cr0_eq            // if it is, use bcopy_nc
+        bl      EXT(mapping_drop_busy)      // drop busy on the mapping
+copypv_nomapping:
+        ld      r26,FM_ARG0+0x28(r1)        // restore 64-bit r26
+        ld      r28,FM_ARG0+0x30(r1)        // restore 64-bit r28
+        ld      r29,FM_ARG0+0x38(r1)        // restore 64-bit r29
+        ld      r30,FM_ARG0+0x40(r1)        // restore 64-bit r30
+        mtmsrd  r26                         // restore msr to its previous state
+
+// Set both the source and destination virtual addresses to the virtual operand's address --
+// we'll overlay one of them with the physical operand's address.
+        mr      r27,r28                     // make virtual operand BOTH source AND destination
+
+// Now we're ready to relocate the physical operand address(es) into the physical memory window.
+// Recall that we've mapped physical memory (including the I/O hole) into the kernel's address
+// space somewhere at or over the 2**32 line.  If one or both of the operands are in the I/O hole,
+// we'll set the pvNoCache flag, forcing use of non-caching bcopy_nc() to do the copy.
+copypv_phys:
+        ld      r6,lgPMWvaddr(0)            // get physical memory window virtual address
+        bf      pvPsnk,copypv_dstvirt       // is destination address virtual?
+        cntlzd  r4,r30                      // count leading zeros in destination address
+        cmplwi  r4,32                       // if it's 32, then it's in the I/O hole (2**31 to 2**32-1)
+        cror    pvNoCache,cr0_eq,pvNoCache  // use bcopy_nc for I/O hole locations
+        add     r28,r30,r6                  // relocate physical destination into physical window
+copypv_dstvirt:
+        bf      pvPsrc,copypv_srcvirt       // is source address virtual?
+        cntlzd  r4,r29                      // count leading zeros in source address
+        cmplwi  r4,32                       // if it's 32, then it's in the I/O hole (2**31 to 2**32-1)
+        cror    pvNoCache,cr0_eq,pvNoCache  // use bcopy_nc for I/O hole locations
+        add     r27,r29,r6                  // relocate physical source into physical window
+copypv_srcvirt:
+
+// Once the copy is under way (bcopy or bcopy_nc), we will want to get control if anything
+// funny happens during the copy.  So, we set a pointer to our error handler in the per-thread
+// control block.
+        mfsprg  r8,1                        // get current thread's stuff
+        lis     r3,hi16(copypv_error)       // get our error callback's address, high
+        ori     r3,r3,lo16(copypv_error)    // get our error callback's address, low
+        stw     r3,THREAD_RECOVER(r8)       // set our error callback
+
+// Since our physical operand(s) are relocated at or above the 2**32 line, we must enter
+// 64-bit mode.
+        li      r0,1                        // get a handy one bit
+        mfmsr   r3                          // get current msr
+        rldimi  r3,r0,63,MSR_SF_BIT         // set SF bit on in our msr copy
+        mtmsrd  r3                          // enter 64-bit mode
+
+// If requested, flush data cache.
+// Note that we don't flush; the code is being saved "just in case".
+#if 0
+        bf      pvFsrc,copypv_nfs           // do we flush the source?
+        rldicl  r3,r27,32,32                // r3, r4 <- addr64_t source virtual address
+        rldicl  r4,r27,0,32
+        mr      r5,r31                      // r5 <- count (in bytes)
+        li      r6,0                        // r6 <- boolean phys (false, not physical)
+        bl      EXT(flush_dcache)           // flush the source operand
+copypv_nfs:
+        bf      pvFsnk,copypv_nfdx          // do we flush the destination?
+        rldicl  r3,r28,32,32                // r3, r4 <- addr64_t destination virtual address
+        rldicl  r4,r28,0,32
+        mr      r5,r31                      // r5 <- count (in bytes)
+        li      r6,0                        // r6 <- boolean phys (false, not physical)
+        bl      EXT(flush_dcache)           // flush the destination operand
+copypv_nfdx:
+#endif
+
+// Call bcopy or bcopy_nc to perform the copy.
+        mr      r3,r27                      // r3 <- source virtual address
+        mr      r4,r28                      // r4 <- destination virtual address
+        mr      r5,r31                      // r5 <- bytes to copy
+        bt      pvNoCache,copypv_nc         // take non-caching route
+        bl      EXT(bcopy)                  // call bcopy to do the copying
+        b       copypv_copydone
+copypv_nc:
+        bl      EXT(bcopy_nc)               // call bcopy_nc to do the copying
+copypv_copydone:
+
+// If requested, flush data cache.
+// Note that we don't flush; the code is being saved "just in case".
+#if 0
+        bf      pvFsrc,copypv_nfsx          // do we flush the source?
+        rldicl  r3,r27,32,32                // r3, r4 <- addr64_t source virtual address
+        rldicl  r4,r27,0,32
+        mr      r5,r31                      // r5 <- count (in bytes)
+        li      r6,0                        // r6 <- boolean phys (false, not physical)
+        bl      EXT(flush_dcache)           // flush the source operand
+copypv_nfsx:
+        bf      pvFsnk,copypv_nfd           // do we flush the destination?
+        rldicl  r3,r28,32,32                // r3, r4 <- addr64_t destination virtual address
+        rldicl  r4,r28,0,32
+        mr      r5,r31                      // r5 <- count (in bytes)
+        li      r6,0                        // r6 <- boolean phys (false, not physical)
+        bl      EXT(flush_dcache)           // flush the destination operand
+copypv_nfd:
+#endif
+
+// Leave 64-bit mode.
+        mfmsr   r3                          // get current msr
+        rldicl  r3,r3,0,MSR_SF_BIT+1        // clear SF bit in our copy
+        mtmsrd  r3                          // leave 64-bit mode
+
+// If requested, set ref/chg on source/dest physical operand(s).  It is possible that copy is
+// from/to a RAM disk situated outside of mapped physical RAM, so we check each page by calling
+// mapping_phys_lookup() before we try to set its ref/chg bits; otherwise, we might panic.
+// Note that this code is page-size sensitive, so it should probably be a part of our low-level
+// code in hw_vm.s.
+        bt      pvNoModSnk,copypv_nomod     // skip destination update if not requested
+        std     r29,FM_ARG0+0x30(r1)        // preserve 64-bit r29 across 32-bit calls
+        li      r26,1                       // r26 <- 4K-page count
+        mr      r27,r31                     // r27 <- byte count
+        rlwinm  r3,r30,0,20,31              // does destination cross a page boundary?
+        subfic  r3,r3,4096                  //
+        cmplw   r3,r27                      //
+        blt     copypv_modnox               // skip if not crossing case
+        subf    r27,r3,r27                  // r27 <- byte count less initial fragment
+        addi    r26,r26,1                   // increment page count
+copypv_modnox:
+        srdi    r3,r27,12                   // pages to update (not including crosser)
+        add     r26,r26,r3                  // add in crosser
+        srdi    r27,r30,12                  // r27 <- destination page number
+copypv_modloop:
+        mr      r3,r27                      // r3 <- destination page number
+        la      r4,FM_ARG0+0x18(r1)         // r4 <- unsigned int *pindex
+        bl      EXT(mapping_phys_lookup)    // see if page is really there
+        mr.     r3,r3                       // is it?
+        beq--   copypv_modend               // nope, break out of modify loop
+        mr      r3,r27                      // r3 <- destination page number
+        bl      EXT(mapping_set_mod)        // set page changed status
+        subi    r26,r26,1                   // decrement page count
+        cmpwi   r26,0                       // done yet?
+        bgt     copypv_modloop              // nope, iterate
+copypv_modend:
+        ld      r29,FM_ARG0+0x30(r1)        // restore 64-bit r29
+copypv_nomod:
+        bt      pvNoRefSrc,copypv_done      // skip source update if not requested
+copypv_debugref:
+        li      r26,1                       // r26 <- 4K-page count
+        mr      r27,r31                     // r27 <- byte count
+        rlwinm  r3,r29,0,20,31              // does source cross a page boundary?
+        subfic  r3,r3,4096                  //
+        cmplw   r3,r27                      //
+        blt     copypv_refnox               // skip if not crossing case
+        subf    r27,r3,r27                  // r27 <- byte count less initial fragment
+        addi    r26,r26,1                   // increment page count
+copypv_refnox:
+        srdi    r3,r27,12                   // pages to update (not including crosser)
+        add     r26,r26,r3                  // add in crosser
+        srdi    r27,r29,12                  // r27 <- source page number
+copypv_refloop:
+        mr      r3,r27                      // r3 <- source page number
+        la      r4,FM_ARG0+0x18(r1)         // r4 <- unsigned int *pindex
+        bl      EXT(mapping_phys_lookup)    // see if page is really there
+        mr.     r3,r3                       // is it?
+        beq--   copypv_done                 // nope, break out of reference loop
+        mr      r3,r27                      // r3 <- source page number
+        bl      EXT(mapping_set_ref)        // set page referenced status
+        subi    r26,r26,1                   // decrement page count
+        cmpwi   r26,0                       // done yet?
+        bgt     copypv_refloop              // nope, iterate
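The page-counting arithmetic feeding both loops is easier to follow in C. As the comments describe it: one page, plus one more if the operand's first page boundary falls inside the range, plus one per remaining full page (a sketch only, assuming the 4K pages noted above):

    unsigned int npages = 1;                   /* the page addr starts in  */
    unsigned int frag = 4096 - (addr & 0xFFF); /* bytes to first boundary  */
    if (frag < len) {                          /* range crosses boundary?  */
        len -= frag;                           /* drop initial fragment    */
        npages++;                              /* count the crossing page  */
    }
    npages += len >> 12;                       /* whole pages after that   */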
+
+// Return, indicating success.
+copypv_done:
+copypv_zero:
+        li      r3,0                        // our efforts were crowned with success
+
+// Pop frame, restore caller's non-volatiles, clear recovery routine pointer.
+copypv_return:
+        mfsprg  r9,1                        // get current thread's stuff
+        lwz     r0,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_LR_SAVE)(r1)
+                                            // get return address
+        lwz     r4,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_CR_SAVE)(r1)
+                                            // get non-volatile cr2 and cr3
+        lwz     r26,FM_ARG0+0x00(r1)        // restore non-volatile r26
+        lwz     r27,FM_ARG0+0x04(r1)        // restore non-volatile r27
+        mtlr    r0                          // restore return address
+        lwz     r28,FM_ARG0+0x08(r1)        // restore non-volatile r28
+        mtcrf   0x20,r4                     // restore non-volatile cr2
+        mtcrf   0x10,r4                     // restore non-volatile cr3
+        lwz     r11,FM_ARG0+0x20(r1)        // get saved error callback
+        lwz     r29,FM_ARG0+0x0C(r1)        // restore non-volatile r29
+        lwz     r30,FM_ARG0+0x10(r1)        // restore non-volatile r30
+        lwz     r31,FM_ARG0+0x14(r1)        // restore non-volatile r31
+        stw     r11,THREAD_RECOVER(r9)      // restore our error callback
+        lwz     r1,0(r1)                    // release stack frame
+
+        blr                                 // y'all come back now
+
+// Invalid argument handler.
+copypv_einval:
+        li      r3,EINVAL                   // invalid argument
+        b       copypv_return               // return
+
+// Error encountered during bcopy or bcopy_nc.
+copypv_error:
+        mfmsr   r3                          // get current msr
+        rldicl  r3,r3,0,MSR_SF_BIT+1        // clear SF bit in our copy
+        mtmsrd  r3                          // leave 64-bit mode
+        li      r3,EFAULT                   // it was all his fault
+        b       copypv_return               // return
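One pattern worth calling out, since both the copyin family and copypv rely on it: before touching user memory, each routine publishes an error callback through the per-thread recover field (THREAD_RECOVER above), so that a DSI taken during the access unwinds into the callback instead of panicking the kernel. In C-like pseudocode, with illustrative names rather than the exact kernel definitions:

    thread_t self = current_thread();
    vm_offset_t saved = self->recover;        /* preserve any outer handler */

    self->recover = (vm_offset_t)copy_error;  /* faults land here now       */
    do_the_copy();                            /* bcopy through the mapped   */
                                              /* user window                */
    self->recover = saved;                    /* reinstate the outer one    */

This is also why copyin0 stashes the previous value in kkThrErrJmp and copyinx restores it on the way out: the copy routines can be entered with a recover handler already active.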