]> git.saurik.com Git - apple/xnu.git/blobdiff - osfmk/ppc/movc.s
xnu-792.12.6.tar.gz
[apple/xnu.git] / osfmk / ppc / movc.s
index cb3188f3396743e0f93186d64875fa4820bdfbb8..8284ff7bdf9db03c41af9a84f0ac1eeba78922d4 100644 (file)
@@ -1,23 +1,31 @@
 /*
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
  *
  *
- * @APPLE_LICENSE_HEADER_START@
+ * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
  * 
  * 
- * The contents of this file constitute Original Code as defined in and
- * are subject to the Apple Public Source License Version 1.1 (the
- * "License").  You may not use this file except in compliance with the
- * License.  Please obtain a copy of the License at
- * http://www.apple.com/publicsource and read it before using this file.
- * 
- * This Original Code and all software distributed under the License are
- * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
- * License for the specific language governing rights and limitations
- * under the License.
- * 
- * @APPLE_LICENSE_HEADER_END@
+ * This file contains Original Code and/or Modifications of Original Code 
+ * as defined in and that are subject to the Apple Public Source License 
+ * Version 2.0 (the 'License'). You may not use this file except in 
+ * compliance with the License.  The rights granted to you under the 
+ * License may not be used to create, or enable the creation or 
+ * redistribution of, unlawful or unlicensed copies of an Apple operating 
+ * system, or to circumvent, violate, or enable the circumvention or 
+ * violation of, any terms of an Apple operating system software license 
+ * agreement.
+ *
+ * Please obtain a copy of the License at 
+ * http://www.opensource.apple.com/apsl/ and read it before using this 
+ * file.
+ *
+ * The Original Code and all software distributed under the License are 
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and 
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
  */
 /*
  * @OSF_COPYRIGHT@
  */
 /*
  * @OSF_COPYRIGHT@
 #include <assym.s>
 #include <sys/errno.h>
 
 #include <assym.s>
 #include <sys/errno.h>
 
+#define INSTRUMENT 0
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 /*
  * void pmap_zero_page(vm_offset_t pa)
  *
 /*
  * void pmap_zero_page(vm_offset_t pa)
  *
- * zero a page of physical memory.
+ * Zero a page of physical memory.  This routine runs in 32 or 64-bit mode,
+ * and handles 32 and 128-byte cache lines.
  */
 
  */
 
-#if DEBUG
-       /* C debug stub in pmap.c calls this */
-ENTRY(pmap_zero_page_assembler, TAG_NO_FRAME_USED)
-#else
-ENTRY(pmap_zero_page, TAG_NO_FRAME_USED)
-#endif /* DEBUG */
-/* Zero the page whose physical address is in r3, then invalidate the same lines in the i-cache.  r6 = MSR to restore on exit, r7 = working MSR. */
-               mfmsr   r6                                                              /* Get the MSR */
-               rlwinm  r6,r6,0,MSR_FP_BIT+1,MSR_FP_BIT-1       ; Force floating point off (also in the MSR restored on exit)
-               rlwinm  r6,r6,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1     ; Force vectors off (also in the MSR restored on exit)
-               rlwinm  r7,     r6,     0,      MSR_DR_BIT+1,   MSR_DR_BIT-1    /* Turn off DR in the working copy */
-               rlwinm  r7,r7,0,MSR_EE_BIT+1,MSR_EE_BIT-1       ; Disable interruptions
-               li              r4,PPC_PGBYTES-CACHE_LINE_SIZE  /* r4 = offset of the last cache line in the page */
-               mtmsr   r7                                                              /* Set MSR to DR off, EE off */
-               isync                                                                   /* Ensure data translations are off */
-
-/* Each pass handles two lines: offset r4 and offset r5 = r4 - CACHE_LINE_SIZE; the loop ends once r5 has reached 0. */
-.L_phys_zero_loop:     
-               subic.  r5,r4,CACHE_LINE_SIZE                   /* r5 = offset of the line below r4; sets cr0 for the bgt+ */
-               dcbz    r4, r3                                                  /* Zero the cache line at r3+r4 */
-               subi    r4,r5,CACHE_LINE_SIZE                   /* r4 = offset for the next pass */
-               dcbz    r5, r3                                                  /* Zero the cache line at r3+r5 */
-               bgt+    .L_phys_zero_loop                               /* Keep going while r5 > 0, i.e. until we do the page... */
-
-               sync                                                                    /* Make sure they're all done */
-               li              r4,PPC_PGBYTES-CACHE_LINE_SIZE  /* Start again from the last cache line in the page */
-
-.L_inst_inval_loop:    
-               subic.  r5,r4,CACHE_LINE_SIZE                   /* r5 = offset of the line below r4; sets cr0 for the bgt+ */
-               icbi    r4, r3                                                  /* Invalidate the i-cache line at r3+r4 */
-               subi    r4,r5,CACHE_LINE_SIZE                   /* r4 = offset for the next pass */
-               icbi    r5, r3                                                  /* Invalidate the i-cache line at r3+r5 */
-               bgt+    .L_inst_inval_loop                              /* Keep going while r5 > 0, i.e. until we do the page... */
-
-               sync                                                                    /* Make sure they're all done */
-
-               mtmsr   r6              /* Restore the entry MSR (with FP and vectors forced off) */
-               isync                   /* Ensure the restored MSR has taken effect */


-               blr
+               .align  5                                               // start the routine on a 2^5 = 32-byte boundary
+               .globl  EXT(pmap_zero_page)
+// pmap_zero_page: zero one page of physical memory, one cache line per dcbz128.
+LEXT(pmap_zero_page)
+// In: r3 = physical page number.  Uses r4, r5, r9 and r12; reads feature flags from r10 after the helper call.
+        mflr   r12                                                             // save return address; the bl below overwrites LR
+        bl             EXT(ml_set_physical_disabled)   // turn DR and EE off, SF on, get features in r10
+        mtlr   r12                                                             // restore return address
+        andi.  r9,r10,pf32Byte+pf128Byte               // r9 <- cache line size (relies on the pf32Byte/pf128Byte flag values, defined outside this view -- confirm)


+        subfic r4,r9,PPC_PGBYTES                               // r4 <- starting offset in page: PPC_PGBYTES - r9, i.e. the last cache line
+               
+               bt++    pf64Bitb,page0S4                                // Go do the big guys... (CR bit presumably left by ml_set_physical_disabled -- confirm)
+               
+               slwi    r3,r3,12                                                // get page address from page num (32-bit shift)
+               b               page_zero_1                                             // Jump to line aligned loop...
+
+        .align 5
+
+               nop                                                             // pad: 7 nops plus the sldi below are 8 words (32 bytes), so page_zero_1 starts on a 32-byte boundary
+               nop
+               nop
+               nop
+               nop
+               nop
+               nop
+               
+page0S4:
+               sldi    r3,r3,12                                                // get page address from page num (64-bit shift)
+
+page_zero_1:                                                                   // loop zeroing cache lines, two per pass, from the top of the page downward
+        sub.   r5,r4,r9                                                // more to go?  r5 = offset of the line below r4; cr0 is tested by the bne-- below
+        dcbz128        r3,r4                                                   // zero either 32 or 128 bytes
+        sub            r4,r5,r9                                                // generate next offset
+        dcbz128        r3,r5                                                   // zero the line at offset r5
+        bne--  page_zero_1                                             // done once r5 == 0, i.e. the line at offset 0 has just been zeroed
+        
+        b              EXT(ml_restore)                                 // restore MSR and do the isync; tail branch, LR already holds our caller's return address
+
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 /* void
  * phys_copy(src, dst, bytecount)
 /* void
  * phys_copy(src, dst, bytecount)
- *      vm_offset_t     src;
- *      vm_offset_t     dst;
+ *      addr64_t           src;
+ *      addr64_t           dst;
  *      int             bytecount
  *
  * This routine will copy bytecount bytes from physical address src to physical
  *      int             bytecount
  *
  * This routine will copy bytecount bytes from physical address src to physical
- * address dst. 
+ * address dst.  It runs in 64-bit mode if necessary, but does not handle
+ * overlap or make any attempt to be optimal.  Length must be a signed word.
+ * Not performance critical.
  */
 
  */
 
-ENTRY(phys_copy, TAG_NO_FRAME_USED)
-/* In: r3 = source physical address, r4 = destination physical address, r5 = byte count.  r6 = MSR to restore on exit, r7 = working MSR. */
-       /* Switch off data translations */
-       mfmsr   r6
-       rlwinm  r6,r6,0,MSR_FP_BIT+1,MSR_FP_BIT-1       ; Force floating point off
-       rlwinm  r6,r6,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1     ; Force vectors off
-       rlwinm  r7,     r6,     0,      MSR_DR_BIT+1,   MSR_DR_BIT-1    /* Clear DR in the working copy */
-       rlwinm  r7,     r7,     0,      MSR_EE_BIT+1,   MSR_EE_BIT-1    /* Clear EE (interruptions off) as well */
-       mtmsr   r7
-       isync                   /* Ensure data translations are off */
-
-       subi    r3,     r3,     4       /* Bias both pointers by -4 so the word loop can use a 4(rX) displacement */
-       subi    r4,     r4,     4
-
-       cmpwi   r5,     3
-       ble-    .L_phys_copy_bytes      /* Fewer than 4 bytes: skip the word loop */
-.L_phys_copy_loop:
-       lwz     r0,     4(r3)           /* Copy one word, advancing both pointers and dropping the count by 4 */
-       addi    r3,     r3,     4
-       subi    r5,     r5,     4
-       stw     r0,     4(r4)
-       addi    r4,     r4,     4
-       cmpwi   r5,     3
-       bgt+    .L_phys_copy_loop       /* Loop while at least 4 bytes remain */
-
-       /* If no leftover bytes, we're done now */
-       cmpwi   r5,     0
-       beq+    .L_phys_copy_done
-       /* NOTE(review): a count <= 0 also lands here from the ble- above; the byte loop tests only r5 == 0 after decrementing, so such counts are not handled. */
-.L_phys_copy_bytes:
-       addi    r3,     r3,     3       /* Change the pointer bias from -4 to -1 so the byte loop can use a 1(rX) displacement */
-       addi    r4,     r4,     3
-.L_phys_copy_byte_loop:        
-       lbz     r0,     1(r3)           /* Copy one byte, advancing both pointers and dropping the count by 1 */
-       addi    r3,     r3,     1
-       subi    r5,     r5,     1
-       stb     r0,     1(r4)
-       addi    r4,     r4,     1
-       cmpwi   r5,     0
-       bne+    .L_phys_copy_byte_loop  /* Loop until the count reaches exactly 0 */
-
-.L_phys_copy_done:
-       mtmsr   r6              /* Restore the entry MSR (with FP and vectors forced off) */
-       isync                   /* Ensure the restored MSR has taken effect */
-
-       blr
 
 
+               .align  5                                               // start the routine on a 2^5 = 32-byte boundary
+               .globl  EXT(phys_copy)
+// phys_copy: copy r7 bytes between physical addresses, word-at-a-time and then byte-at-a-time.
+LEXT(phys_copy)
+// In: r3:r4 = source address (high:low words), r5:r6 = destination address (high:low words), r7 = signed byte count.
+               rlwinm  r3,r3,0,1,0                                     ; Duplicate high half of long long paddr (source) into top of reg
+        mflr   r12                                                             // get return address; the bl below overwrites LR
+               rlwimi  r3,r4,0,0,31                            ; Combine bottom of long long to full 64-bits: r3 = source address
+               rlwinm  r4,r5,0,1,0                                     ; Duplicate high half of long long paddr (destination) into top of reg
+        bl             EXT(ml_set_physical_disabled)   // turn DR and EE off, SF on, get features in r10 (assumes it preserves r3-r7 and r12 -- confirm)
+               rlwimi  r4,r6,0,0,31                            ; Combine bottom of long long to full 64-bits: r4 = destination address
+        mtlr   r12                                                             // restore return address
+        subic. r5,r7,4                                                 // a word to copy?  r5 = count - 4; cr0 is tested at phys_copy_2
+        b              phys_copy_2                                             // enter the word loop at its test
+        
+               .align  5
+         
+phys_copy_1:                                                                   // loop copying words
+        subic. r5,r5,4                                                 // more to go?  sets cr0 for the bge below; nothing in between changes cr0
+        lwz            r0,0(r3)                                                // copy one word and advance both pointers
+        addi   r3,r3,4
+        stw            r0,0(r4)
+        addi   r4,r4,4
+phys_copy_2:
+        bge            phys_copy_1                                             // copy another word while the biased count is >= 0
+        addic. r5,r5,4                                                 // restore count: remove the -4 bias, leaving the leftover byte count
+        ble            phys_copy_4                                             // no more
+        
+                                                                                       // Loop is aligned here
+        
+phys_copy_3:                                                                   // loop copying bytes
+        subic. r5,r5,1                                                 // more to go?  sets cr0 for the bgt below
+        lbz            r0,0(r3)                                                // copy one byte and advance both pointers
+        addi   r3,r3,1
+        stb            r0,0(r4)
+        addi   r4,r4,1
+        bgt            phys_copy_3                                             // loop while bytes remain after this one
+phys_copy_4:        
+        b              EXT(ml_restore)                                 // restore MSR and do the isync; tail branch, LR already holds our caller's return address
+
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 /* void
  * pmap_copy_page(src, dst)
 /* void
  * pmap_copy_page(src, dst)
- *      vm_offset_t     src;
- *      vm_offset_t     dst;
+ *      ppnum_t     src;
+ *      ppnum_t     dst;
  *
  * This routine will copy the physical page src to physical page dst
  * 
  *
  * This routine will copy the physical page src to physical page dst
  * 
- * This routine assumes that the src and dst are page aligned and that the
- * destination is cached.
- *
- * We also must assume that noone will be executing within the destination
- * page.  We also assume that this will be used for paging
+ * This routine assumes that the src and dst are page numbers and that the
+ * destination is cached.  It runs on 32 and 64 bit processors, with and
+ * without altivec, and with 32 and 128 byte cache lines.
+ * We also must assume that no-one will be executing within the destination
+ * page, and that this will be used for paging.  Because this
+ * is a common routine, we have tuned loops for each processor class.
  *
  */
  *
  */
+#define        kSFSize (FM_SIZE+160)
 
 
-#if DEBUG
-       /* if debug, we have a little piece of C around this
-        * in pmap.c that gives some trace ability
-        */
-ENTRY(pmap_copy_page_assembler, TAG_NO_FRAME_USED)
-#else
 ENTRY(pmap_copy_page, TAG_NO_FRAME_USED)
 ENTRY(pmap_copy_page, TAG_NO_FRAME_USED)
-#endif /* DEBUG */
 
 
-#if 0
-                       mfpvr   r9                                                      ; Get the PVR
-                       rlwinm  r9,r9,16,16,31                          ; Isolate the PPC processor
-                       cmplwi  r9,PROCESSOR_VERSION_Max        ; Do we have Altivec?
-                       beq+    wegotaltivec                            ; Yeah...
-#endif
-               
-                       mfmsr   r9                                                      ; Get the MSR
-                       rlwinm  r9,r9,0,MSR_FP_BIT+1,MSR_FP_BIT-1       ; Force floating point off
-                       rlwinm  r9,r9,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1     ; Force vectors off
-                       stwu    r1,-(FM_SIZE+32)(r1)            ; Make a frame for us
-                       rlwinm  r7,r9,0,MSR_EE_BIT+1,MSR_EE_BIT-1       ; Disable interruptions
-                       ori             r7,r7,lo16(MASK(MSR_FP))        ; Turn on the FPU
-                       mtmsr   r7                                                      ; Disable rupts and enable FPU
-                       isync
-               
-                       stfd    f0,FM_SIZE+0(r1)                        ; Save an FP register
-                       rlwinm  r7,r7,0,MSR_DR_BIT+1,MSR_DR_BIT-1       ; Clear the DDAT bit
-                       stfd    f1,FM_SIZE+8(r1)                        ; Save an FP register
-                       addi    r6,r3,PPC_PGBYTES                       ; Point to the start of the next page
-                       stfd    f2,FM_SIZE+16(r1)                       ; Save an FP register
-                       mr              r8,r4                                           ; Save the destination
-                       stfd    f3,FM_SIZE+24(r1)                       ; Save an FP register
-               
-                       mtmsr   r7                                                      ; Set the new MSR
-                       isync                                                           ; Ensure data translations are off
-
-                       dcbt    br0, r3                                         /* Start in first input line */
-                       li              r5,     CACHE_LINE_SIZE                 /* Get the line size */
-
-.L_pmap_copy_page_loop:
-                       dcbz    0, r4                                           /* Allocate a line for the output */
-                       lfd             f0, 0(r3)                                       /* Get first 8 */
-                       lfd             f1, 8(r3)                                       /* Get second 8 */
-                       lfd             f2, 16(r3)                                      /* Get third 8 */
-                       stfd    f0, 0(r4)                                       /* Put first 8 */
-                       dcbt    r5, r3                                          /* Start next line coming in */
-                       lfd             f3, 24(r3)                                      /* Get fourth 8 */
-                       stfd    f1,     8(r4)                                   /* Put second 8 */
-                       addi    r3,r3,CACHE_LINE_SIZE           /* Point to the next line in */
-                       stfd    f2,     16(r4)                                  /* Put third 8 */
-                       cmplw   cr0,r3,r6                                       /* See if we're finished yet */
-                       stfd    f3,     24(r4)                                  /* Put fourth 8 */
-                       dcbst   br0,r4                                          /* Force it out */
-                       addi    r4,r4,CACHE_LINE_SIZE           /* Point to the next line out */
-                       blt+    .L_pmap_copy_page_loop          /* Copy the whole page */
-                       
-                       sync                                                            /* Make sure they're all done */
-                       li              r4,PPC_PGBYTES-CACHE_LINE_SIZE  /* Point to the end of the page */
-
-invalinst:     
-                       subic.  r5,r4,CACHE_LINE_SIZE           /* Point to the next one */
-                       icbi    r4, r8                                          /* Trash the i-cache */
-                       subi    r4,r5,CACHE_LINE_SIZE           /* Point to the next one */
-                       icbi    r5, r8                                          /* Trash the i-cache */
-                       bgt+    invalinst                                       /* Keep going until we do the page... */
-       
-                       rlwimi  r7,r9,0,MSR_DR_BIT,MSR_DR_BIT   ; Set DDAT if on
-                       sync                                                            ; Make sure all invalidates done
-                       
-                       mtmsr   r7                                                      ; Set DDAT correctly
-                       isync           
-                       
-                       lfd             f0,FM_SIZE+0(r1)                        ; Restore an FP register
-                       lfd             f1,FM_SIZE+8(r1)                        ; Restore an FP register
-                       lfd             f2,FM_SIZE+16(r1)                       ; Restore an FP register
-                       lfd             f3,FM_SIZE+24(r1)                       ; Restore an FP register
-                       
-                       lwz             r1,0(r1)                                        ; Pop up the stack
-       
-                       mtmsr   r9                                                      ; Turn off FPU now and maybe rupts back on
-                       isync                                                           
-                       blr
-               
-#if 0
-;
-;                      This is not very optimal.  We just do it here for a test of 
-;                      Altivec in the kernel.
-;
-wegotaltivec:
-                       mfmsr   r9                                                      ; Get the MSR
-                       lis             r8,hi16(0xC0000000)                     ; Make sure we keep the first 2 vector registers
-                       rlwinm  r7,r9,0,MSR_EE_BIT+1,MSR_EE_BIT-1       ; Disable interruptions
-                       lis             r6,lo16(2*256+128)                      ; Specify 128 blocks of 2 vectors each
-                       rlwinm  r7,r7,0,MSR_DR_BIT+1,MSR_DR_BIT-1       ; Clear the DDAT bit
-                       ori             r6,r6,32                                        ; Set a 32-byte stride
-                       mtsprg  256,r8                                          ; Set VRSave
-                       mtmsr   r7                                                      ; Disable rupts and turn xlate off
-                       isync
-       
-                       addi    r11,r3,4096                                     ; Point to the next page
-                       li              r10,16                                          ; Get vector size
-
-avmovepg:      lvxl    v0,br0,r3                                       ; Get first half of line
-                       dcba    br0,r4                                          ; Allocate output
-                       lvxl    v1,r10,r3                                       ; Get second half of line
-                       stvxl   v0,br0,r4                                       ; Save first half of line
-                       addi    r3,r3,32                                        ; Point to the next line
-                       icbi    br0,r4                                          ; Make the icache go away also
-                       stvxl   v1,r10,r4                                       ; Save second half of line
-                       cmplw   r3,r11                                          ; Have we reached the next page?
-                       dcbst   br0,r4                                          ; Make sure the line is on its way out
-                       addi    r4,r4,32                                        ; Point to the next line
-                       blt+    avmovepg                                        ; Move the next line...
-                       
-                       li              r8,0                                            ; Clear this
-                       sync                                                            ; Make sure all the memory stuff is done
-                       mtsprg  256,r8                                          ; Show we are not using VRs any more
-                       mtmsr   r9                                                      ; Translation and interruptions back on
-                       isync
-                       blr
-#endif
+               lis             r2,hi16(MASK(MSR_VEC))                  ; Get the vector flag
+        mflr   r0                                                              // get return
+               ori             r2,r2,lo16(MASK(MSR_FP))                ; Add the FP flag
+               stw             r0,8(r1)                                                // save
+        stwu   r1,-kSFSize(r1)                                 // set up a stack frame for VRs or FPRs
+        mfmsr  r11                                                             // save MSR at entry
+        mfsprg r10,2                                                   // get feature flags
+        andc   r11,r11,r2                                              // Clear out vec and fp
+        ori            r2,r2,lo16(MASK(MSR_EE))                // Get EE on also
+        andc   r2,r11,r2                                               // Clear out EE as well
+        mtcrf  0x02,r10                                                // we need to test pf64Bit
+        ori            r2,r2,MASK(MSR_FP)                              // must enable FP for G3...
+        mtcrf  0x80,r10                                                // we need to test pfAltivec too
+        oris   r2,r2,hi16(MASK(MSR_VEC))               // enable altivec for G4 (ignored if G3)
+        mtmsr  r2                                                              // turn EE off, FP and VEC on
+        isync
+        bt++   pf64Bitb,pmap_copy_64                   // skip if 64-bit processor (only they take hint)
+               slwi    r3,r3,12                                                // get page address from page num
+               slwi    r4,r4,12                                                // get page address from page num
+        rlwinm r12,r2,0,MSR_DR_BIT+1,MSR_DR_BIT-1      // get ready to turn off DR
+        bt             pfAltivecb,pmap_copy_g4                 // altivec but not 64-bit means G4
+        
+        
+        // G3 -- copy using FPRs
+        
+        stfd   f0,FM_SIZE+0(r1)                                // save the 4 FPRs we use to copy
+        stfd   f1,FM_SIZE+8(r1)
+        li             r5,PPC_PGBYTES/32                               // count of cache lines in a page
+        stfd   f2,FM_SIZE+16(r1)
+        mtctr  r5
+        stfd   f3,FM_SIZE+24(r1)
+        mtmsr  r12                                                             // turn off DR after saving FPRs on stack
+        isync
+        
+pmap_g3_copy_loop:                                                             // loop over 32-byte cache lines
+        dcbz   0,r4                                                    // avoid read of dest line
+        lfd            f0,0(r3)
+        lfd            f1,8(r3)
+        lfd            f2,16(r3)
+        lfd            f3,24(r3)
+        addi   r3,r3,32
+        stfd   f0,0(r4)
+        stfd   f1,8(r4)
+        stfd   f2,16(r4)
+        stfd   f3,24(r4)
+        dcbst  0,r4                                                    // flush dest line to RAM
+        addi   r4,r4,32
+        bdnz   pmap_g3_copy_loop
+        
+        sync                                                                   // wait for stores to take
+        subi   r4,r4,PPC_PGBYTES                               // restore ptr to destination page
+        li             r6,PPC_PGBYTES-32                               // point to last line in page
+pmap_g3_icache_flush:
+        subic. r5,r6,32                                                // more to go?
+        icbi   r4,r6                                                   // flush another line in icache
+        subi   r6,r5,32                                                // get offset to next line
+        icbi   r4,r5
+        bne            pmap_g3_icache_flush
+        
+        sync
+        mtmsr  r2                                                              // turn DR back on
+        isync
+        lfd            f0,FM_SIZE+0(r1)                                // restore the FPRs
+        lfd            f1,FM_SIZE+8(r1)
+        lfd            f2,FM_SIZE+16(r1)
+        lfd            f3,FM_SIZE+24(r1)        
+        
+        b              pmap_g4_restore                                 // restore MSR and done
+
+        
+        // G4 -- copy using VRs
+
+pmap_copy_g4:                                                                  // r2=(MSR-EE), r12=(r2-DR), r10=features, r11=old MSR
+        la             r9,FM_SIZE+16(r1)                               // place where we save VRs to r9
+        li             r5,16                                                   // load x-form offsets into r5-r9
+        li             r6,32                                                   // another offset
+        stvx   v0,0,r9                                                 // save some VRs so we can use to copy
+        li             r7,48                                                   // another offset
+        stvx   v1,r5,r9
+        li             r0,PPC_PGBYTES/64                               // we loop over 64-byte chunks
+        stvx   v2,r6,r9
+        mtctr  r0
+        li             r8,96                                                   // get look-ahead for touch
+        stvx   v3,r7,r9
+        li             r9,128
+        mtmsr  r12                                                             // now we've saved VRs on stack, turn off DR
+        isync                                                                  // wait for it to happen
+        b              pmap_g4_copy_loop
+        
+        .align 5                                                               // align inner loops
+pmap_g4_copy_loop:                                                             // loop over 64-byte chunks
+        dcbt   r3,r8                                                   // touch 3 lines ahead
+        nop                                                                            // avoid a 17-word loop...
+        dcbt   r3,r9                                                   // touch 4 lines ahead
+        nop                                                                            // more padding
+        dcba   0,r4                                                    // avoid pre-fetch of 1st dest line
+        lvx            v0,0,r3                                                 // offset 0
+        lvx            v1,r5,r3                                                // offset 16
+        lvx            v2,r6,r3                                                // offset 32
+        lvx            v3,r7,r3                                                // offset 48
+        addi   r3,r3,64
+        dcba   r6,r4                                                   // avoid pre-fetch of 2nd line
+        stvx   v0,0,r4                                                 // offset 0
+        stvx   v1,r5,r4                                                // offset 16
+        stvx   v2,r6,r4                                                // offset 32
+        stvx   v3,r7,r4                                                // offset 48
+        dcbf   0,r4                                                    // push line 1
+        dcbf   r6,r4                                                   // and line 2
+        addi   r4,r4,64
+        bdnz   pmap_g4_copy_loop
+
+        sync                                                                   // wait for stores to take
+        subi   r4,r4,PPC_PGBYTES                               // restore ptr to destination page
+        li             r8,PPC_PGBYTES-32                               // point to last line in page
+pmap_g4_icache_flush:
+        subic. r9,r8,32                                                // more to go?
+        icbi   r4,r8                                                   // flush from icache
+        subi   r8,r9,32                                                // get offset to next line
+        icbi   r4,r9
+        bne            pmap_g4_icache_flush
+        
+        sync
+        mtmsr  r2                                                              // turn DR back on
+        isync
+        la             r9,FM_SIZE+16(r1)                               // get base of VR save area
+        lvx            v0,0,r9                                                 // restore the VRs
+        lvx            v1,r5,r9
+        lvx            v2,r6,r9
+        lvx            v3,r7,r9        
+        
+pmap_g4_restore:                                                               // r11=MSR
+        mtmsr  r11                                                             // restore entry MSR: EE as on entry, VEC and FP off
+        isync                                                                  // wait for it to happen
+        addi   r1,r1,kSFSize                                   // pop off our stack frame
+        lwz            r0,8(r1)                                                // restore return address
+        mtlr   r0
+        blr
+        
+        
+        // 64-bit/128-byte processor: copy one page using VRs v0-v7, 128 bytes per loop pass.
+        // Runs with DR off and SF on; the MSR image in r2 is reloaded once the copy is done.
+pmap_copy_64:                                                                  // r3/r4=source/dest page numbers, r10=features, r11=old MSR
+               sldi    r3,r3,12                                                // r3 <- source page address (page num << 12)
+               sldi    r4,r4,12                                                // r4 <- dest page address (page num << 12)
+               la              r9,FM_SIZE+16(r1)                               // get base of VR save area
+        li             r5,16                                                   // load x-form offsets 16/32/48 into r5-r7
+        li             r6,32                                                   // another offset
+        bf             pfAltivecb,pmap_novmx_copy              // altivec suppressed: copy with GPRs instead
+        stvx   v0,0,r9                                                 // save 8 VRs (v0-v7) so we can copy without bubbles
+        stvx   v1,r5,r9
+        li             r7,48                                                   // another offset
+        li             r0,PPC_PGBYTES/128                              // we loop over 128-byte chunks
+        stvx   v2,r6,r9
+        stvx   v3,r7,r9
+        addi   r9,r9,64                                                // advance base ptr so we can store another 4
+        mtctr  r0                                                              // CTR <- number of 128-byte chunks in a page
+        li             r0,MASK(MSR_DR)                                 // get DR bit
+        stvx   v4,0,r9
+        stvx   v5,r5,r9
+        andc   r12,r2,r0                                               // r12 <- r2 with the DR bit turned off
+        li             r0,1                                                    // get a 1 to slam into SF
+        stvx   v6,r6,r9
+        stvx   v7,r7,r9
+        rldimi r12,r0,63,MSR_SF_BIT                    // set SF bit (bit 0)
+        li             r8,-128                                                 // offset so we can reach back one line
+        mtmsrd r12                                                             // now we've saved VRs, turn DR off and SF on
+        isync                                                                  // wait for it to happen
+        dcbt128        0,r3,1                                                  // start a forward stream
+        b              pmap_64_copy_loop                                       // enter the aligned loop below
+        
+        .align 5                                                               // align inner loops
+pmap_64_copy_loop:                                                             // loop over 128-byte chunks
+        dcbz128        0,r4                                                    // avoid read of destination line
+        lvx            v0,0,r3                                                 // offset 0
+        lvx            v1,r5,r3                                                // offset 16
+        lvx            v2,r6,r3                                                // offset 32
+        lvx            v3,r7,r3                                                // offset 48
+        addi   r3,r3,64                                                // don't have enough GPRs so add 64 2x
+        lvx            v4,0,r3                                                 // offset 64
+        lvx            v5,r5,r3                                                // offset 80
+        lvx            v6,r6,r3                                                // offset 96
+        lvx            v7,r7,r3                                                // offset 112
+        addi   r3,r3,64                                                // r3 now points at the next source chunk
+        stvx   v0,0,r4                                                 // offset 0
+        stvx   v1,r5,r4                                                // offset 16
+        stvx   v2,r6,r4                                                // offset 32
+        stvx   v3,r7,r4                                                // offset 48
+        addi   r4,r4,64                                                // second half of the destination chunk
+        stvx   v4,0,r4                                                 // offset 64
+        stvx   v5,r5,r4                                                // offset 80
+        stvx   v6,r6,r4                                                // offset 96
+        stvx   v7,r7,r4                                                // offset 112
+        addi   r4,r4,64                                                // r4 now points at the next destination chunk
+        dcbf   r8,r4                                                   // flush the line we just wrote (r8=-128 reaches back to it)
+        bdnz   pmap_64_copy_loop                                       // next chunk until CTR runs out
+
+        sync                                                                   // wait for stores to take
+        subi   r4,r4,PPC_PGBYTES                               // restore ptr to destination page
+        li             r8,PPC_PGBYTES-128                              // point to last line in page
+pmap_64_icache_flush:                                                  // two lines per pass, from the end of the page down to offset 0
+        subic. r9,r8,128                                               // more to go? (r9 <- next lower line offset, sets cr0)
+        icbi   r4,r8                                                   // flush from icache
+        subi   r8,r9,128                                               // get offset to next line
+        icbi   r4,r9                                                   // flush the lower line of the pair
+        bne            pmap_64_icache_flush                    // loop until offset 0 has been flushed
+        
+        sync
+        mtmsrd r2                                                              // turn DR back on, SF off
+        isync
+        la             r9,FM_SIZE+16(r1)                               // get base address of VR save area on stack
+        lvx            v0,0,r9                                                 // restore the VRs (r5-r7 still hold 16/32/48)
+        lvx            v1,r5,r9
+        lvx            v2,r6,r9
+        lvx            v3,r7,r9
+        addi   r9,r9,64                                                // step to the second group of four saved VRs
+        lvx            v4,0,r9
+        lvx            v5,r5,r9
+        lvx            v6,r6,r9
+        lvx            v7,r7,r9
+
+        b              pmap_g4_restore                                 // restore lower half of MSR and return
+
+ //            Copy on 64-bit without VMX: the same page copy, done with GPR ld/std pairs.
+ //            Entered from pmap_copy_64 with r3/r4 already shifted to page addresses.
+ //            The copy loop uses r0, r5-r10 and r12 as data registers.
+
+pmap_novmx_copy:                                                               // r2=MSR image to put back when done
+               li              r0,PPC_PGBYTES/128                              // we loop over 128-byte chunks
+               mtctr   r0                                                              // CTR <- number of 128-byte chunks in a page
+               li              r0,MASK(MSR_DR)                                 // get DR bit
+               andc    r12,r2,r0                                               // turn off DR bit
+               li              r0,1                                                    // get a 1 to slam into SF
+               rldimi  r12,r0,63,MSR_SF_BIT                    // set SF bit (bit 0)
+               mtmsrd  r12                                                             // turn DR off and SF on (no VRs are saved on this path)
+               isync                                                                   // wait for it to happen
+               dcbt128 0,r3,1                                                  // start a forward stream
+       
+pmap_novmx_copy_loop:                                                  // loop over 128-byte cache lines
+        dcbz128        0,r4                                                    // avoid read of dest line
+        
+        ld             r0,0(r3)                                                // Load half a line
+        ld             r12,8(r3)
+        ld             r5,16(r3)
+        ld             r6,24(r3)
+        ld             r7,32(r3)
+        ld             r8,40(r3)
+        ld             r9,48(r3)
+        ld             r10,56(r3)                                              // r10 (features at entry) is reused as data
+        
+        std            r0,0(r4)                                                // Store half a line
+        std            r12,8(r4)
+        std            r5,16(r4)
+        std            r6,24(r4)
+        std            r7,32(r4)
+        std            r8,40(r4)
+        std            r9,48(r4)
+        std            r10,56(r4)
+        
+        ld             r0,64(r3)                                               // Load half a line
+        ld             r12,72(r3)
+        ld             r5,80(r3)
+        ld             r6,88(r3)
+        ld             r7,96(r3)
+        ld             r8,104(r3)
+        ld             r9,112(r3)
+        ld             r10,120(r3)
+        
+        addi   r3,r3,128                                               // advance source to the next line
+        std            r0,64(r4)                                               // Store half a line
+        std            r12,72(r4)
+        std            r5,80(r4)
+        std            r6,88(r4)
+        std            r7,96(r4)
+        std            r8,104(r4)
+        std            r9,112(r4)
+        std            r10,120(r4)
+        
+        dcbf   0,r4                                                    // flush the line we just wrote
+               addi    r4,r4,128                                               // advance dest to the next line
+        bdnz   pmap_novmx_copy_loop                            // next line until CTR runs out
+
+        sync                                                                   // wait for stores to take
+        subi   r4,r4,PPC_PGBYTES                               // restore ptr to destination page
+        li             r8,PPC_PGBYTES-128                              // point to last line in page
+
+pmap_novmx_icache_flush:                                               // two lines per pass, from the end of the page down to offset 0
+        subic. r9,r8,128                                               // more to go? (r9 <- next lower line offset, sets cr0)
+        icbi   r4,r8                                                   // flush from icache
+        subi   r8,r9,128                                               // get offset to next line
+        icbi   r4,r9                                                   // flush the lower line of the pair
+        bne            pmap_novmx_icache_flush                 // loop until offset 0 has been flushed
+        
+        sync
+        mtmsrd r2                                                              // turn DR back on, SF off
+        isync
+
+        b              pmap_g4_restore                                 // restore lower half of MSR and return
+
+
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>   
                
                
+// Stack frame format used by copyin, copyout, copyinstr and copyoutstr.
+// These routines all run on both 32-bit and 64-bit machines, though because they are called
+// by the BSD kernel they are always in 32-bit mode when entered.  The mapped ptr returned
+// by MapUserMemoryWindow is nevertheless 64 bits wide on 64-bit machines, so take care not
+// to use compare instructions on this ptr.  The mapped ptr is kept globally in r31, so there
+// is no need to store or load it, which would be mode-dependent operations since it could be
+// 32 or 64 bits.
+
+#define        kkFrameSize     (FM_SIZE+32)
 
 
-       
+#define        kkBufSize       (FM_SIZE+0)
+#define        kkCR3           (FM_SIZE+4)
+#define        kkSource        (FM_SIZE+8)
+#define        kkDest          (FM_SIZE+12)
+#define        kkCountPtr      (FM_SIZE+16)
+#define        kkR31Save       (FM_SIZE+20)
+#define        kkThrErrJmp     (FM_SIZE+24)
+// Nonvolatile CR bits (12-15, i.e. the cr3 field) that we use as flags; note that kkZero shares bit 15 with kkString.
 
 
+#define        kk64bit         12
+#define        kkNull          13
+#define        kkIn            14
+#define        kkString        15
+#define        kkZero          15
+
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 /*
  * int
 /*
  * int
- * copyin(src, dst, count)
- *     vm_offset_t     src;
- *     vm_offset_t     dst;
- *     int             count;
+ * copyoutstr(src, dst, maxcount, count)
+ *     vm_offset_t     src;        // r3
+ *     addr64_t        dst;        // r4 and r5
+ *     vm_size_t       maxcount;   // r6
+ *     vm_size_t*      count;      // r7
  *
  *
+ * Set *count to the number of bytes copied.
  */
 
  */
 
-ENTRY2(copyin, copyinmsg, TAG_NO_FRAME_USED)
-
-/* Preamble allowing us to call a sub-function */
-               mflr    r0
-               stw             r0,FM_LR_SAVE(r1)
-               stwu    r1,-(FM_SIZE+16)(r1)
-               
-               cmpli   cr0,r5,0
-               ble-    cr0,.L_copyinout_trivial
+ENTRY(copyoutstr, TAG_NO_FRAME_USED)
+        mfcr   r2,0x10                         // save caller's cr3, which we use for flags
+        mr      r10,r4                          // move high word of 64-bit user address to r10
+        li             r0,0                                                    // r0 <- 0, stored to *count below
+        crset  kkString                                                // flag as a string op
+        mr      r11,r5                          // move low word of 64-bit user address to r11
+        stw            r0,0(r7)                                                // initialize #bytes moved (*count = 0)
+        crclr  kkIn                                                    // flag as copyout
+        b              copyJoin                                                // join common setup, which clears kkNull
 
 
-/* we know we have a valid copyin to do now */
-/* Set up thread_recover in case we hit an illegal address */
-               
-               mfsprg  r8,1                                                    /* Get the current act */ 
-               lwz             r10,ACT_THREAD(r8)
-               lis             r11,hi16(.L_copyinout_error)
-               lwz             r8,ACT_VMMAP(r8)
-               ori             r11,r11,lo16(.L_copyinout_error)
-               add             r9,r3,r5                                                /* Get the end of the source */
-               lwz             r8,VMMAP_PMAP(r8)                               ; Get the pmap
-               rlwinm  r12,r3,6,26,29                                  ; Get index to the segment slot
-               subi    r9,r9,1                                                 /* Make sure we don't go too far */
-               add             r8,r8,r12                                               ; Start indexing to the segment value
-               stw             r11,THREAD_RECOVER(r10)
-               xor             r9,r9,r3                                                /* Smoosh 'em together */
-               lwz             r8,PMAP_SEGS(r8)                                ; Get the source SR value
-               rlwinm. r9,r9,0,1,3                                             /* Top nybble equal? */
-               mtsr    SR_COPYIN,r8                                    ; Set the SR
-               isync
-#if 0
-               lis             r0,HIGH_ADDR(EXT(dbgRegsCall))  /* (TEST/DEBUG) */      
-               ori             r0,r0,LOW_ADDR(EXT(dbgRegsCall))        /* (TEST/DEBUG) */      
-               sc                                                                              /* (TEST/DEBUG) */
-#endif
-       
-/* For optimization, we check if the copyin lies on a segment
- * boundary. If it doesn't, we can use a simple copy. If it
- * does, we split it into two separate copies in some C code.
- */
-       
-               bne-    .L_call_copyin_multiple                 /* Nope, we went past the segment boundary... */
 
 
-               rlwinm  r3,r3,0,4,31
-               oris    r3,r3,(SR_COPYIN_NUM << (28-16))        /* Set the copyin segment as the source */
-       
-               bl              EXT(bcopy)
-               
-/* Now that copyin is done, we don't need a recovery point */
-
-               addi    r1,r1,FM_SIZE+16
-               mfsprg  r6,1                                                    /* Get the current act */ 
-               lwz             r10,ACT_THREAD(r6)
-               li              r3,0
-               lwz             r0,FM_LR_SAVE(r1)
-               stw             r3,THREAD_RECOVER(r10)                  /* Clear recovery */
-               mtlr    r0
-               blr
-       
-/* we get here via the exception handler if an illegal
- * user memory reference was made.
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+/*
+ * int
+ * copyinstr(src, dst, maxcount, count)
+ *     addr64_t        src;        // r3 and r4
+ *     vm_offset_t     dst;        // r5
+ *     vm_size_t       maxcount;   // r6
+ *     vm_size_t*      count;      // r7
+ *
+ * Set *count to the number of bytes copied
+ * If dst == NULL, don't copy, just count bytes.
+ * Only currently called from klcopyinstr. 
  */
  */
-.L_copyinout_error:
-
-/* Now that copyin is done, we don't need a recovery point */
-       
-               mfsprg  r6,1                                                    /* Get the current act */ 
-               addi    r1,r1,FM_SIZE+16
-               lwz             r10,ACT_THREAD(r6)
-               li              r4,0
-               lwz             r0,FM_LR_SAVE(r1)
-               stw             r4,THREAD_RECOVER(r10)                  /* Clear recovery */
-               mtlr    r0
-               li              r3,EFAULT                                                       ; Indicate error (EFAULT) 
-               blr
 
 
-.L_copyinout_trivial:
-       /* The copyin/out was for either 0 bytes or a negative
-        * number of bytes, return an appropriate value (0 == SUCCESS).
-        * cr0 still contains result of comparison of len with 0.
-        */
-       li      r3,     0
-       beq+    cr0,    .L_copyinout_negative
-       li      r3,     1
-.L_copyinout_negative:
+ENTRY(copyinstr, TAG_NO_FRAME_USED)
+        mfcr   r2,0x10                         // save caller's cr3, which we use for flags
+        cmplwi r5,0                                                    // dst==NULL? (result lands in cr0)
+        mr      r10,r3                          // move high word of 64-bit user address to r10
+        li             r0,0                                                    // r0 <- 0, stored to *count below
+        crset  kkString                                                // flag as a string op
+        mr      r11,r4                          // move low word of 64-bit user address to r11
+        crmove kkNull,cr0_eq                                   // remember if (dst==NULL)
+        stw            r0,0(r7)                                                // initialize #bytes moved (*count = 0)
+        crset  kkIn                                                    // flag as copyin (rather than copyout)
+        b              copyJoin1                                               // skip over the "crclr kkNull" so the flag set above survives
 
 
-       /* unwind the stack */
-       addi    r1,     r1,     FM_SIZE+16
-       lwz     r0,     FM_LR_SAVE(r1)
-       mtlr    r0
 
 
-       blr
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+/*
+ * int
+ * copyout(src, dst, count)
+ *     vm_offset_t     src;        // r3
+ *     addr64_t        dst;        // r4 and r5
+ *     size_t          count;      // r6
+ */
 
 
-.L_call_copyin_multiple:
+                       .align  5
+                       .globl  EXT(copyout)
+                       .globl  EXT(copyoutmsg)
 
 
-       /* unwind the stack */
-       addi    r1,     r1,     FM_SIZE+16
-       lwz     r0,     FM_LR_SAVE(r1)
-       mtlr    r0
+LEXT(copyout)
+LEXT(copyoutmsg)
 
 
-       b       EXT(copyin_multiple)                            /* not a call - a jump! */
+#if INSTRUMENT
+        mfspr  r12,pmc1                                                ; INSTRUMENT - saveinstr[12] - Take stamp at copyout
+        stw            r12,0x6100+(12*16)+0x0(0)               ; INSTRUMENT - Save it
+        mfspr  r12,pmc2                                                ; INSTRUMENT - Get stamp
+        stw            r12,0x6100+(12*16)+0x4(0)               ; INSTRUMENT - Save it
+        mfspr  r12,pmc3                                                ; INSTRUMENT - Get stamp
+        stw            r12,0x6100+(12*16)+0x8(0)               ; INSTRUMENT - Save it
+        mfspr  r12,pmc4                                                ; INSTRUMENT - Get stamp
+        stw            r12,0x6100+(12*16)+0xC(0)               ; INSTRUMENT - Save it
+#endif                 
+        mfcr   r2,0x10                         // save caller's cr3, which we use for flags
+        mr      r10,r4                          // move high word of 64-bit user address (dst) to r10
+        crclr  kkString                                                // not a string version
+        mr      r11,r5                          // move low word of 64-bit user address (dst) to r11
+        crclr  kkIn                                                    // flag as copyout
+        b              copyJoin                                                // join common setup, which clears kkNull
+        
 
 
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 /*
  * int
 /*
  * int
- * copyout(src, dst, count)
- *     vm_offset_t     src;
- *     vm_offset_t     dst;
- *     int             count;
- *
+ * copyin(src, dst, count)
+ *     addr64_t        src;        // r3 and r4
+ *     vm_offset_t     dst;        // r5
+ *     size_t          count;      // r6
  */
 
  */
 
-ENTRY2(copyout, copyoutmsg, TAG_NO_FRAME_USED)
 
 
-/* Preamble allowing us to call a sub-function */
+                       .align  5
+                       .globl  EXT(copyin)
+                       .globl  EXT(copyinmsg)
 
 
-               mflr    r0
-               stw             r0,FM_LR_SAVE(r1)
-               stwu    r1,-(FM_SIZE+16)(r1)
-               
-#if 0
-               stw             r3,FM_SIZE+0(r1)                                /* (TEST/DEBUG) */
-               stw             r4,FM_SIZE+4(r1)                                /* (TEST/DEBUG) */
-               stw             r5,FM_SIZE+8(r1)                                /* (TEST/DEBUG) */
-               mr              r6,r0                                                   /* (TEST/DEBUG) */
-               
-               bl              EXT(tracecopyout)                               /* (TEST/DEBUG) */
-               
-               lwz             r3,FM_SIZE+0(r1)                                /* (TEST/DEBUG) */
-               lwz             r4,FM_SIZE+4(r1)                                /* (TEST/DEBUG) */
-               lwz             r5,FM_SIZE+8(r1)                                /* (TEST/DEBUG) */
-#endif
-       
-               cmpli   cr0,r5,0
-               ble-    cr0,.L_copyinout_trivial
-/* we know we have a valid copyout to do now */
-/* Set up thread_recover in case we hit an illegal address */
-               
+LEXT(copyin)
+LEXT(copyinmsg)
 
 
-               mfsprg  r8,1                                                    /* Get the current act */
-               lwz             r10,ACT_THREAD(r8)
-               lis             r11,HIGH_ADDR(.L_copyinout_error)
-               lwz             r8,ACT_VMMAP(r8)
-               rlwinm  r12,r4,6,26,29                                  ; Get index to the segment slot
-               ori             r11,r11,LOW_ADDR(.L_copyinout_error)
-               add             r9,r4,r5                                                /* Get the end of the destination */
-               lwz             r8,VMMAP_PMAP(r8)
-               subi    r9,r9,1                                                 /* Make sure we don't go too far */
-               add             r8,r8,r12                                               ; Start indexing to the segment value
-               stw             r11,THREAD_RECOVER(r10)
-               xor             r9,r9,r4                                                /* Smoosh 'em together */
-               lwz             r8,PMAP_SEGS(r8)                                ; Get the source SR value
-               rlwinm. r9,r9,0,1,3                                             /* Top nybble equal? */
-               mtsr    SR_COPYIN,r8
-               isync
-       
-       
-/* For optimisation, we check if the copyout lies on a segment
- * boundary. If it doesn't, we can use a simple copy. If it
- * does, we split it into two separate copies in some C code.
- */
-       
-               bne-    .L_call_copyout_multiple                /* Nope, we went past the segment boundary... */
+        mfcr   r2,0x10                         // save caller's cr3, which we use for flags
+        mr      r10,r3                          // move high word of 64-bit user address (src) to r10
+        crclr  kkString                                                // not a string version
+        mr      r11,r4                          // move low word of 64-bit user address (src) to r11
+        crset  kkIn                                                    // flag as copyin
+        // no branch needed: copyin falls through into copyJoin
+        
+// Common code to handle setup for all the copy variants:
+//      r2 = caller's cr3
+//      r3 = source if copyout
+//      r5 = dest if copyin
+//      r6 = buffer length or count
+//      r7 = count output ptr (if kkString set)
+//     r10 = high word of 64-bit user-space address (source if copyin, dest if copyout)
+//     r11 = low word of 64-bit user-space address
+//     cr3 = kkIn and kkString flags (kkNull is valid only when entering at copyJoin1)
 
 
-               rlwinm  r4,r4,0,4,31
-               oris    r4,r4,(SR_COPYIN_NUM << (28-16))        /* Set the copyin segment as the source */
-       
-               bl      EXT(bcopy)
-               
-/* Now that copyout is done, we don't need a recovery point */
-               mfsprg  r6,1                                                    /* Get the current act */
-               addi    r1,r1,FM_SIZE+16
-               lwz             r10,ACT_THREAD(r6)
-               li              r3,0
-               lwz             r0,FM_LR_SAVE(r1)
-               stw             r3,THREAD_RECOVER(r10)                  /* Clear recovery */
+copyJoin:                                                                              // copyin (by fall-through), copyout and copyoutstr enter here
+        crclr  kkNull                                                  // (dst==NULL) convention not used with this call
+copyJoin1:                                                                             // copyinstr enters here with kkNull already decided
+               mflr    r0                                                              // get return address
+        cmplwi r6,0                                                    // buffer length 0?
+        lis            r9,0x1000                                               // r9 <- 0x10000000 (256MB)
+               stw             r0,FM_LR_SAVE(r1)                               // save return
+        cmplw  cr1,r6,r9                                               // buffer length > 256MB ? (tested after the frame is built)
+        mfsprg r8,2                                                    // get the features
+        beq--  copyinout_0                                             // 0 length is degenerate case (taken before the frame is pushed)
+               stwu    r1,-kkFrameSize(r1)                             // set up stack frame
+        stw            r2,kkCR3(r1)                    // save caller's cr3, which we use for flags
+        mtcrf  0x02,r8                                                 // move pf64Bit to cr6
+        stw            r3,kkSource(r1)                                 // save args across MapUserMemoryWindow
+        stw            r5,kkDest(r1)
+        stw            r6,kkBufSize(r1)
+        crmove kk64bit,pf64Bitb                                // remember if this is a 64-bit processor
+        stw            r7,kkCountPtr(r1)
+        stw            r31,kkR31Save(r1)                               // we use r31 globally for mapped user ptr
+        li             r31,0                                                   // no mapped ptr yet
+        
+        
+// Handle buffer length > 256MB.  This is an error (ENAMETOOLONG) on copyin and copyout.
+// The string ops are passed -1 lengths by some BSD callers, so for them we silently clamp
+// the buffer length to 256MB.  This isn't an issue if the string is less than 256MB
+// (as most are!), but if they are >256MB we eventually return ENAMETOOLONG.  This restriction
+// is due to MapUserMemoryWindow; we don't want to consume more than two segments for
+// the mapping. 
+
+        ble++  cr1,copyin0                                             // skip if buffer length <= 256MB
+        bf             kkString,copyinout_too_big              // error if not string op
+        mr             r6,r9                                                   // silently clamp buffer length to 256MB
+        stw            r9,kkBufSize(r1)                                // update saved copy too
+
+
+// Set up thread_recover in case we hit an illegal address.
+
+copyin0:
+               mfsprg  r8,1                                                    // Get the current thread 
+               lis             r2,hi16(copyinout_error)                // r2 <- address of copyinout_error (high half)
+               ori             r2,r2,lo16(copyinout_error)             // ...and low half
+               lwz             r4,THREAD_RECOVER(r8)                   // r4 <- recovery ptr that was installed before us
+               lwz             r3,ACT_VMMAP(r8)                                // r3 <- vm_map virtual address
+               stw             r2,THREAD_RECOVER(r8)                   // install copyinout_error as the recovery ptr
+               stw             r4,kkThrErrJmp(r1)                              // keep the previous one in the frame; copyinx puts it back
+
+
+// Map user segment into kernel map, turn on 64-bit mode.  At this point:
+//             r3 = vm map
+//             r6 = buffer length
+// r10/r11 = 64-bit user-space ptr (source if copyin, dest if copyout)
+//
+// When we call MapUserMemoryWindow, we pass:
+//      r3 = vm map ptr
+//   r4/r5 = 64-bit user space address as an addr64_t
+        
+        mr      r4,r10                          // copy user ptr into r4/r5
+        mr      r5,r11
+#if INSTRUMENT
+        mfspr  r12,pmc1                                                ; INSTRUMENT - saveinstr[13] - Take stamp before MapUserMemoryWindow
+        stw            r12,0x6100+(13*16)+0x0(0)               ; INSTRUMENT - Save it
+        mfspr  r12,pmc2                                                ; INSTRUMENT - Get stamp
+        stw            r12,0x6100+(13*16)+0x4(0)               ; INSTRUMENT - Save it
+        mfspr  r12,pmc3                                                ; INSTRUMENT - Get stamp
+        stw            r12,0x6100+(13*16)+0x8(0)               ; INSTRUMENT - Save it
+        mfspr  r12,pmc4                                                ; INSTRUMENT - Get stamp
+        stw            r12,0x6100+(13*16)+0xC(0)               ; INSTRUMENT - Save it
+#endif                 
+        bl             EXT(MapUserMemoryWindow)                // get r3/r4 <- 64-bit address in kernel map of user operand
+#if INSTRUMENT
+        mfspr  r12,pmc1                                                ; INSTRUMENT - saveinstr[14] - Take stamp after MapUserMemoryWindow
+        stw            r12,0x6100+(14*16)+0x0(0)               ; INSTRUMENT - Save it
+        mfspr  r12,pmc2                                                ; INSTRUMENT - Get stamp
+        stw            r12,0x6100+(14*16)+0x4(0)               ; INSTRUMENT - Save it
+        mfspr  r12,pmc3                                                ; INSTRUMENT - Get stamp
+        stw            r12,0x6100+(14*16)+0x8(0)               ; INSTRUMENT - Save it
+        mfspr  r12,pmc4                                                ; INSTRUMENT - Get stamp
+        stw            r12,0x6100+(14*16)+0xC(0)               ; INSTRUMENT - Save it
+#endif                 
+        mr             r31,r4                                                  // r31 <- mapped ptr into user space (may be 64-bit)
+        bf--   kk64bit,copyin1                                 // skip if a 32-bit processor
+               rldimi  r31,r3,32,0                                             // slam high-order bits into mapped ptr
+        mfmsr  r4                                                              // if 64-bit, turn on SF so we can use returned ptr
+        li             r0,1                                                    // get a 1 to insert into the SF position
+        rldimi r4,r0,63,MSR_SF_BIT                             // light bit 0
+        mtmsrd r4                                                              // turn on 64-bit mode
+        isync                                                                  // wait for mode to change
+        
+        
+// Load r3-r5, substituting mapped ptr as appropriate.
+
+copyin1:                                                                               // reload operands saved in the frame before the MapUserMemoryWindow call
+        lwz            r5,kkBufSize(r1)                                // restore length to copy
+        bf             kkIn,copyin2                                    // skip if copyout
+        lwz            r4,kkDest(r1)                                   // copyin: dest is kernel ptr
+        mr             r3,r31                                                  // source is mapped ptr
+        b              copyin3                                                 // operands are ready
+copyin2:                                                                               // handle copyout
+        lwz            r3,kkSource(r1)                                 // source is kernel buffer (r3 at entry)
+        mr             r4,r31                                                  // dest is mapped ptr into user space
+        
+        
+// Finally, all set up to copy:
+//             r3 = source ptr (mapped if copyin)
+//             r4 = dest ptr (mapped if copyout)
+//             r5 = length
+//        r31 = mapped ptr returned by MapUserMemoryWindow
+//        cr3 = kkIn, kkString, kk64bit, and kkNull flags
+
+copyin3:                                                                               // r3=source, r4=dest, r5=length
+        bt             kkString,copyString                             // handle copyinstr and copyoutstr
+        bl             EXT(bcopy)                                              // copyin and copyout: let bcopy do the work
+        li             r3,0                                                    // return success (falls through into copyinx)
+        
+        
+// Main exit point for copyin, copyout, copyinstr, and copyoutstr.  Also reached
+// from error recovery if we get a DSI accessing user space.  Clear recovery ptr, 
+// and pop off frame.
+//             r3 = 0, EFAULT, or ENAMETOOLONG
+
+copyinx: 
+        lwz            r2,kkCR3(r1)                    // get callers cr3
+               mfsprg  r6,1                                                    // Get the current thread 
+        bf--   kk64bit,copyinx1                                // skip if 32-bit processor
+        mfmsr  r12
+        rldicl r12,r12,0,MSR_SF_BIT+1                  // if 64-bit processor, turn 64-bit mode off
+        mtmsrd r12                                                             // turn SF off
+        isync                                                                  // wait for the mode to change
+copyinx1:
+               lwz             r0,FM_LR_SAVE+kkFrameSize(r1)   // get return address
+        lwz            r31,kkR31Save(r1)                               // restore callers r31
+        lwz            r4,kkThrErrJmp(r1)                              // load saved thread recover
+        addi   r1,r1,kkFrameSize                               // pop off our stack frame
                mtlr    r0
                mtlr    r0
+               stw             r4,THREAD_RECOVER(r6)                   // restore thread recover
+        mtcrf  0x10,r2                                                 // restore cr3
                blr
 
                blr
 
-.L_call_copyout_multiple:
-       /* unwind the stack */
-       addi    r1,     r1,     FM_SIZE+16
-       lwz     r0,     FM_LR_SAVE(r1)
-       mtlr    r0
 
 
-       b       EXT(copyout_multiple)                                   /* not a call - a jump! */
+/* We get here via the exception handler if an illegal
+ * user memory reference was made.  This error handler is used by
+ * copyin, copyout, copyinstr, and copyoutstr.  Registers are as
+ * they were at point of fault, so for example cr3 flags are valid.
+ */
+
+copyinout_error:
+        li             r3,EFAULT                                               // return error
+        b              copyinx
+
+copyinout_0:                                                                   // degenerate case: 0-length copy
+               mtcrf   0x10,r2                                                 // restore cr3
+        li             r3,0                                                    // return success
+        blr
+        
+copyinout_too_big:                                                             // degenerate case
+        mtcrf  0x10,r2                                                 // restore cr3
+        lwz            r1,0(r1)                                                // pop off stack frame
+        li             r3,ENAMETOOLONG
+        blr
+        
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+// Handle copyinstr and copyoutstr.  At this point the stack frame is set up,
+// the recovery ptr is set, the user's buffer is mapped, we're in 64-bit mode
+// if necessary, and:
+//             r3 = source ptr, mapped if copyinstr
+//             r4 = dest ptr, mapped if copyoutstr
+//             r5 = buffer length
+//        r31 = mapped ptr returned by MapUserMemoryWindow
+//     cr3 = kkIn, kkString, kkNull, and kk64bit flags
+// We do word copies unless the buffer is very short, then use a byte copy loop
+// for the leftovers if necessary.  The crossover at which the word loop becomes
+// faster is about seven bytes, counting the zero.
+//
+// We first must word-align the source ptr, in order to avoid taking a spurious
+// page fault.
+
+copyString:
+        cmplwi cr1,r5,15                                               // is buffer very short (fewer than 15 bytes)?
+        mr      r12,r3                          // remember ptr to 1st source byte
+        mtctr  r5                                                              // assuming short, set up loop count for bytes
+        blt--   cr1,copyinstr8                                 // too short for word loop
+        rlwinm  r2,r3,0,0x3                     // get byte offset of 1st byte within word
+        rlwinm  r9,r3,3,0x18                    // get bit offset of 1st byte within word
+        li      r7,-1
+        sub     r3,r3,r2                        // word-align source address
+        add     r6,r5,r2                        // get length starting at byte 0 in word
+        srw     r7,r7,r9                        // get mask for bytes in first word
+        srwi   r0,r6,2                                                 // get #words in buffer
+        lwz     r5,0(r3)                        // get aligned word with first source byte
+        lis            r10,hi16(0xFEFEFEFF)                    // load magic constants into r10 and r11
+        lis            r11,hi16(0x80808080)
+        mtctr  r0                                                              // set up word loop count
+        addi    r3,r3,4                         // advance past the source word
+        ori            r10,r10,lo16(0xFEFEFEFF)
+        ori            r11,r11,lo16(0x80808080)
+        orc     r8,r5,r7                        // map bytes preceding first source byte into 0xFF
+        bt--   kkNull,copyinstr5enter          // enter loop that just counts
+        
+// Special case 1st word, which has been 0xFF filled on left.  Note that we use
+// "and.", even though we execute both in 32 and 64-bit mode.  This is OK.
+
+        slw     r5,r5,r9                        // left justify payload bytes
+        add            r9,r10,r8                                               // r9 =  data + 0xFEFEFEFF
+        andc   r7,r11,r8                                               // r7 = ~data & 0x80808080
+               subfic  r0,r2,4                                                 // get r0 <- #payload bytes in 1st word
+        and.    r7,r9,r7                                               // if r7==0, then all bytes in r8 are nonzero
+        stw     r5,0(r4)                        // copy payload bytes to dest buffer
+        add            r4,r4,r0                                                // then point to next byte in dest buffer
+        bdnzt   cr0_eq,copyinstr6               // use loop that copies if 0 not found
+        
+        b              copyinstr7                      // 0 found (buffer can't be full)
+        
+        
+// Word loop(s).  They do a word-parallel search for 0s, using the following
+// non-obvious but very efficient test:
+//             y =  data + 0xFEFEFEFF
+//             z = ~data & 0x80808080
+// If (y & z)==0, then all bytes in dataword are nonzero.  There are two copies
+// of this loop, one that just counts and another that copies.
+//             r3 = ptr to next word of source (word aligned)
+//             r4 = ptr to next byte in buffer
+//      r6 = original buffer length (adjusted to be word origin)
+//     r10 = 0xFEFEFEFF
+//     r11 = 0x80808080
+//     r12 = ptr to 1st source byte (used to determine string length)
+
+        .align 5                                                               // align inner loops for speed
+copyinstr5:                                                                            // version that counts but does not copy
+        lwz     r8,0(r3)                                               // get next word of source
+        addi    r3,r3,4                         // advance past it
+copyinstr5enter:
+        add            r9,r10,r8                                               // r9 =  data + 0xFEFEFEFF
+        andc   r7,r11,r8                                               // r7 = ~data & 0x80808080
+        and.    r7,r9,r7                        // r7 = r9 & r7 ("." ok even in 64-bit mode)
+        bdnzt   cr0_eq,copyinstr5                              // if r7==0, then all bytes in r8 are nonzero
+
+        b              copyinstr7
+
+        .align 5                                                               // align inner loops for speed
+copyinstr6:                                                                            // version that counts and copies
+        lwz     r8,0(r3)                                               // get next word of source
+        addi    r3,r3,4                         // advance past it
+        addi   r4,r4,4                                                 // increment dest ptr while we wait for data
+        add            r9,r10,r8                                               // r9 =  data + 0xFEFEFEFF
+        andc   r7,r11,r8                                               // r7 = ~data & 0x80808080
+        and.    r7,r9,r7                        // r7 = r9 & r7 ("." ok even in 64-bit mode)
+        stw            r8,-4(r4)                                               // pack all 4 bytes into buffer
+        bdnzt  cr0_eq,copyinstr6                               // if r7==0, then all bytes are nonzero
+
+
+// Either 0 found or buffer filled.  The above algorithm has mapped nonzero bytes to 0
+// and 0 bytes to 0x80 with one exception: 0x01 bytes preceding the first 0 are also
+// mapped to 0x80.  We must mask out these false hits before searching for an 0x80 byte.
+//             r3 = word aligned ptr to next word of source (ie, r8==mem(r3-4))
+//      r6 = original buffer length (adjusted to be word origin)
+//      r7 = computed vector of 0x00 and 0x80 bytes
+//      r8 = original source word, coming from -4(r3), possibly padded with 0xFFs on left if 1st word
+//     r12 = ptr to 1st source byte (used to determine string length)
+//     cr0 = beq set iff 0 not found
+
+copyinstr7:
+        rlwinm r2,r8,7,0,31                                    // move 0x01 bits to 0x80 position
+               rlwinm  r6,r6,0,0x3                                             // mask down to partial byte count in last word
+        andc   r7,r7,r2                                                // turn off false hits from 0x0100 worst case
+        crnot  kkZero,cr0_eq                                   // 0 found iff cr0_eq is off
+        srwi    r7,r7,8                         // we want to count the 0 as a byte xferred
+               cmpwi   r6,0                                                    // any bytes left over in last word?
+        cntlzw r7,r7                                                   // now we can find the 0 byte (ie, the 0x80)
+        subi    r3,r3,4                         // back up r3 to point to 1st byte in r8
+        srwi   r7,r7,3                                                 // convert 8,16,24,32 to 1,2,3,4
+        add     r3,r3,r7                        // now r3 points one past 0 byte, or at 1st byte not xferred
+        bt++   kkZero,copyinstr10                              // 0 found, so done
+        
+        beq            copyinstr10                                             // r6==0, so buffer truly full
+        mtctr  r6                                                              // 0 not found, loop over r6 bytes
+        b              copyinstr8                                              // enter byte loop for last 1-3 leftover bytes
+        
+
+// Byte loop.  This is used for very small buffers and for the odd bytes left over
+// after searching and copying words at a time.
+//      r3 = ptr to next byte of source
+//      r4 = ptr to next dest byte
+//     r12 = ptr to first byte of source
+//     ctr = count of bytes to check
+    
+        .align 5                                                               // align inner loops for speed
+copyinstr8:                                                                            // loop over bytes of source
+        lbz            r0,0(r3)                                                // get next byte of source
+        addi   r3,r3,1
+        addi   r4,r4,1                                                 // increment dest addr whether we store or not
+        cmpwi  r0,0                                                    // the 0?
+        bt--   kkNull,copyinstr9                               // don't store if copyinstr with NULL ptr
+        stb            r0,-1(r4)
+copyinstr9:
+        bdnzf  cr0_eq,copyinstr8                               // loop if byte not 0 and more room in buffer
+        
+        crmove kkZero,cr0_eq                                   // remember if 0 found or buffer filled
+
+        
+// Buffer filled or 0 found.  Unwind and return.
+//      r3 = ptr to 1st source byte not transferred
+//     r12 = ptr to 1st source byte
+//     r31 = mapped ptr returned by MapUserMemoryWindow
+//     cr3 = kkZero set iff 0 found
 
 
+copyinstr10:
+        lwz            r9,kkCountPtr(r1)                               // get ptr to place to store count of bytes moved
+        sub     r2,r3,r12                       // compute #bytes copied (including the 0)
+        li             r3,0                                                    // assume success return status
+        stw            r2,0(r9)                                                // store #bytes moved
+        bt++   kkZero,copyinx                                  // we did find the 0 so return 0
+        li             r3,ENAMETOOLONG                                 // buffer filled
+        b              copyinx                                                 // join main exit routine
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 /*
 /*
- * boolean_t
- * copyinstr(src, dst, count, maxcount)
- *     vm_offset_t     src;
- *     vm_offset_t     dst;
- *     vm_size_t       maxcount; 
- *     vm_size_t*      count;
+ * int
+ * copypv(source, sink, size, which)
+ *     addr64_t        src;        // r3 and r4
+ *     addr64_t        dst;        // r5 and r6
+ *     size_t          size;           // r7
+ *     int                     which;          // r8
  *
  *
- * Set *count to the number of bytes copied
- * 
- * If dst == NULL, don't copy, just count bytes.
- * Only currently called from klcopyinstr. 
+ * Operand size bytes are copied from operand src into operand dst. The source and
+ * destination operand addresses are given as addr64_t, and may designate starting
+ * locations in physical or virtual memory in any combination except where both are
+ * virtual. Virtual memory locations may be in either the kernel or the current thread's
+ * address space. Operand size must be less than 256MB (2**28 bytes); larger sizes get EINVAL.
+ *
+ * Operation is controlled by operand which, which offers these options:
+ *             cppvPsrc : source operand is (1) physical or (0) virtual
+ *             cppvPsnk : destination operand is (1) physical or (0) virtual
+ *             cppvKmap : virtual operand is in (1) kernel or (0) current thread
+ *             cppvFsnk : (1) flush destination before and after transfer
+ *             cppvFsrc : (1) flush source before and after transfer
+ *             cppvNoModSnk : (1) don't set destination operand's changed bit(s)
+ *             cppvNoRefSrc : (1) don't set source operand's referenced bit(s)
+ *
+ * Implementation is now split into this new 64-bit path and the old path, hw_copypv_32().
+ * This section describes the operation of the new 64-bit path.
+ *
+ * The 64-bit path utilizes the more capacious 64-bit kernel address space to create a
+ * window in the kernel address space into all of physical RAM plus the I/O hole. Since
+ * the window's mappings specify the proper access policies for the underlying memory,
+ * the new path does not have to flush caches to avoid a cache paradox, so cppvFsnk
+ * and cppvFsrc are ignored. Physical operand addresses are relocated into the physical
+ * memory window, and are accessed with data relocation on. Virtual addresses are either
+ * within the kernel, or are mapped into the kernel address space through the user memory
+ * window. Because accesses to a virtual operand are performed with data relocation on,
+ * the new path does not have to translate the address, disable/enable interrupts, lock
+ * the mapping, or update referenced and changed bits.
+ *
+ * The IBM 970 (a.k.a. G5) processor treats real-mode accesses as guarded, so there is
+ * a substantial performance penalty for copypv operating in real mode. Utilizing the
+ * new 64-bit path, transfer performance increases >100% on the G5.
+ *
+ * The attentive reader may notice that mtmsrd ops are not followed by isync ops as 
+ * might be expected. The 970 follows PowerPC architecture version 2.01, which defines
+ * mtmsrd with L=0 as a context synchronizing op, so a following isync is no longer
+ * required.
+ *
+ * To keep things exciting, we develop 64-bit values in non-volatiles, but we also need
+ * to call 32-bit functions, which would lead to the high-order 32 bits of our values
+ * getting clobbered unless we do something special. So, we preserve our 64-bit non-volatiles
+ * in our own stack frame across calls to 32-bit functions.
+ *             
  */
 
  */
 
-ENTRY(copyinstr, TAG_NO_FRAME_USED)
+// Map operand which bits into non-volatile CR2 and CR3 bits.
+#define whichAlign     ((3+1)*4)
+#define whichMask      0x007F0000
+#define pvPsnk         (cppvPsnkb - whichAlign)
+#define pvPsrc         (cppvPsrcb - whichAlign)
+#define pvFsnk         (cppvFsnkb - whichAlign)
+#define pvFsrc         (cppvFsrcb - whichAlign)
+#define pvNoModSnk     (cppvNoModSnkb - whichAlign)
+#define pvNoRefSrc     (cppvNoRefSrcb - whichAlign)
+#define pvKmap         (cppvKmapb - whichAlign)
+#define pvNoCache      cr2_lt
 
 
-/* Preamble allowing us to call a sub-function */
-               mflr    r0
-               stw             r0,FM_LR_SAVE(r1)
-               stwu    r1,-(FM_SIZE+16)(r1)
+               .align  5
+               .globl  EXT(copypv)
 
 
-#if 0
-               stw             r3,FM_SIZE+0(r1)                                /* (TEST/DEBUG) */
-               stw             r4,FM_SIZE+4(r1)                                /* (TEST/DEBUG) */
-               stw             r5,FM_SIZE+8(r1)                                /* (TEST/DEBUG) */
-               stw             r6,FM_SIZE+12(r1)                               /* (TEST/DEBUG) */
-               mr              r7,r0                                                   /* (TEST/DEBUG) */
+LEXT(copypv)
+        mfsprg r10,2                                                   // get feature flags
+        mtcrf  0x02,r10                                                // we need to test pf64Bit
+        bt++   pf64Bitb,copypv_64                              // skip if 64-bit processor (only they take hint)
+        
+        b              EXT(hw_copypv_32)                               // carry on with 32-bit copypv
+
+// Push a 32-bit ABI-compliant stack frame and preserve all non-volatiles that we'll clobber.        
+copypv_64:
+               mfsprg  r9,1                                                    // get current thread
+               stwu    r1,-(FM_ALIGN((31-26+11)*4)+FM_SIZE)(r1)
+                                                                                               // allocate stack frame and link it
+               mflr    r0                                                              // get return address
+               mfcr    r10                                                             // get cr2 and cr3
+               lwz             r12,THREAD_RECOVER(r9)                  // get error callback
+               stw             r26,FM_ARG0+0x00(r1)                    // save non-volatile r26
+               stw             r27,FM_ARG0+0x04(r1)                    // save non-volatile r27
+               stw             r28,FM_ARG0+0x08(r1)                    // save non-volatile r28
+               stw             r29,FM_ARG0+0x0C(r1)                    // save non-volatile r29
+               stw             r30,FM_ARG0+0x10(r1)                    // save non-volatile r30
+               stw             r31,FM_ARG0+0x14(r1)                    // save non-volatile r31
+               stw             r12,FM_ARG0+0x20(r1)                    // save error callback
+               stw             r0,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_LR_SAVE)(r1)
+                                                                                               // save return address
+               stw             r10,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_CR_SAVE)(r1)
+                                                                                               // save non-volatile cr2 and cr3
+
+// Non-volatile register usage in this routine is:
+//     r26: saved msr image
+//     r27: current pmap_t / virtual source address
+//     r28: destination virtual address
+//     r29: source address
+//     r30: destination address
+//     r31: byte count to copy
+//     cr2/3: parameter 'which' bits
+
+               rlwinm  r8,r8,whichAlign,whichMask              // align and mask which bits
+               mr              r31,r7                                                  // copy size to somewhere non-volatile
+               mtcrf   0x20,r8                                                 // insert which bits into cr2 and cr3
+               mtcrf   0x10,r8                                                 // insert which bits into cr2 and cr3
+               rlwinm  r29,r3,0,1,0                                    // form source address high-order bits
+               rlwinm  r30,r5,0,1,0                                    // form destination address high-order bits
+               rlwimi  r29,r4,0,0,31                                   // form source address low-order bits
+               rlwimi  r30,r6,0,0,31                                   // form destination address low-order bits
+               crand   cr7_lt,pvPsnk,pvPsrc                    // are both operand addresses physical?
+               cntlzw  r0,r31                                                  // count leading zeroes in byte count
+               cror    cr7_eq,pvPsnk,pvPsrc                    // cr7_eq <- source or destination is physical
+               bf--    cr7_eq,copypv_einval                    // both operands may not be virtual
+               cmplwi  r0,4                                                    // byte count greater than or equal 256M (2**28)?
+               blt--   copypv_einval                                   // byte count too big, give EINVAL
+               cmplwi  r31,0                                                   // byte count zero?
+               beq--   copypv_zero                                             // early out
+               bt              cr7_lt,copypv_phys                              // both operand addresses are physical
+               mr              r28,r30                                                 // assume destination is virtual
+               bf              pvPsnk,copypv_dv                                // is destination virtual?
+               mr              r28,r29                                                 // no, so source must be virtual
+copypv_dv:
+               lis             r27,ha16(EXT(kernel_pmap))              // get kernel's pmap_t *, high-order
+               lwz             r27,lo16(EXT(kernel_pmap))(r27) // get kernel's pmap_t
+               bt              pvKmap,copypv_kern                              // virtual address in kernel map?
+               lwz             r3,ACT_VMMAP(r9)                                // get user's vm_map *
+               rldicl  r4,r28,32,32                                    // r4, r5 <- addr64_t virtual address 
+               rldicl  r5,r28,0,32
+               std             r29,FM_ARG0+0x30(r1)                    // preserve 64-bit r29 across 32-bit call
+               std             r30,FM_ARG0+0x38(r1)                    // preserve 64-bit r30 across 32-bit call
+               bl              EXT(MapUserMemoryWindow)                // map slice of user space into kernel space
+               ld              r29,FM_ARG0+0x30(r1)                    // restore 64-bit r29
+               ld              r30,FM_ARG0+0x38(r1)                    // restore 64-bit r30
+               rlwinm  r28,r3,0,1,0                                    // convert relocated addr64_t virtual address 
+               rlwimi  r28,r4,0,0,31                                   //  into a single 64-bit scalar
+copypv_kern:
+
+// Since we'll be accessing the virtual operand with data-relocation on, we won't need to 
+// update the referenced and changed bits manually after the copy. So, force the appropriate
+// flag bit on for the virtual operand.
+               crorc   pvNoModSnk,pvNoModSnk,pvPsnk    // for virtual dest, let hardware do ref/chg bits
+               crorc   pvNoRefSrc,pvNoRefSrc,pvPsrc    // for virtual source, let hardware do ref bit
                
                
-               bl              EXT(tracecopystr)                               /* (TEST/DEBUG) */
+// We'll be finding a mapping and looking at it, so we need to disable interrupts.
+               lis             r0,hi16(MASK(MSR_VEC))                  // get vector mask
+               ori             r0,r0,lo16(MASK(MSR_FP))                // insert fp mask
+               mfmsr   r26                                                             // save current msr
+               andc    r26,r26,r0                                              // turn off VEC and FP in saved copy
+               ori             r0,r0,lo16(MASK(MSR_EE))                // add EE to our mask
+               andc    r0,r26,r0                                               // disable EE in our new msr image
+               mtmsrd  r0                                                              // introduce new msr image
+
+// We're now holding the virtual operand's pmap_t in r27 and its virtual address in r28. We now
+// try to find a mapping corresponding to this address in order to determine whether the address
+// is cacheable. If we don't find a mapping, we can safely assume that the operand is cacheable
+// (a non-cacheable operand must be a block mapping, which will always exist); otherwise, we
+// examine the mapping's caching-inhibited bit.
+               mr              r3,r27                                                  // r3 <- pmap_t pmap
+               rldicl  r4,r28,32,32                                    // r4, r5 <- addr64_t va
+               rldicl  r5,r28,0,32
+               la              r6,FM_ARG0+0x18(r1)                             // r6 <- addr64_t *nextva
+               li              r7,1                                                    // r7 <- int full, search nested mappings
+               std             r26,FM_ARG0+0x28(r1)                    // preserve 64-bit r26 across 32-bit calls
+               std             r28,FM_ARG0+0x30(r1)                    // preserve 64-bit r28 across 32-bit calls
+               std             r29,FM_ARG0+0x38(r1)                    // preserve 64-bit r29 across 32-bit calls
+               std             r30,FM_ARG0+0x40(r1)                    // preserve 64-bit r30 across 32-bit calls
+               bl              EXT(mapping_find)                               // find mapping for virtual operand
+               mr.             r3,r3                                                   // did we find it?
+               beq             copypv_nomapping                                // nope, so we'll assume it's cacheable
+               lwz             r4,mpVAddr+4(r3)                                // get low half of virtual addr for hw flags
+               rlwinm. r4,r4,0,mpIb-32,mpIb-32                 // caching-inhibited bit set?
+               crnot   pvNoCache,cr0_eq                                // if it is, use bcopy_nc
+               bl              EXT(mapping_drop_busy)                  // drop busy on the mapping
+copypv_nomapping:
+               ld              r26,FM_ARG0+0x28(r1)                    // restore 64-bit r26 (saved msr image)
+               ld              r28,FM_ARG0+0x30(r1)                    // restore 64-bit r28
+               ld              r29,FM_ARG0+0x38(r1)                    // restore 64-bit r29
+               ld              r30,FM_ARG0+0x40(r1)                    // restore 64-bit r30
+               mtmsrd  r26                                                             // restore msr to its saved image (EE as before; VEC and FP were cleared in the saved copy)
+
+// Set both the source and destination virtual addresses to the virtual operand's address --
+// we'll overlay one of them with the physical operand's address.
+               mr              r27,r28                                                 // make virtual operand BOTH source AND destination
+
+// Now we're ready to relocate the physical operand address(es) into the physical memory window.
+// Recall that we've mapped physical memory (including the I/O hole) into the kernel's address
+// space somewhere at or over the 2**32 line. If one or both of the operands are in the I/O hole,
+// we'll set the pvNoCache flag, forcing use of non-caching bcopy_nc() to do the copy.
+copypv_phys:
+               ld              r6,lgPMWvaddr(0)                                // get physical memory window virtual address
+               bf              pvPsnk,copypv_dstvirt                   // is destination address virtual?
+               cntlzd  r4,r30                                                  // count leading zeros in 64-bit destination address
+               cmplwi  r4,32                                                   // exactly 32 => highest set bit is 2**31, so address is in 2**31 to 2**32-1 (treated as the I/O hole)
+               cror    pvNoCache,cr0_eq,pvNoCache              // use bcopy_nc for I/O hole locations
+               add             r28,r30,r6                                              // relocate physical destination into physical window
+copypv_dstvirt:
+               bf              pvPsrc,copypv_srcvirt                   // is source address virtual?
+               cntlzd  r4,r29                                                  // count leading zeros in 64-bit source address
+               cmplwi  r4,32                                                   // exactly 32 => highest set bit is 2**31, so address is in 2**31 to 2**32-1 (treated as the I/O hole)
+               cror    pvNoCache,cr0_eq,pvNoCache              // use bcopy_nc for I/O hole locations
+               add             r27,r29,r6                                              // relocate physical source into physical window
+copypv_srcvirt:
+
+// Once the copy is under way (bcopy or bcopy_nc), we will want to get control if anything
+// funny happens during the copy. So, we set a pointer to our error handler in the per-thread
+// control block.
+               mfsprg  r8,1                                                    // get current threads stuff
+               lis             r3,hi16(copypv_error)                   // get our error callback's address, high
+               ori             r3,r3,lo16(copypv_error)                // get our error callback's address, low
+               stw             r3,THREAD_RECOVER(r8)                   // set our error callback
                
                
-               lwz             r3,FM_SIZE+0(r1)                                /* (TEST/DEBUG) */
-               lwz             r4,FM_SIZE+4(r1)                                /* (TEST/DEBUG) */
-               lwz             r5,FM_SIZE+8(r1)                                /* (TEST/DEBUG) */
-               stw             r6,FM_SIZE+12(r1)                               /* (TEST/DEBUG) */
+// Since our physical operand(s) are relocated at or above the 2**32 line, we must enter
+// 64-bit mode.
+// This is done by setting the SF bit in a copy of the MSR and writing it back with mtmsrd.
+               li              r0,1                                                    // get a handy one bit
+               mfmsr   r3                                                              // get current msr
+               rldimi  r3,r0,63,MSR_SF_BIT                             // rotate the one bit to the top and insert it: set SF in our msr copy
+               mtmsrd  r3                                                              // enter 64-bit mode
+
+// If requested, flush data cache
+// Note that we don't flush, the code is being saved "just in case".
+// Everything down to the matching #endif is compiled out. As written it would call
+// flush_dcache() on the source and/or destination before the copy, passing the 64-bit
+// address split across r3 (high word) and r4 (low word).
+#if 0
+               bf              pvFsrc,copypv_nfs                               // do we flush the source?
+               rldicl  r3,r27,32,32                                    // r3, r4 <- addr64_t source virtual address
+               rldicl  r4,r27,0,32                                             // (r3 = upper 32 bits of r27, r4 = lower 32 bits)
+               mr              r5,r31                                                  // r5 <- count (in bytes)
+               li              r6,0                                                    // r6 <- boolean phys (false, not physical)
+               bl              EXT(flush_dcache)                               // flush the source operand
+copypv_nfs:
+               bf              pvFsnk,copypv_nfdx                              // do we flush the destination?
+               rldicl  r3,r28,32,32                                    // r3, r4 <- addr64_t destination virtual address
+               rldicl  r4,r28,0,32                                             // (r3 = upper 32 bits of r28, r4 = lower 32 bits)
+               mr              r5,r31                                                  // r5 <- count (in bytes)
+               li              r6,0                                                    // r6 <- boolean phys (false, not physical)
+               bl              EXT(flush_dcache)                               // flush the destination operand
+copypv_nfdx:
 #endif
 #endif
-                               
-               cmpli   cr0,r5,0
-               ble-    cr0,.L_copyinout_trivial
 
 
-/* we know we have a valid copyin to do now */
-/* Set up thread_recover in case we hit an illegal address */
-               
-               li              r0,0                                                    
-               mfsprg  r8,1                                                    /* Get the current act */
-               lwz             r10,ACT_THREAD(r8)
-               stw             r0,0(r6)                                                /* Clear result length */
-               lis             r11,HIGH_ADDR(.L_copyinout_error)
-               lwz             r8,ACT_VMMAP(r8)                                ; Get the map for this activation
-               rlwinm  r12,r3,6,26,29                                  ; Get index to the segment slot
-               lwz             r8,VMMAP_PMAP(r8)
-               ori             r11,r11,LOW_ADDR(.L_copyinout_error)
-               add             r8,r8,r12                                               ; Start indexing to the segment value
-               stw             r11,THREAD_RECOVER(r10)
-               rlwinm  r3,r3,0,4,31
-               lwz             r7,PMAP_SEGS(r8)                                ; Get the source SR value
-               oris    r3,r3,(SR_COPYIN_NUM << (28-16))        /* Set the copyin segment as the source */
-
-/* Copy byte by byte for now - TODO NMGS speed this up with
- * some clever (but fairly standard) logic for word copies.
- * We don't use a copyinstr_multiple since copyinstr is called
- * with INT_MAX in the linux server. Eugh.
- */
+// Call bcopy or bcopy_nc to perform the copy.
+// Arguments go in r3 (source), r4 (destination), r5 (length). pvNoCache selects the
+// non-caching variant; it is set above when a physical operand lies in the I/O hole.
+// If the copy faults, control is expected to arrive at copypv_error via the recovery
+// pointer installed earlier rather than returning here.
+               mr              r3,r27                                                  // r3 <- source virtual address
+               mr              r4,r28                                                  // r4 <- destination virtual address
+               mr              r5,r31                                                  // r5 <- bytes to copy
+               bt              pvNoCache,copypv_nc                             // take non-caching route
+               bl              EXT(bcopy)                                              // call bcopy to do the copying
+               b               copypv_copydone                                 // skip over the non-caching call
+copypv_nc:
+               bl              EXT(bcopy_nc)                                   // call bcopy_nc to do the copying
+copypv_copydone:
 
 
-               li              r9,0                                                    /* Clear byte counter */
+// If requested, flush data cache
+// Note that we don't flush, the code is being saved "just in case".
+// Everything between #if 0 and #endif is compiled out. As written it would call
+// flush_dcache() on the source and/or destination after the copy, passing the 64-bit
+// address split across r3 (high word) and r4 (low word).
+#if 0
+               bf              pvFsrc,copypv_nfsx                              // do we flush the source?
+               rldicl  r3,r27,32,32                                    // r3, r4 <- addr64_t source virtual address
+               rldicl  r4,r27,0,32                                             // (r3 = upper 32 bits of r27, r4 = lower 32 bits)
+               mr              r5,r31                                                  // r5 <- count (in bytes)
+               li              r6,0                                                    // r6 <- boolean phys (false, not physical)
+               bl              EXT(flush_dcache)                               // flush the source operand
+copypv_nfsx:
+               bf              pvFsnk,copypv_nfd                               // do we flush the destination?
+               rldicl  r3,r28,32,32                                    // r3, r4 <- addr64_t destination virtual address
+               rldicl  r4,r28,0,32                                             // (r3 = upper 32 bits of r28, r4 = lower 32 bits)
+               mr              r5,r31                                                  // r5 <- count (in bytes)
+               li              r6,0                                                    // r6 <- boolean phys (false, not physical)
+               bl              EXT(flush_dcache)                               // flush the destination operand
+copypv_nfd:
+#endif
 
 
-/* If the destination is NULL, don't do writes,
- * just count bytes. We set CR7 outside the loop to save time
- */
-               cmpwi   cr7,r4,0                                                /* Is the destination null? */
-               
-nxtseg:        mtsr    SR_COPYIN,r7                                    /* Set the source SR */
-               isync
+// Leave 64-bit mode.
+// The copy is finished, so clear the SF bit that was set before calling bcopy/bcopy_nc.
+               mfmsr   r3                                                              // get current msr
+               rldicl  r3,r3,0,MSR_SF_BIT+1                    // keep only the bits below SF: clear SF bit in our copy
+               mtmsrd  r3                                                              // leave 64-bit mode
 
 
-.L_copyinstr_loop:
-               lbz             r0,0(r3)                                                /* Get the source */
-               addic.  r5,r5,-1                                                /* Have we gone far enough? */
-               addi    r3,r3,1                                                 /* Bump source pointer */
+// If requested, set ref/chg on source/dest physical operand(s). It is possible that copy is
+// from/to a RAM disk situated outside of mapped physical RAM, so we check each page by calling
+// mapping_phys_lookup() before we try to set its ref/chg bits; otherwise, we might panic.
+// Note that this code is page-size sensitive, so it should probably be a part of our low-level
+// code in hw_vm.s.
+// Mark every 4K physical page written by the copy as changed.
+//   In:   r30 = physical destination address, r31 = byte count
+//   Uses: r26 = pages still to mark, r27 = current physical page number,
+//         r3/r4 = arguments to the C helpers
+// The number of pages touched is ceil(((r30 & 0xFFF) + r31) / 4096). The loop body always
+// runs at least once, so a zero byte count still marks the page holding r30.
+// Each page is first checked with mapping_phys_lookup(); the loop stops at the first page
+// that is not found. r27 is advanced every iteration so that consecutive pages, not the
+// first page repeatedly, get marked.
+               bt              pvNoModSnk,copypv_nomod                 // skip destination update if not requested
+               std             r29,FM_ARG0+0x30(r1)                    // preserve 64-bit r29 across 32-bit calls
+               rlwinm  r3,r30,0,20,31                                  // r3 <- offset of destination within its 4K page
+               add             r3,r3,r31                                               // r3 <- offset just past the last byte written
+               addi    r3,r3,4095                                              // round up to a whole number of pages
+               srwi    r26,r3,12                                               // r26 <- number of 4K pages touched
+               srdi    r27,r30,12                                              // r27 <- first destination page number
+copypv_modloop:
+               mr              r3,r27                                                  // r3 <- destination page number
+               la              r4,FM_ARG0+0x18(r1)                             // r4 <- unsigned int *pindex
+               bl              EXT(mapping_phys_lookup)                // see if page is really there
+               mr.             r3,r3                                                   // is it?
+               beq--   copypv_modend                                   // nope, break out of modify loop
+               mr              r3,r27                                                  // r3 <- destination page number
+               bl              EXT(mapping_set_mod)                    // set page changed status
+               addi    r27,r27,1                                               // advance to the next physical page
+               subi    r26,r26,1                                               // decrement page count
+               cmpwi   r26,0                                                   // done yet?
+               bgt             copypv_modloop                                  // nope, iterate
+copypv_modend:
+               ld              r29,FM_ARG0+0x30(r1)                    // restore 64-bit r29
+copypv_nomod:
+// Mark every 4K physical page read by the copy as referenced.
+//   In:   r29 = physical source address, r31 = byte count
+//   Uses: r26 = pages still to mark, r27 = current physical page number,
+//         r3/r4 = arguments to the C helpers
+// The number of pages touched is ceil(((r29 & 0xFFF) + r31) / 4096). The loop body always
+// runs at least once, so a zero byte count still marks the page holding r29.
+// Each page is first checked with mapping_phys_lookup(); the loop stops at the first page
+// that is not found. r27 is advanced every iteration so that consecutive pages, not the
+// first page repeatedly, get marked.
+               bt              pvNoRefSrc,copypv_done                  // skip source update if not requested
+copypv_debugref:
+               rlwinm  r3,r29,0,20,31                                  // r3 <- offset of source within its 4K page
+               add             r3,r3,r31                                               // r3 <- offset just past the last byte read
+               addi    r3,r3,4095                                              // round up to a whole number of pages
+               srwi    r26,r3,12                                               // r26 <- number of 4K pages touched
+               srdi    r27,r29,12                                              // r27 <- first source page number
+copypv_refloop:
+               mr              r3,r27                                                  // r3 <- source page number
+               la              r4,FM_ARG0+0x18(r1)                             // r4 <- unsigned int *pindex
+               bl              EXT(mapping_phys_lookup)                // see if page is really there
+               mr.             r3,r3                                                   // is it?
+               beq--   copypv_done                                             // nope, break out of reference loop
+               mr              r3,r27                                                  // r3 <- source page number
+               bl              EXT(mapping_set_ref)                    // set page referenced status
+               addi    r27,r27,1                                               // advance to the next physical page
+               subi    r26,r26,1                                               // decrement page count
+               cmpwi   r26,0                                                   // done yet?
+               bgt             copypv_refloop                                  // nope, iterate
                
                
-               cmpwi   cr1,r0,0                                                /* Did we hit a null? */
+// Return, indicating success.
+// Two labels share this exit: copypv_done is the target of the ref/chg update code above;
+// copypv_zero is branched to from outside this section (presumably the zero-length case).
+// In the new code the next instructions are those at copypv_return.
+copypv_done:
+copypv_zero:
+               li              r3,0                                                    // return value 0: our efforts were crowned with success
 
 
-               beq             cr7,.L_copyinstr_no_store               /* If we are just counting, skip the store... */
-       
-               stb             r0,0(r4)                                                /* Move to sink */
-               addi    r4,r4,1                                                 /* Advance sink pointer */
+// Pop frame, restore caller's non-volatiles, and rewrite the thread's recovery routine pointer.
+// On entry r3 already holds the return code (0, EINVAL or EFAULT) and is left untouched.
+// The value written to THREAD_RECOVER is whatever was stashed in the frame at FM_ARG0+0x20;
+// that store happens before this section (presumably the recovery pointer in effect at entry).
+copypv_return:
+               mfsprg  r9,1                                                    // r9 <- current thread's control block (from sprg1)
+               lwz             r0,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_LR_SAVE)(r1)
+                                                                                               // get return address
+               lwz             r4,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_CR_SAVE)(r1)
+                                                                                               // get non-volatile cr2 and cr3
+               lwz             r26,FM_ARG0+0x00(r1)                    // restore non-volatile r26
+               lwz             r27,FM_ARG0+0x04(r1)                    // restore non-volatile r27
+               mtlr    r0                                                              // restore return address
+               lwz             r28,FM_ARG0+0x08(r1)                    // restore non-volatile r28
+               mtcrf   0x20,r4                                                 // restore non-volatile cr2
+               mtcrf   0x10,r4                                                 // restore non-volatile cr3
+               lwz             r11,FM_ARG0+0x20(r1)                    // r11 <- recovery pointer value saved in our frame
+               lwz             r29,FM_ARG0+0x0C(r1)                    // restore non-volatile r29
+               lwz             r30,FM_ARG0+0x10(r1)                    // restore non-volatile r30
+               lwz             r31,FM_ARG0+0x14(r1)                    // restore non-volatile r31
+               stw             r11,THREAD_RECOVER(r9)                  // put the saved recovery pointer back, replacing copypv_error
+               lwz             r1,0(r1)                                                // release stack frame (follow the back chain)
+                                                                                               
+               blr                                                                             // y'all come back now
 
 
-.L_copyinstr_no_store:
+// Invalid argument handler.
+// Branched to from outside this section; returns EINVAL through the common exit path.
+copypv_einval:
+               li              r3,EINVAL                                               // return value: invalid argument
+               b               copypv_return                                   // pop frame and return
 
 
-               addi    r9,r9,1                                                 /* Count the character */
-               beq-    cr1,.L_copyinstr_done                   /* We're done if we did a null... */
-               beq-    cr0,L_copyinstr_toobig                  /* Also if we maxed the count... */
-       
-/* Check to see if the copyin pointer has moved out of the
- * copyin segment, if it has we must remap.
- */
-
-               rlwinm. r0,r3,0,4,31                                    /* Did we wrap around to 0? */
-               bne+    cr0,.L_copyinstr_loop                   /* Nope... */
-
-               lwz             r7,PMAP_SEGS+4(r8)                              ; Get the next source SR value
-               addi    r8,r8,4                                                 ; Point to the next segment
-               oris    r3,r0,(SR_COPYIN_NUM << (28-16))        /* Reset the segment number */
-               b               nxtseg                                                  /* Keep going... */
-       
-L_copyinstr_toobig:
-               li              r3,ENAMETOOLONG
-               b               L_copyinstr_return
-.L_copyinstr_done:
-               li              r3,0                                                    /* Normal return */
-L_copyinstr_return:
-               li              r4,0                                                    /* to clear thread_recover */
-               stw             r9,0(r6)                                                /* Set how many bytes we did */
-               stw             r4,THREAD_RECOVER(r10)                  /* Clear recovery exit */
-
-               addi    r1,     r1,     FM_SIZE+16
-               lwz             r0,     FM_LR_SAVE(r1)
-               mtlr    r0
-               blr
+// Error encountered during bcopy or bcopy_nc.         
+// This is the address stored in THREAD_RECOVER before the copy. The copy runs with the
+// MSR SF bit set, so 64-bit mode is switched off here before taking the common exit
+// path with EFAULT as the return value.
+copypv_error:
+               mfmsr   r3                                                              // get current msr
+               rldicl  r3,r3,0,MSR_SF_BIT+1                    // clear SF bit in our copy
+               mtmsrd  r3                                                              // leave 64-bit mode
+               li              r3,EFAULT                                               // return value EFAULT: it was all his fault
+               b               copypv_return                                   // pop frame and return