]> git.saurik.com Git - apple/xnu.git/blobdiff - osfmk/ppc/movc.s
xnu-517.12.7.tar.gz
[apple/xnu.git] / osfmk / ppc / movc.s
index 2a17ac245590c1eae2f961ae602f8fff12d9009c..6ef231f2555f4edd363fc6873f6e70c97b157c13 100644 (file)
@@ -1,24 +1,21 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  * 
- * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License").  You may not use this file except in compliance with the
+ * License.  Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
  * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this
- * file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
  * 
  * @APPLE_LICENSE_HEADER_END@
  */
 #include <assym.s>
 #include <sys/errno.h>
 
+#define INSTRUMENT 0
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 /*
  * void pmap_zero_page(vm_offset_t pa)
  *
- * zero a page of physical memory.
+ * Zero a page of physical memory.  This routine runs in 32 or 64-bit mode,
+ * and handles 32 and 128-byte cache lines.
  */
 
-#if DEBUG
-       /* C debug stub in pmap.c calls this */
-ENTRY(pmap_zero_page_assembler, TAG_NO_FRAME_USED)
-#else
-ENTRY(pmap_zero_page, TAG_NO_FRAME_USED)
-#endif /* DEBUG */
-
-               mfmsr   r6                                                              /* Get the MSR */
-               rlwinm  r6,r6,0,MSR_FP_BIT+1,MSR_FP_BIT-1       ; Force floating point off
-               rlwinm  r6,r6,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1     ; Force vectors off
-               rlwinm  r7,     r6,     0,      MSR_DR_BIT+1,   MSR_DR_BIT-1    /* Turn off DR */
-               rlwinm  r7,r7,0,MSR_EE_BIT+1,MSR_EE_BIT-1       ; Disable interruptions
-               li              r4,PPC_PGBYTES-CACHE_LINE_SIZE  /* Point to the end of the page */
-               mtmsr   r7                                                              /* Set MSR to DR off */
-               isync                                                                   /* Ensure data translations are off */
-
-
-.L_phys_zero_loop:     
-               subic.  r5,r4,CACHE_LINE_SIZE                   /* Point to the next one */
-               dcbz    r4, r3                                                  /* Clear the whole thing to 0s */
-               subi    r4,r5,CACHE_LINE_SIZE                   /* Point to the next one */
-               dcbz    r5, r3                                                  /* Clear the next to zeros */
-               bgt+    .L_phys_zero_loop                               /* Keep going until we do the page... */
-
-               sync                                                                    /* Make sure they're all done */
-               li              r4,PPC_PGBYTES-CACHE_LINE_SIZE  /* Point to the end of the page */
-
-.L_inst_inval_loop:    
-               subic.  r5,r4,CACHE_LINE_SIZE                   /* Point to the next one */
-               icbi    r4, r3                                                  /* Clear the whole thing to 0s */
-               subi    r4,r5,CACHE_LINE_SIZE                   /* Point to the next one */
-               icbi    r5, r3                                                  /* Clear the next to zeros */
-               bgt+    .L_inst_inval_loop                              /* Keep going until we do the page... */
-
-               sync                                                                    /* Make sure they're all done */
-
-               mtmsr   r6              /* Restore original translations */
-               isync                   /* Ensure data translations are on */
 
-               blr
+               .align  5
+               .globl  EXT(pmap_zero_page)
+
+LEXT(pmap_zero_page)
 
+        mflr   r12                                                             // save return address
+        bl             EXT(ml_set_physical_disabled)   // turn DR and EE off, SF on, get features in r10
+        mtlr   r12                                                             // restore return address
+        andi.  r9,r10,pf32Byte+pf128Byte               // r9 <- cache line size
+
+        subfic r4,r9,PPC_PGBYTES                               // r4 <- starting offset in page
+               
+               bt++    pf64Bitb,page0S4                                // Go do the big guys...
+               
+               slwi    r3,r3,12                                                // get page address from page num
+               b               page_zero_1                                             // Jump to line aligned loop...
+
+        .align 5
+
+               nop
+               nop
+               nop
+               nop
+               nop
+               nop
+               nop
+               
+page0S4:
+               sldi    r3,r3,12                                                // get page address from page num
+
+page_zero_1:                                                                   // loop zeroing cache lines
+        sub.   r5,r4,r9                                                // more to go?
+        dcbz128        r3,r4                                                   // zero either 32 or 128 bytes
+        sub            r4,r5,r9                                                // generate next offset
+        dcbz128        r3,r5
+        bne--  page_zero_1
+        
+        b              EXT(ml_restore)                                 // restore MSR and do the isync
+
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 /* void
  * phys_copy(src, dst, bytecount)
- *      vm_offset_t     src;
- *      vm_offset_t     dst;
+ *      addr64_t           src;
+ *      addr64_t           dst;
  *      int             bytecount
  *
  * This routine will copy bytecount bytes from physical address src to physical
- * address dst. 
+ * address dst.  It runs in 64-bit mode if necessary, but does not handle
+ * overlap or make any attempt to be optimal.  Length must be a signed word.
+ * Not performance critical.
  */
 
-ENTRY(phys_copy, TAG_NO_FRAME_USED)
-
-       /* Switch off data translations */
-       mfmsr   r6
-       rlwinm  r6,r6,0,MSR_FP_BIT+1,MSR_FP_BIT-1       ; Force floating point off
-       rlwinm  r6,r6,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1     ; Force vectors off
-       rlwinm  r7,     r6,     0,      MSR_DR_BIT+1,   MSR_DR_BIT-1
-       rlwinm  r7,     r7,     0,      MSR_EE_BIT+1,   MSR_EE_BIT-1
-       mtmsr   r7
-       isync                   /* Ensure data translations are off */
-
-       subi    r3,     r3,     4
-       subi    r4,     r4,     4
-
-       cmpwi   r5,     3
-       ble-    .L_phys_copy_bytes
-.L_phys_copy_loop:
-       lwz     r0,     4(r3)
-       addi    r3,     r3,     4
-       subi    r5,     r5,     4
-       stw     r0,     4(r4)
-       addi    r4,     r4,     4
-       cmpwi   r5,     3
-       bgt+    .L_phys_copy_loop
-
-       /* If no leftover bytes, we're done now */
-       cmpwi   r5,     0
-       beq+    .L_phys_copy_done
-       
-.L_phys_copy_bytes:
-       addi    r3,     r3,     3
-       addi    r4,     r4,     3
-.L_phys_copy_byte_loop:        
-       lbz     r0,     1(r3)
-       addi    r3,     r3,     1
-       subi    r5,     r5,     1
-       stb     r0,     1(r4)
-       addi    r4,     r4,     1
-       cmpwi   r5,     0
-       bne+    .L_phys_copy_byte_loop
-
-.L_phys_copy_done:
-       mtmsr   r6              /* Restore original translations */
-       isync                   /* Ensure data translations are off */
-
-       blr
 
+               .align  5
+               .globl  EXT(phys_copy)
+
+LEXT(phys_copy)
+
+               rlwinm  r3,r3,0,1,0                                     ; Duplicate high half of long long paddr into top of reg
+        mflr   r12                                                             // get return address
+               rlwimi  r3,r4,0,0,31                            ; Combine bottom of long long to full 64-bits
+               rlwinm  r4,r5,0,1,0                                     ; Duplicate high half of long long paddr into top of reg
+        bl             EXT(ml_set_physical_disabled)   // turn DR and EE off, SF on, get features in r10
+               rlwimi  r4,r6,0,0,31                            ; Combine bottom of long long to full 64-bits
+        mtlr   r12                                                             // restore return address
+        subic. r5,r7,4                                                 // a word to copy?
+        b              phys_copy_2
+        
+               .align  5
+         
+phys_copy_1:                                                                   // loop copying words
+        subic. r5,r5,4                                                 // more to go?
+        lwz            r0,0(r3)
+        addi   r3,r3,4
+        stw            r0,0(r4)
+        addi   r4,r4,4
+phys_copy_2:
+        bge            phys_copy_1
+        addic. r5,r5,4                                                 // restore count
+        ble            phys_copy_4                                             // no more
+        
+                                                                                       // Loop is aligned here
+        
+phys_copy_3:                                                                   // loop copying bytes
+        subic. r5,r5,1                                                 // more to go?
+        lbz            r0,0(r3)
+        addi   r3,r3,1
+        stb            r0,0(r4)
+        addi   r4,r4,1
+        bgt            phys_copy_3
+phys_copy_4:        
+        b              EXT(ml_restore)                                 // restore MSR and do the isync
+
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 /* void
  * pmap_copy_page(src, dst)
- *      vm_offset_t     src;
- *      vm_offset_t     dst;
+ *      ppnum_t     src;
+ *      ppnum_t     dst;
  *
  * This routine will copy the physical page src to physical page dst
  * 
- * This routine assumes that the src and dst are page aligned and that the
- * destination is cached.
- *
- * We also must assume that noone will be executing within the destination
- * page.  We also assume that this will be used for paging
+ * This routine assumes that the src and dst are page numbers and that the
+ * destination is cached.  It runs on 32 and 64 bit processors, with and
+ * without altivec, and with 32 and 128 byte cache lines.
+ * We also must assume that no-one will be executing within the destination
+ * page, and that this will be used for paging.  Because this
+ * is a common routine, we have tuned loops for each processor class.
  *
  */
+#define        kSFSize (FM_SIZE+160)
 
-#if DEBUG
-       /* if debug, we have a little piece of C around this
-        * in pmap.c that gives some trace ability
-        */
-ENTRY(pmap_copy_page_assembler, TAG_NO_FRAME_USED)
-#else
 ENTRY(pmap_copy_page, TAG_NO_FRAME_USED)
-#endif /* DEBUG */
-
-#if 0
-                       mfpvr   r9                                                      ; Get the PVR
-                       rlwinm  r9,r9,16,16,31                          ; Isolate the PPC processor
-                       cmplwi  r9,PROCESSOR_VERSION_Max        ; Do we have Altivec?
-                       beq+    wegotaltivec                            ; Yeah...
-#endif
-               
-                       mfmsr   r9                                                      ; Get the MSR
-                       rlwinm  r9,r9,0,MSR_FP_BIT+1,MSR_FP_BIT-1       ; Force floating point off
-                       rlwinm  r9,r9,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1     ; Force vectors off
-                       stwu    r1,-(FM_SIZE+32)(r1)            ; Make a frame for us
-                       rlwinm  r7,r9,0,MSR_EE_BIT+1,MSR_EE_BIT-1       ; Disable interruptions
-                       ori             r7,r7,lo16(MASK(MSR_FP))        ; Turn on the FPU
-                       mtmsr   r7                                                      ; Disable rupts and enable FPU
-                       isync
-               
-                       stfd    f0,FM_SIZE+0(r1)                        ; Save an FP register
-                       rlwinm  r7,r7,0,MSR_DR_BIT+1,MSR_DR_BIT-1       ; Clear the DDAT bit
-                       stfd    f1,FM_SIZE+8(r1)                        ; Save an FP register
-                       addi    r6,r3,PPC_PGBYTES                       ; Point to the start of the next page
-                       stfd    f2,FM_SIZE+16(r1)                       ; Save an FP register
-                       mr              r8,r4                                           ; Save the destination
-                       stfd    f3,FM_SIZE+24(r1)                       ; Save an FP register
-               
-                       mtmsr   r7                                                      ; Set the new MSR
-                       isync                                                           ; Ensure data translations are off
-
-                       dcbt    br0, r3                                         /* Start in first input line */
-                       li              r5,     CACHE_LINE_SIZE                 /* Get the line size */
-
-.L_pmap_copy_page_loop:
-                       dcbz    0, r4                                           /* Allocate a line for the output */
-                       lfd             f0, 0(r3)                                       /* Get first 8 */
-                       lfd             f1, 8(r3)                                       /* Get second 8 */
-                       lfd             f2, 16(r3)                                      /* Get third 8 */
-                       stfd    f0, 0(r4)                                       /* Put first 8 */
-                       dcbt    r5, r3                                          /* Start next line coming in */
-                       lfd             f3, 24(r3)                                      /* Get fourth 8 */
-                       stfd    f1,     8(r4)                                   /* Put second 8 */
-                       addi    r3,r3,CACHE_LINE_SIZE           /* Point to the next line in */
-                       stfd    f2,     16(r4)                                  /* Put third 8 */
-                       cmplw   cr0,r3,r6                                       /* See if we're finished yet */
-                       stfd    f3,     24(r4)                                  /* Put fourth 8 */
-                       dcbst   br0,r4                                          /* Force it out */
-                       addi    r4,r4,CACHE_LINE_SIZE           /* Point to the next line out */
-                       blt+    .L_pmap_copy_page_loop          /* Copy the whole page */
-                       
-                       sync                                                            /* Make sure they're all done */
-                       li              r4,PPC_PGBYTES-CACHE_LINE_SIZE  /* Point to the end of the page */
-
-invalinst:     
-                       subic.  r5,r4,CACHE_LINE_SIZE           /* Point to the next one */
-                       icbi    r4, r8                                          /* Trash the i-cache */
-                       subi    r4,r5,CACHE_LINE_SIZE           /* Point to the next one */
-                       icbi    r5, r8                                          /* Trash the i-cache */
-                       bgt+    invalinst                                       /* Keep going until we do the page... */
-       
-                       rlwimi  r7,r9,0,MSR_DR_BIT,MSR_DR_BIT   ; Set DDAT if on
-                       sync                                                            ; Make sure all invalidates done
-                       
-                       mtmsr   r7                                                      ; Set DDAT correctly
-                       isync           
-                       
-                       lfd             f0,FM_SIZE+0(r1)                        ; Restore an FP register
-                       lfd             f1,FM_SIZE+8(r1)                        ; Restore an FP register
-                       lfd             f2,FM_SIZE+16(r1)                       ; Restore an FP register
-                       lfd             f3,FM_SIZE+24(r1)                       ; Restore an FP register
-                       
-                       lwz             r1,0(r1)                                        ; Pop up the stack
-       
-                       mtmsr   r9                                                      ; Turn off FPU now and maybe rupts back on
-                       isync                                                           
-                       blr
-               
-#if 0
-;
-;                      This is not very optimal.  We just do it here for a test of 
-;                      Altivec in the kernel.
-;
-wegotaltivec:
-                       mfmsr   r9                                                      ; Get the MSR
-                       lis             r8,hi16(0xC0000000)                     ; Make sure we keep the first 2 vector registers
-                       rlwinm  r7,r9,0,MSR_EE_BIT+1,MSR_EE_BIT-1       ; Disable interruptions
-                       lis             r6,lo16(2*256+128)                      ; Specify 128 blocks of 2 vectors each
-                       rlwinm  r7,r7,0,MSR_DR_BIT+1,MSR_DR_BIT-1       ; Clear the DDAT bit
-                       ori             r6,r6,32                                        ; Set a 32-byte stride
-                       mtsprg  256,r8                                          ; Set VRSave
-                       mtmsr   r7                                                      ; Disable rupts and turn xlate off
-                       isync
-       
-                       addi    r11,r3,4096                                     ; Point to the next page
-                       li              r10,16                                          ; Get vector size
-
-avmovepg:      lvxl    v0,br0,r3                                       ; Get first half of line
-                       dcba    br0,r4                                          ; Allocate output
-                       lvxl    v1,r10,r3                                       ; Get second half of line
-                       stvxl   v0,br0,r4                                       ; Save first half of line
-                       addi    r3,r3,32                                        ; Point to the next line
-                       icbi    br0,r4                                          ; Make the icache go away also
-                       stvxl   v1,r10,r4                                       ; Save second half of line
-                       cmplw   r3,r11                                          ; Have we reached the next page?
-                       dcbst   br0,r4                                          ; Make sure the line is on its way out
-                       addi    r4,r4,32                                        ; Point to the next line
-                       blt+    avmovepg                                        ; Move the next line...
-                       
-                       li              r8,0                                            ; Clear this
-                       sync                                                            ; Make sure all the memory stuff is done
-                       mtsprg  256,r8                                          ; Show we are not using VRs any more
-                       mtmsr   r9                                                      ; Translation and interruptions back on
-                       isync
-                       blr
-#endif
-               
-
-       
-
-/*
- * int
- * copyin(src, dst, count)
- *     vm_offset_t     src;
- *     vm_offset_t     dst;
- *     int             count;
- *
- */
-
-ENTRY2(copyin, copyinmsg, TAG_NO_FRAME_USED)
-
-/* Preamble allowing us to call a sub-function */
-               mflr    r0
-               stw             r0,FM_LR_SAVE(r1)
-               stwu    r1,-(FM_SIZE+16)(r1)
-               
-               cmpli   cr0,r5,0
-               ble-    cr0,.L_copyinout_trivial
 
-/* we know we have a valid copyin to do now */
-/* Set up thread_recover in case we hit an illegal address */
+               lis             r2,hi16(MASK(MSR_VEC))                  ; Get the vector flag
+        mflr   r0                                                              // get return
+               ori             r2,r2,lo16(MASK(MSR_FP))                ; Add the FP flag
+               stw             r0,8(r1)                                                // save
+        stwu   r1,-kSFSize(r1)                                 // set up a stack frame for VRs or FPRs
+        mfmsr  r11                                                             // save MSR at entry
+        mfsprg r10,2                                                   // get feature flags
+        andc   r11,r11,r2                                              // Clear out vec and fp
+        ori            r2,r2,lo16(MASK(MSR_EE))                // Get EE on also
+        andc   r2,r11,r2                                               // Clear out EE as well
+        mtcrf  0x02,r10                                                // we need to test pf64Bit
+        ori            r2,r2,MASK(MSR_FP)                              // must enable FP for G3...
+        mtcrf  0x80,r10                                                // we need to test pfAltivec too
+        oris   r2,r2,hi16(MASK(MSR_VEC))               // enable altivec for G4 (ignored if G3)
+        mtmsr  r2                                                              // turn EE off, FP and VEC on
+        isync
+        bt++   pf64Bitb,pmap_copy_64                   // skip if 64-bit processor (only they take hint)
+               slwi    r3,r3,12                                                // get page address from page num
+               slwi    r4,r4,12                                                // get page address from page num
+        rlwinm r12,r2,0,MSR_DR_BIT+1,MSR_DR_BIT-1      // get ready to turn off DR
+        bt             pfAltivecb,pmap_copy_g4                 // altivec but not 64-bit means G4
+        
+        
+        // G3 -- copy using FPRs
+        
+        stfd   f0,FM_SIZE+0(r1)                                // save the 4 FPRs we use to copy
+        stfd   f1,FM_SIZE+8(r1)
+        li             r5,PPC_PGBYTES/32                               // count of cache lines in a page
+        stfd   f2,FM_SIZE+16(r1)
+        mtctr  r5
+        stfd   f3,FM_SIZE+24(r1)
+        mtmsr  r12                                                             // turn off DR after saving FPRs on stack
+        isync
+        
+pmap_g3_copy_loop:                                                             // loop over 32-byte cache lines
+        dcbz   0,r4                                                    // avoid read of dest line
+        lfd            f0,0(r3)
+        lfd            f1,8(r3)
+        lfd            f2,16(r3)
+        lfd            f3,24(r3)
+        addi   r3,r3,32
+        stfd   f0,0(r4)
+        stfd   f1,8(r4)
+        stfd   f2,16(r4)
+        stfd   f3,24(r4)
+        dcbst  0,r4                                                    // flush dest line to RAM
+        addi   r4,r4,32
+        bdnz   pmap_g3_copy_loop
+        
+        sync                                                                   // wait for stores to take
+        subi   r4,r4,PPC_PGBYTES                               // restore ptr to destintation page
+        li             r6,PPC_PGBYTES-32                               // point to last line in page
+pmap_g3_icache_flush:
+        subic. r5,r6,32                                                // more to go?
+        icbi   r4,r6                                                   // flush another line in icache
+        subi   r6,r5,32                                                // get offset to next line
+        icbi   r4,r5
+        bne            pmap_g3_icache_flush
+        
+        sync
+        mtmsr  r2                                                              // turn DR back on
+        isync
+        lfd            f0,FM_SIZE+0(r1)                                // restore the FPRs
+        lfd            f1,FM_SIZE+8(r1)
+        lfd            f2,FM_SIZE+16(r1)
+        lfd            f3,FM_SIZE+24(r1)        
+        
+        b              pmap_g4_restore                                 // restore MSR and done
+
+        
+        // G4 -- copy using VRs
+
+pmap_copy_g4:                                                                  // r2=(MSR-EE), r12=(r2-DR), r10=features, r11=old MSR
+        la             r9,FM_SIZE+16(r1)                               // place where we save VRs to r9
+        li             r5,16                                                   // load x-form offsets into r5-r9
+        li             r6,32                                                   // another offset
+        stvx   v0,0,r9                                                 // save some VRs so we can use to copy
+        li             r7,48                                                   // another offset
+        stvx   v1,r5,r9
+        li             r0,PPC_PGBYTES/64                               // we loop over 64-byte chunks
+        stvx   v2,r6,r9
+        mtctr  r0
+        li             r8,96                                                   // get look-ahead for touch
+        stvx   v3,r7,r9
+        li             r9,128
+        mtmsr  r12                                                             // now we've saved VRs on stack, turn off DR
+        isync                                                                  // wait for it to happen
+        b              pmap_g4_copy_loop
+        
+        .align 5                                                               // align inner loops
+pmap_g4_copy_loop:                                                             // loop over 64-byte chunks
+        dcbt   r3,r8                                                   // touch 3 lines ahead
+        nop                                                                            // avoid a 17-word loop...
+        dcbt   r3,r9                                                   // touch 4 lines ahead
+        nop                                                                            // more padding
+        dcba   0,r4                                                    // avoid pre-fetch of 1st dest line
+        lvx            v0,0,r3                                                 // offset 0
+        lvx            v1,r5,r3                                                // offset 16
+        lvx            v2,r6,r3                                                // offset 32
+        lvx            v3,r7,r3                                                // offset 48
+        addi   r3,r3,64
+        dcba   r6,r4                                                   // avoid pre-fetch of 2nd line
+        stvx   v0,0,r4                                                 // offset 0
+        stvx   v1,r5,r4                                                // offset 16
+        stvx   v2,r6,r4                                                // offset 32
+        stvx   v3,r7,r4                                                // offset 48
+        dcbf   0,r4                                                    // push line 1
+        dcbf   r6,r4                                                   // and line 2
+        addi   r4,r4,64
+        bdnz   pmap_g4_copy_loop
+
+        sync                                                                   // wait for stores to take
+        subi   r4,r4,PPC_PGBYTES                               // restore ptr to destintation page
+        li             r8,PPC_PGBYTES-32                               // point to last line in page
+pmap_g4_icache_flush:
+        subic. r9,r8,32                                                // more to go?
+        icbi   r4,r8                                                   // flush from icache
+        subi   r8,r9,32                                                // get offset to next line
+        icbi   r4,r9
+        bne            pmap_g4_icache_flush
+        
+        sync
+        mtmsr  r2                                                              // turn DR back on
+        isync
+        la             r9,FM_SIZE+16(r1)                               // get base of VR save area
+        lvx            v0,0,r9                                                 // restore the VRs
+        lvx            v1,r5,r9
+        lvx            v2,r6,r9
+        lvx            v3,r7,r9        
+        
+pmap_g4_restore:                                                               // r11=MSR
+        mtmsr  r11                                                             // turn EE on, VEC and FR off
+        isync                                                                  // wait for it to happen
+        addi   r1,r1,kSFSize                                   // pop off our stack frame
+        lwz            r0,8(r1)                                                // restore return address
+        mtlr   r0
+        blr
+        
+        
+        // 64-bit/128-byte processor: copy using VRs
+        
+pmap_copy_64:                                                                  // r10=features, r11=old MSR
+               sldi    r3,r3,12                                                // get page address from page num
+               sldi    r4,r4,12                                                // get page address from page num
+               la              r9,FM_SIZE+16(r1)                               // get base of VR save area
+        li             r5,16                                                   // load x-form offsets into r5-r9
+        li             r6,32                                                   // another offset
+        bf             pfAltivecb,pmap_novmx_copy              // altivec suppressed...
+        stvx   v0,0,r9                                                 // save 8 VRs so we can copy wo bubbles
+        stvx   v1,r5,r9
+        li             r7,48                                                   // another offset
+        li             r0,PPC_PGBYTES/128                              // we loop over 128-byte chunks
+        stvx   v2,r6,r9
+        stvx   v3,r7,r9
+        addi   r9,r9,64                                                // advance base ptr so we can store another 4
+        mtctr  r0
+        li             r0,MASK(MSR_DR)                                 // get DR bit
+        stvx   v4,0,r9
+        stvx   v5,r5,r9
+        andc   r12,r2,r0                                               // turn off DR bit
+        li             r0,1                                                    // get a 1 to slam into SF
+        stvx   v6,r6,r9
+        stvx   v7,r7,r9
+        rldimi r12,r0,63,MSR_SF_BIT                    // set SF bit (bit 0)
+        li             r8,-128                                                 // offset so we can reach back one line
+        mtmsrd r12                                                             // now we've saved VRs, turn DR off and SF on
+        isync                                                                  // wait for it to happen
+        dcbt128        0,r3,1                                                  // start a forward stream
+        b              pmap_64_copy_loop
+        
+        .align 5                                                               // align inner loops
+pmap_64_copy_loop:                                                             // loop over 128-byte chunks
+        dcbz128        0,r4                                                    // avoid read of destination line
+        lvx            v0,0,r3                                                 // offset 0
+        lvx            v1,r5,r3                                                // offset 16
+        lvx            v2,r6,r3                                                // offset 32
+        lvx            v3,r7,r3                                                // offset 48
+        addi   r3,r3,64                                                // don't have enough GPRs so add 64 2x
+        lvx            v4,0,r3                                                 // offset 64
+        lvx            v5,r5,r3                                                // offset 80
+        lvx            v6,r6,r3                                                // offset 96
+        lvx            v7,r7,r3                                                // offset 112
+        addi   r3,r3,64
+        stvx   v0,0,r4                                                 // offset 0
+        stvx   v1,r5,r4                                                // offset 16
+        stvx   v2,r6,r4                                                // offset 32
+        stvx   v3,r7,r4                                                // offset 48
+        addi   r4,r4,64
+        stvx   v4,0,r4                                                 // offset 64
+        stvx   v5,r5,r4                                                // offset 80
+        stvx   v6,r6,r4                                                // offset 96
+        stvx   v7,r7,r4                                                // offset 112
+        addi   r4,r4,64
+        dcbf   r8,r4                                                   // flush the line we just wrote
+        bdnz   pmap_64_copy_loop
+
+        sync                                                                   // wait for stores to take
+        subi   r4,r4,PPC_PGBYTES                               // restore ptr to destintation page
+        li             r8,PPC_PGBYTES-128                              // point to last line in page
+pmap_64_icache_flush:
+        subic. r9,r8,128                                               // more to go?
+        icbi   r4,r8                                                   // flush from icache
+        subi   r8,r9,128                                               // get offset to next line
+        icbi   r4,r9
+        bne            pmap_64_icache_flush
+        
+        sync
+        mtmsrd r2                                                              // turn DR back on, SF off
+        isync
+        la             r9,FM_SIZE+16(r1)                               // get base address of VR save area on stack
+        lvx            v0,0,r9                                                 // restore the VRs
+        lvx            v1,r5,r9
+        lvx            v2,r6,r9
+        lvx            v3,r7,r9
+        addi   r9,r9,64        
+        lvx            v4,0,r9
+        lvx            v5,r5,r9
+        lvx            v6,r6,r9
+        lvx            v7,r7,r9
+
+        b              pmap_g4_restore                                 // restore lower half of MSR and return
+
+ //
+ //            Copy on 64-bit without VMX
+ //
+
+pmap_novmx_copy:        
+               li              r0,PPC_PGBYTES/128                              // we loop over 128-byte chunks
+               mtctr   r0
+               li              r0,MASK(MSR_DR)                                 // get DR bit
+               andc    r12,r2,r0                                               // turn off DR bit
+               li              r0,1                                                    // get a 1 to slam into SF
+               rldimi  r12,r0,63,MSR_SF_BIT                    // set SF bit (bit 0)
+               mtmsrd  r12                                                             // now we've saved VRs, turn DR off and SF on
+               isync                                                                   // wait for it to happen
+               dcbt128 0,r3,1                                                  // start a forward stream 
+       
+pmap_novmx_copy_loop:                                                  // loop over 128-byte cache lines
+        dcbz128        0,r4                                                    // avoid read of dest line
+        
+        ld             r0,0(r3)                                                // Load half a line
+        ld             r12,8(r3)
+        ld             r5,16(r3)
+        ld             r6,24(r3)
+        ld             r7,32(r3)
+        ld             r8,40(r3)
+        ld             r9,48(r3)
+        ld             r10,56(r3)
+        
+        std            r0,0(r4)                                                // Store half a line
+        std            r12,8(r4)
+        std            r5,16(r4)
+        std            r6,24(r4)
+        std            r7,32(r4)
+        std            r8,40(r4)
+        std            r9,48(r4)
+        std            r10,56(r4)
+        
+        ld             r0,64(r3)                                               // Load half a line
+        ld             r12,72(r3)
+        ld             r5,80(r3)
+        ld             r6,88(r3)
+        ld             r7,96(r3)
+        ld             r8,104(r3)
+        ld             r9,112(r3)
+        ld             r10,120(r3)
+        
+        addi   r3,r3,128
+        std            r0,64(r4)                                               // Store half a line
+        std            r12,72(r4)
+        std            r5,80(r4)
+        std            r6,88(r4)
+        std            r7,96(r4)
+        std            r8,104(r4)
+        std            r9,112(r4)
+        std            r10,120(r4)
+        
+        dcbf   0,r4                                                    // flush the line we just wrote
+               addi    r4,r4,128
+        bdnz   pmap_novmx_copy_loop
+
+        sync                                                                   // wait for stores to take
+        subi   r4,r4,PPC_PGBYTES                               // restore ptr to destintation page
+        li             r8,PPC_PGBYTES-128                              // point to last line in page
+
+pmap_novmx_icache_flush:
+        subic. r9,r8,128                                               // more to go?
+        icbi   r4,r8                                                   // flush from icache
+        subi   r8,r9,128                                               // get offset to next line
+        icbi   r4,r9
+        bne            pmap_novmx_icache_flush
+        
+        sync
+        mtmsrd r2                                                              // turn DR back on, SF off
+        isync
+
+        b              pmap_g4_restore                                 // restore lower half of MSR and return
+
+
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>   
                
-               mfsprg  r8,1                                                    /* Get the current act */ 
-               lwz             r10,ACT_THREAD(r8)
-               lis             r11,hi16(.L_copyinout_error)
-               lwz             r8,ACT_VMMAP(r8)
-               ori             r11,r11,lo16(.L_copyinout_error)
-               add             r9,r3,r5                                                /* Get the end of the source */
-               lwz             r8,VMMAP_PMAP(r8)                               ; Get the pmap
-               rlwinm  r12,r3,6,26,29                                  ; Get index to the segment slot
-               subi    r9,r9,1                                                 /* Make sure we don't go too far */
-               add             r8,r8,r12                                               ; Start indexing to the segment value
-               stw             r11,THREAD_RECOVER(r10)
-               xor             r9,r9,r3                                                /* Smoosh 'em together */
-               lwz             r8,PMAP_SEGS(r8)                                ; Get the source SR value
-               rlwinm. r9,r9,0,1,3                                             /* Top nybble equal? */
-               mtsr    SR_COPYIN,r8                                    ; Set the SR
-               isync
-#if 0
-               lis             r0,HIGH_ADDR(EXT(dbgRegsCall))  /* (TEST/DEBUG) */      
-               ori             r0,r0,LOW_ADDR(EXT(dbgRegsCall))        /* (TEST/DEBUG) */      
-               sc                                                                              /* (TEST/DEBUG) */
-#endif
-       
-/* For optimization, we check if the copyin lies on a segment
- * boundary. If it doesn't, we can use a simple copy. If it
- * does, we split it into two separate copies in some C code.
- */
-       
-               bne-    .L_call_copyin_multiple                 /* Nope, we went past the segment boundary... */
-
-               rlwinm  r3,r3,0,4,31
-               oris    r3,r3,(SR_COPYIN_NUM << (28-16))        /* Set the copyin segment as the source */
-       
-               bl              EXT(bcopy)
-               
-/* Now that copyin is done, we don't need a recovery point */
-
-               addi    r1,r1,FM_SIZE+16
-               mfsprg  r6,1                                                    /* Get the current act */ 
-               lwz             r10,ACT_THREAD(r6)
-               li              r3,0
-               lwz             r0,FM_LR_SAVE(r1)
-               stw             r3,THREAD_RECOVER(r10)                  /* Clear recovery */
-               mtlr    r0
-               blr
-       
-/* we get here via the exception handler if an illegal
- * user memory reference was made.
- */
-.L_copyinout_error:
-
-/* Now that copyin is done, we don't need a recovery point */
-       
-               mfsprg  r6,1                                                    /* Get the current act */ 
-               addi    r1,r1,FM_SIZE+16
-               lwz             r10,ACT_THREAD(r6)
-               li              r4,0
-               lwz             r0,FM_LR_SAVE(r1)
-               stw             r4,THREAD_RECOVER(r10)                  /* Clear recovery */
-               mtlr    r0
-               li              r3,EFAULT                                                       ; Indicate error (EFAULT) 
-               blr
-
-.L_copyinout_trivial:
-       /* The copyin/out was for either 0 bytes or a negative
-        * number of bytes, return an appropriate value (0 == SUCCESS).
-        * cr0 still contains result of comparison of len with 0.
-        */
-       li      r3,     0
-       beq+    cr0,    .L_copyinout_negative
-       li      r3,     1
-.L_copyinout_negative:
-
-       /* unwind the stack */
-       addi    r1,     r1,     FM_SIZE+16
-       lwz     r0,     FM_LR_SAVE(r1)
-       mtlr    r0
-
-       blr
-
-.L_call_copyin_multiple:
-
-       /* unwind the stack */
-       addi    r1,     r1,     FM_SIZE+16
-       lwz     r0,     FM_LR_SAVE(r1)
-       mtlr    r0
-
-       b       EXT(copyin_multiple)                            /* not a call - a jump! */
-
+// Stack frame format used by copyin, copyout, copyinstr and copyoutstr.
+// These routines all run both on 32 and 64-bit machines, though because they are called
+// by the BSD kernel they are always in 32-bit mode when entered.  The mapped ptr returned
+// by MapUserAddressSpace will be 64 bits however on 64-bit machines.  Beware to avoid
+// using compare instructions on this ptr.  This mapped ptr is kept globally in r31, so there
+// is no need to store or load it, which are mode-dependent operations since it could be
+// 32 or 64 bits.
+
+#define        kkFrameSize     (FM_SIZE+32)
+
+#define        kkBufSize       (FM_SIZE+0)
+#define        kkCR            (FM_SIZE+4)
+#define        kkSource        (FM_SIZE+8)
+#define        kkDest          (FM_SIZE+12)
+#define        kkCountPtr      (FM_SIZE+16)
+#define        kkR31Save       (FM_SIZE+20)
+// nonvolatile CR bits we use as flags in cr3
+
+#define        kk64bit         12
+#define        kkNull          13
+#define        kkIn            14
+#define        kkString        15
+#define        kkZero          15
+
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 /*
  * int
- * copyout(src, dst, count)
+ * copyoutstr(src, dst, maxcount, count)
  *     vm_offset_t     src;
  *     vm_offset_t     dst;
- *     int             count;
+ *     vm_size_t       maxcount; 
+ *     vm_size_t*      count;
  *
+ * Set *count to the number of bytes copied.
  */
 
-ENTRY2(copyout, copyoutmsg, TAG_NO_FRAME_USED)
+ENTRY(copyoutstr, TAG_NO_FRAME_USED)
+        mfcr   r2                                                              // we use nonvolatile cr3
+        li             r0,0
+        crset  kkString                                                // flag as a string op
+        mr             r10,r4                                                  // for copyout, dest ptr (r4) is in user space
+        stw            r0,0(r6)                                                // initialize #bytes moved
+        crclr  kkIn                                                    // flag as copyout
+        b              copyJoin
 
-/* Preamble allowing us to call a sub-function */
-
-               mflr    r0
-               stw             r0,FM_LR_SAVE(r1)
-               stwu    r1,-(FM_SIZE+16)(r1)
-               
-#if 0
-               stw             r3,FM_SIZE+0(r1)                                /* (TEST/DEBUG) */
-               stw             r4,FM_SIZE+4(r1)                                /* (TEST/DEBUG) */
-               stw             r5,FM_SIZE+8(r1)                                /* (TEST/DEBUG) */
-               mr              r6,r0                                                   /* (TEST/DEBUG) */
-               
-               bl              EXT(tracecopyout)                               /* (TEST/DEBUG) */
-               
-               lwz             r3,FM_SIZE+0(r1)                                /* (TEST/DEBUG) */
-               lwz             r4,FM_SIZE+4(r1)                                /* (TEST/DEBUG) */
-               lwz             r5,FM_SIZE+8(r1)                                /* (TEST/DEBUG) */
-#endif
-       
-               cmpli   cr0,r5,0
-               ble-    cr0,.L_copyinout_trivial
-/* we know we have a valid copyout to do now */
-/* Set up thread_recover in case we hit an illegal address */
-               
-
-               mfsprg  r8,1                                                    /* Get the current act */
-               lwz             r10,ACT_THREAD(r8)
-               lis             r11,HIGH_ADDR(.L_copyinout_error)
-               lwz             r8,ACT_VMMAP(r8)
-               rlwinm  r12,r4,6,26,29                                  ; Get index to the segment slot
-               ori             r11,r11,LOW_ADDR(.L_copyinout_error)
-               add             r9,r4,r5                                                /* Get the end of the destination */
-               lwz             r8,VMMAP_PMAP(r8)
-               subi    r9,r9,1                                                 /* Make sure we don't go too far */
-               add             r8,r8,r12                                               ; Start indexing to the segment value
-               stw             r11,THREAD_RECOVER(r10)
-               xor             r9,r9,r4                                                /* Smoosh 'em together */
-               lwz             r8,PMAP_SEGS(r8)                                ; Get the source SR value
-               rlwinm. r9,r9,0,1,3                                             /* Top nybble equal? */
-               mtsr    SR_COPYIN,r8
-               isync
-       
-       
-/* For optimisation, we check if the copyout lies on a segment
- * boundary. If it doesn't, we can use a simple copy. If it
- * does, we split it into two separate copies in some C code.
- */
-       
-               bne-    .L_call_copyout_multiple                /* Nope, we went past the segment boundary... */
-
-               rlwinm  r4,r4,0,4,31
-               oris    r4,r4,(SR_COPYIN_NUM << (28-16))        /* Set the copyin segment as the source */
-       
-               bl      EXT(bcopy)
-               
-/* Now that copyout is done, we don't need a recovery point */
-               mfsprg  r6,1                                                    /* Get the current act */
-               addi    r1,r1,FM_SIZE+16
-               lwz             r10,ACT_THREAD(r6)
-               li              r3,0
-               lwz             r0,FM_LR_SAVE(r1)
-               stw             r3,THREAD_RECOVER(r10)                  /* Clear recovery */
-               mtlr    r0
-               blr
-
-.L_call_copyout_multiple:
-       /* unwind the stack */
-       addi    r1,     r1,     FM_SIZE+16
-       lwz     r0,     FM_LR_SAVE(r1)
-       mtlr    r0
-
-       b       EXT(copyout_multiple)                                   /* not a call - a jump! */
 
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 /*
- * boolean_t
- * copyinstr(src, dst, count, maxcount)
+ * int
+ * copyinstr(src, dst, maxcount, count)
  *     vm_offset_t     src;
  *     vm_offset_t     dst;
  *     vm_size_t       maxcount; 
  *     vm_size_t*      count;
  *
  * Set *count to the number of bytes copied
- * 
  * If dst == NULL, don't copy, just count bytes.
  * Only currently called from klcopyinstr. 
  */
 
 ENTRY(copyinstr, TAG_NO_FRAME_USED)
-
-/* Preamble allowing us to call a sub-function */
-               mflr    r0
-               stw             r0,FM_LR_SAVE(r1)
-               stwu    r1,-(FM_SIZE+16)(r1)
-
-#if 0
-               stw             r3,FM_SIZE+0(r1)                                /* (TEST/DEBUG) */
-               stw             r4,FM_SIZE+4(r1)                                /* (TEST/DEBUG) */
-               stw             r5,FM_SIZE+8(r1)                                /* (TEST/DEBUG) */
-               stw             r6,FM_SIZE+12(r1)                               /* (TEST/DEBUG) */
-               mr              r7,r0                                                   /* (TEST/DEBUG) */
-               
-               bl              EXT(tracecopystr)                               /* (TEST/DEBUG) */
-               
-               lwz             r3,FM_SIZE+0(r1)                                /* (TEST/DEBUG) */
-               lwz             r4,FM_SIZE+4(r1)                                /* (TEST/DEBUG) */
-               lwz             r5,FM_SIZE+8(r1)                                /* (TEST/DEBUG) */
-               stw             r6,FM_SIZE+12(r1)                               /* (TEST/DEBUG) */
-#endif
-                               
-               cmpli   cr0,r5,0
-               ble-    cr0,.L_copyinout_trivial
-
-/* we know we have a valid copyin to do now */
-/* Set up thread_recover in case we hit an illegal address */
-               
-               li              r0,0                                                    
-               mfsprg  r8,1                                                    /* Get the current act */
-               lwz             r10,ACT_THREAD(r8)
-               stw             r0,0(r6)                                                /* Clear result length */
-               lis             r11,HIGH_ADDR(.L_copyinout_error)
-               lwz             r8,ACT_VMMAP(r8)                                ; Get the map for this activation
-               rlwinm  r12,r3,6,26,29                                  ; Get index to the segment slot
-               lwz             r8,VMMAP_PMAP(r8)
-               ori             r11,r11,LOW_ADDR(.L_copyinout_error)
-               add             r8,r8,r12                                               ; Start indexing to the segment value
-               stw             r11,THREAD_RECOVER(r10)
-               rlwinm  r3,r3,0,4,31
-               lwz             r7,PMAP_SEGS(r8)                                ; Get the source SR value
-               oris    r3,r3,(SR_COPYIN_NUM << (28-16))        /* Set the copyin segment as the source */
-
-/* Copy byte by byte for now - TODO NMGS speed this up with
- * some clever (but fairly standard) logic for word copies.
- * We don't use a copyinstr_multiple since copyinstr is called
- * with INT_MAX in the linux server. Eugh.
+        mfcr   r2                                                              // we use nonvolatile cr3
+        cmplwi r4,0                                                    // dst==NULL?
+        li             r0,0
+        crset  kkString                                                // flag as a string op
+        mr             r10,r3                                                  // for copyin, source ptr (r3) is in user space
+        crmove kkNull,cr0_eq                                   // remember if (dst==NULL)
+        stw            r0,0(r6)                                                // initialize #bytes moved
+        crset  kkIn                                                    // flag as copyin (rather than copyout)
+        b              copyJoin1                                               // skip over the "crclr kkNull"
+
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+/*
+ * int
+ * copyout(src, dst, count)
+ *     vm_offset_t     src;
+ *     vm_offset_t     dst;
+ *     size_t          count;
  */
 
-               li              r9,0                                                    /* Clear byte counter */
-
-/* If the destination is NULL, don't do writes,
- * just count bytes. We set CR7 outside the loop to save time
+                       .align  5
+                       .globl  EXT(copyout)
+                       .globl  EXT(copyoutmsg)
+
+LEXT(copyout)
+LEXT(copyoutmsg)
+
+#if INSTRUMENT
+                       mfspr   r12,pmc1                                                ; INSTRUMENT - saveinstr[12] - Take stamp at copyout
+                       stw             r12,0x6100+(12*16)+0x0(0)               ; INSTRUMENT - Save it
+                       mfspr   r12,pmc2                                                ; INSTRUMENT - Get stamp
+                       stw             r12,0x6100+(12*16)+0x4(0)               ; INSTRUMENT - Save it
+                       mfspr   r12,pmc3                                                ; INSTRUMENT - Get stamp
+                       stw             r12,0x6100+(12*16)+0x8(0)               ; INSTRUMENT - Save it
+                       mfspr   r12,pmc4                                                ; INSTRUMENT - Get stamp
+                       stw             r12,0x6100+(12*16)+0xC(0)               ; INSTRUMENT - Save it
+#endif                 
+        mfcr   r2                                                              // save caller's CR
+        crclr  kkString                                                // not a string version
+        mr             r10,r4                                                  // dest (r4) is user-space ptr
+        crclr  kkIn                                                    // flag as copyout
+        b              copyJoin
+        
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+/*
+ * int
+ * copyin(src, dst, count)
+ *     vm_offset_t     src;
+ *     vm_offset_t     dst;
+ *     size_t          count;
  */
-               cmpwi   cr7,r4,0                                                /* Is the destination null? */
-               
-nxtseg:        mtsr    SR_COPYIN,r7                                    /* Set the source SR */
-               isync
 
-.L_copyinstr_loop:
-               lbz             r0,0(r3)                                                /* Get the source */
-               addic.  r5,r5,-1                                                /* Have we gone far enough? */
-               addi    r3,r3,1                                                 /* Bump source pointer */
-               
-               cmpwi   cr1,r0,0                                                /* Did we hit a null? */
 
-               beq             cr7,.L_copyinstr_no_store               /* If we are just counting, skip the store... */
-       
-               stb             r0,0(r4)                                                /* Move to sink */
-               addi    r4,r4,1                                                 /* Advance sink pointer */
+                       .align  5
+                       .globl  EXT(copyin)
+                       .globl  EXT(copyinmsg)
+
+LEXT(copyin)
+LEXT(copyinmsg)
+
+        mfcr   r2                                                              // save caller's CR
+        crclr  kkString                                                // not a string version
+        mr             r10,r3                                                  // source (r3) is user-space ptr in copyin
+        crset  kkIn                                                    // flag as copyin
+        
+        
+// Common code to handle setup for all the copy variants:
+//             r2 = caller's CR, since we use cr3
+//   r3-r6 = parameters
+//        r10 = user-space ptr (r3 if copyin, r4 if copyout)
+//     cr3 = kkIn, kkString, kkNull flags
+
+copyJoin:
+        crclr  kkNull                                                  // (dst==NULL) convention not used with this call
+copyJoin1:                                                                             // enter from copyinstr with kkNull set
+               mflr    r0                                                              // get return address
+        cmplwi r5,0                                                    // buffer length 0?
+        lis            r9,0x1000                                               // r9 <- 0x10000000 (256MB)
+               stw             r0,FM_LR_SAVE(r1)                               // save return
+        cmplw  cr1,r5,r9                                               // buffer length > 256MB ?
+        mfsprg r8,2                                                    // get the features
+        beq--  copyinout_0                                             // 0 length is degenerate case
+               stwu    r1,-kkFrameSize(r1)                             // set up stack frame
+        stw            r2,kkCR(r1)                                             // save caller's CR since we use cr3
+        mtcrf  0x02,r8                                                 // move pf64Bit to cr6
+        stw            r3,kkSource(r1)                                 // save args across MapUserAddressSpace
+        stw            r4,kkDest(r1)
+        stw            r5,kkBufSize(r1)
+        crmove kk64bit,pf64Bitb                                // remember if this is a 64-bit processor
+        stw            r6,kkCountPtr(r1)
+        stw            r31,kkR31Save(r1)                               // we use r31 globally for mapped user ptr
+        li             r31,0                                                   // no mapped ptr yet
+        
+        
+// Handle buffer length > 256MB.  This is an error (ENAMETOOLONG) on copyin and copyout.
+// The string ops are passed -1 lengths by some BSD callers, so for them we silently clamp
+// the buffer length to 256MB.  This isn't an issue if the string is less than 256MB
+// (as most are!), but if they are >256MB we eventually return ENAMETOOLONG.  This restriction
+// is due to MapUserAddressSpace; we don't want to consume more than two segments for
+// the mapping. 
+
+        ble++  cr1,copyin0                                             // skip if buffer length <= 256MB
+        bf             kkString,copyinout_too_big              // error if not string op
+        mr             r5,r9                                                   // silently clamp buffer length to 256MB
+        stw            r9,kkBufSize(r1)                                // update saved copy too
+
+
+// Set up thread_recover in case we hit an illegal address.
+
+copyin0:
+               mfsprg  r8,1                                                    /* Get the current act */ 
+               lis             r2,hi16(copyinout_error)
+               lwz             r7,ACT_THREAD(r8)
+               ori             r2,r2,lo16(copyinout_error)
+               lwz             r3,ACT_VMMAP(r8)                                // r3 <- vm_map virtual address
+               stw             r2,THREAD_RECOVER(r7)
+
+
+// Map user segment into kernel map, turn on 64-bit mode.
+//             r3 = vm map
+//             r5 = buffer length
+//        r10 = user space ptr (r3 if copyin, r4 if copyout)
+        
+               mr              r6,r5                                                   // Set length to map
+               li              r4,0                                                    // Note: we only do this 32-bit for now
+        mr             r5,r10                                                  // arg2 <- user space ptr
+#if INSTRUMENT
+                       mfspr   r12,pmc1                                                ; INSTRUMENT - saveinstr[13] - Take stamp before mapuseraddressspace
+                       stw             r12,0x6100+(13*16)+0x0(0)               ; INSTRUMENT - Save it
+                       mfspr   r12,pmc2                                                ; INSTRUMENT - Get stamp
+                       stw             r12,0x6100+(13*16)+0x4(0)               ; INSTRUMENT - Save it
+                       mfspr   r12,pmc3                                                ; INSTRUMENT - Get stamp
+                       stw             r12,0x6100+(13*16)+0x8(0)               ; INSTRUMENT - Save it
+                       mfspr   r12,pmc4                                                ; INSTRUMENT - Get stamp
+                       stw             r12,0x6100+(13*16)+0xC(0)               ; INSTRUMENT - Save it
+#endif                 
+        bl             EXT(MapUserAddressSpace)                // set r3 <- address in kernel map of user operand
+#if INSTRUMENT
+                       mfspr   r12,pmc1                                                ; INSTRUMENT - saveinstr[14] - Take stamp after mapuseraddressspace
+                       stw             r12,0x6100+(14*16)+0x0(0)               ; INSTRUMENT - Save it
+                       mfspr   r12,pmc2                                                ; INSTRUMENT - Get stamp
+                       stw             r12,0x6100+(14*16)+0x4(0)               ; INSTRUMENT - Save it
+                       mfspr   r12,pmc3                                                ; INSTRUMENT - Get stamp
+                       stw             r12,0x6100+(14*16)+0x8(0)               ; INSTRUMENT - Save it
+                       mfspr   r12,pmc4                                                ; INSTRUMENT - Get stamp
+                       stw             r12,0x6100+(14*16)+0xC(0)               ; INSTRUMENT - Save it
+#endif                 
+               or.             r0,r3,r4                                                // Did we fail the mapping?
+        mr             r31,r4                                                  // r31 <- mapped ptr into user space (may be 64-bit)
+        beq--  copyinout_error                                 // was 0, so there was an error making the mapping
+        bf--   kk64bit,copyin1                                 // skip if a 32-bit processor
+               rldimi  r31,r3,32,0                                             // slam high-order bits into mapped ptr
+        mfmsr  r4                                                              // if 64-bit, turn on SF so we can use returned ptr
+        li             r0,1
+        rldimi r4,r0,63,MSR_SF_BIT                             // light bit 0
+        mtmsrd r4                                                              // turn on 64-bit mode
+        isync                                                                  // wait for mode to change
+        
+        
+// Load r3-r5, substituting mapped ptr as appropriate.
+
+copyin1:
+        lwz            r5,kkBufSize(r1)                                // restore length to copy
+        bf             kkIn,copyin2                                    // skip if copyout
+        lwz            r4,kkDest(r1)                                   // copyin: source is mapped, dest is r4 at entry
+        mr             r3,r31                                                  // source is mapped ptr
+        b              copyin3
+copyin2:                                                                               // handle copyout
+        lwz            r3,kkSource(r1)                                 // source is kernel buffer (r3 at entry)
+        mr             r4,r31                                                  // dest is mapped ptr into user space
+        
+        
+// Finally, all set up to copy:
+//             r3 = source ptr (mapped if copyin)
+//             r4 = dest ptr (mapped if copyout)
+//             r5 = length
+//        r31 = mapped ptr returned by MapUserAddressSpace
+//        cr3 = kkIn, kkString, kk64bit, and kkNull flags
+
+copyin3:
+        bt             kkString,copyString                             // handle copyinstr and copyoutstr
+        bl             EXT(bcopy)                                              // copyin and copyout: let bcopy do the work
+        li             r3,0                                                    // return success
+        
+        
+// Main exit point for copyin, copyout, copyinstr, and copyoutstr.  Also reached
+// from error recovery if we get a DSI accessing user space.  Clear recovery ptr, 
+// and pop off frame.  Note that we have kept
+// the mapped ptr into user space in r31, as a reg64_t type (ie, a 64-bit ptr on
+// 64-bit machines.)  We must unpack r31 into an addr64_t in (r3,r4) before passing
+// it to ReleaseUserAddressSpace.
+//             r3 = 0, EFAULT, or ENAMETOOLONG
+
+copyinx: 
+        lwz            r2,kkCR(r1)                                             // get callers cr3
+               mfsprg  r6,1                                                    // Get the current act 
+               lwz             r10,ACT_THREAD(r6)
+               
+        bf--   kk64bit,copyinx1                                // skip if 32-bit processor
+        mfmsr  r12
+        rldicl r12,r12,0,MSR_SF_BIT+1                  // if 64-bit processor, turn 64-bit mode off
+        mtmsrd r12                                                             // turn SF off and EE back on
+        isync                                                                  // wait for the mode to change
+copyinx1:
+        lwz            r31,kkR31Save(r1)                               // restore callers r31
+        addi   r1,r1,kkFrameSize                               // pop off our stack frame
+               lwz             r0,FM_LR_SAVE(r1)
+               li              r4,0
+               stw             r4,THREAD_RECOVER(r10)                  // Clear recovery
+               mtlr    r0
+        mtcrf  0x10,r2                                                 // restore cr3
+               blr
 
-.L_copyinstr_no_store:
 
-               addi    r9,r9,1                                                 /* Count the character */
-               beq-    cr1,.L_copyinstr_done                   /* We're done if we did a null... */
-               beq-    cr0,L_copyinstr_toobig                  /* Also if we maxed the count... */
-       
-/* Check to see if the copyin pointer has moved out of the
- * copyin segment, if it has we must remap.
+/* We get here via the exception handler if an illegal
+ * user memory reference was made.  This error handler is used by
+ * copyin, copyout, copyinstr, and copyoutstr.  Registers are as
+ * they were at point of fault, so for example cr3 flags are valid.
  */
 
-               rlwinm. r0,r3,0,4,31                                    /* Did we wrap around to 0? */
-               bne+    cr0,.L_copyinstr_loop                   /* Nope... */
-
-               lwz             r7,PMAP_SEGS+4(r8)                              ; Get the next source SR value
-               addi    r8,r8,4                                                 ; Point to the next segment
-               oris    r3,r0,(SR_COPYIN_NUM << (28-16))        /* Reset the segment number */
-               b               nxtseg                                                  /* Keep going... */
-       
-L_copyinstr_toobig:
-               li              r3,ENAMETOOLONG
-               b               L_copyinstr_return
-.L_copyinstr_done:
-               li              r3,0                                                    /* Normal return */
-L_copyinstr_return:
-               li              r4,0                                                    /* to clear thread_recover */
-               stw             r9,0(r6)                                                /* Set how many bytes we did */
-               stw             r4,THREAD_RECOVER(r10)                  /* Clear recovery exit */
-
-               addi    r1,     r1,     FM_SIZE+16
-               lwz             r0,     FM_LR_SAVE(r1)
-               mtlr    r0
-               blr
+copyinout_error:
+        li             r3,EFAULT                                               // return error
+        b              copyinx
+
+copyinout_0:                                                                   // degenerate case: 0-length copy
+               mtcrf   0x10,r2                                                 // restore cr3
+        li             r3,0                                                    // return success
+        blr
+        
+copyinout_too_big:                                                             // degenerate case
+        mtcrf  0x10,r2                                                 // restore cr3
+        lwz            r1,0(r1)                                                // pop off stack frame
+        li             r3,ENAMETOOLONG
+        blr
+        
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+// Handle copyinstr and copyoutstr.  At this point the stack frame is set up,
+// the recovery ptr is set, the user's buffer is mapped, we're in 64-bit mode
+// if necessary, and:
+//             r3 = source ptr, mapped if copyinstr
+//             r4 = dest ptr, mapped if copyoutstr
+//             r5 = buffer length
+//        r31 = mapped ptr returned by MapUserAddressSpace
+//     cr3 = kkIn, kkString, kkNull, and kk64bit flags
+// We do word copies unless the buffer is very short, then use a byte copy loop
+// for the leftovers if necessary.
+
+copyString:
+        li             r12,0                                                   // Set header bytes count to zero
+        cmplwi cr1,r5,20                                               // is buffer very short?
+        mtctr  r5                                                              // assuming short, set up loop count for bytes
+        blt            cr1,copyinstr8                                  // too short for word loop
+        andi.  r12,r3,0x3                                              // is source ptr word aligned?
+        bne            copyinstr11                                             //  bytes loop
+copyinstr1:
+        srwi   r6,r5,2                                                 // get #words in buffer
+        mtctr  r6                                                              // set up word loop count
+        lis            r10,hi16(0xFEFEFEFF)                    // load magic constants into r10 and r11
+        lis            r11,hi16(0x80808080)
+        ori            r10,r10,lo16(0xFEFEFEFF)
+        ori            r11,r11,lo16(0x80808080)
+        bf             kkNull,copyinstr6                               // enter loop that copies
+        b              copyinstr5                                              // use loop that just counts
+        
+        
+// Word loop(s).  They do a word-parallel search for 0s, using the following
+// inobvious but very efficient test:
+//             y =  data + 0xFEFEFEFF
+//             z = ~data & 0x80808080
+// If (y & z)==0, then all bytes in dataword are nonzero.  We need two copies of
+// this loop, since if we test kkNull in the loop then it becomes 9 words long.
+
+        .align 5                                                               // align inner loops for speed
+copyinstr5:                                                                            // version that counts but does not copy
+        lwz            r8,0(r3)                                                // get next word of source
+        addi   r3,r3,4                                                 // increment source ptr
+        add            r9,r10,r8                                               // r9 =  data + 0xFEFEFEFF
+        andc   r7,r11,r8                                               // r7 = ~data & 0x80808080
+        and.   r7,r9,r7                                                // r7 = r9 & r7
+        bdnzt  cr0_eq,copyinstr5                               // if r7==0, then all bytes are nonzero
+
+        b              copyinstr7
+
+        .align 5                                                               // align inner loops for speed
+copyinstr6:                                                                            // version that counts and copies
+        lwz            r8,0(r3)                                                // get next word of source
+        addi   r3,r3,4                                                 // increment source ptr
+        addi   r4,r4,4                                                 // increment dest ptr while we wait for data
+        add            r9,r10,r8                                               // r9 =  data + 0xFEFEFEFF
+        andc   r7,r11,r8                                               // r7 = ~data & 0x80808080
+        and.   r7,r9,r7                                                // r7 = r9 & r7
+        stw            r8,-4(r4)                                               // pack all 4 bytes into buffer
+        bdnzt  cr0_eq,copyinstr6                               // if r7==0, then all bytes are nonzero
+
+
+// Either 0 found or buffer filled.  The above algorithm has mapped nonzero bytes to 0
+// and 0 bytes to 0x80 with one exception: 0x01 bytes preceeding the first 0 are also
+// mapped to 0x80.  We must mask out these false hits before searching for an 0x80 byte.
+
+copyinstr7:
+        crnot  kkZero,cr0_eq                                   // 0 found iff cr0_eq is off
+        mfctr  r6                                                              // get #words remaining in buffer
+        rlwinm r2,r8,7,0,31                                    // move 0x01 bits to 0x80 position
+        slwi   r6,r6,2                                                 // convert to #bytes remaining
+        andc   r7,r7,r2                                                // turn off false hits from 0x0100 worst case
+        rlwimi r6,r5,0,30,31                                   // add in odd bytes leftover in buffer
+        srwi   r7,r7,8                                                 // we want to count the 0 as a byte xferred
+        addi   r6,r6,4                                                 // don't count last word xferred (yet)
+        cntlzw r7,r7                                                   // now we can find the 0 byte (ie, the 0x80)
+        srwi   r7,r7,3                                                 // convert 8,16,24,32 to 1,2,3,4
+        sub.   r6,r6,r7                                                // account for nonzero bytes in last word
+        bt++   kkZero,copyinstr10                              // 0 found, so done
+        
+        beq            copyinstr10                                             // r6==0, so buffer truly full
+        mtctr  r6                                                              // 0 not found, loop over r6 bytes
+        b              copyinstr8                                              // enter byte loop for last 1-3 leftover bytes
+        
+
+// Byte loop.  This is used for very small buffers and for the odd bytes left over
+// after searching and copying words at a time.
+    
+        .align 5                                                               // align inner loops for speed
+copyinstr8:                                                                            // loop over bytes of source
+        lbz            r0,0(r3)                                                // get next byte of source
+        addi   r3,r3,1
+        addi   r4,r4,1                                                 // increment dest addr whether we store or not
+        cmpwi  r0,0                                                    // the 0?
+        bt--   kkNull,copyinstr9                               // don't store (was copyinstr with NULL ptr)
+        stb            r0,-1(r4)
+copyinstr9:
+        bdnzf  cr0_eq,copyinstr8                               // loop if byte not 0 and more room in buffer
+        
+        mfctr  r6                                                              // get #bytes left in buffer
+        crmove kkZero,cr0_eq                                   // remember if 0 found or buffer filled
+
+        
+// Buffer filled or 0 found.  Unwind and return.
+//     r5 = kkBufSize, ie buffer length
+//  r6 = untransferred bytes remaining in buffer
+// r31 = mapped ptr returned by MapUserAddressSpace
+// cr3 = kkZero set iff 0 found
+
+copyinstr10:
+        lwz            r9,kkCountPtr(r1)                               // get ptr to place to store count of bytes moved
+        sub            r2,r5,r6                                                // get #bytes we moved, counting the 0 iff any
+        add            r2,r2,r12                                               // add the header bytes count
+        li             r3,0                                                    // assume 0 return status
+        stw            r2,0(r9)                                                // store #bytes moved
+        bt++   kkZero,copyinx                                  // we did find the 0 so return 0
+        li             r3,ENAMETOOLONG                                 // buffer filled
+        b              copyinx                                                 // join main exit routine
+
+// Byte loop.  This is used on the header bytes for unaligned source 
+    
+        .align 5                                                               // align inner loops for speed
+copyinstr11:
+        li             r10,4                                                   // load word size
+        sub            r12,r10,r12                                             // set the header bytes count
+        mtctr  r12                                                             // set up bytes loop count
+copyinstr12:                                                                   // loop over bytes of source
+        lbz            r0,0(r3)                                                // get next byte of source
+        addi   r3,r3,1
+        addi   r4,r4,1                                                 // increment dest addr whether we store or not
+        cmpwi  r0,0                                                    // the 0?
+        bt--   kkNull,copyinstr13                              // don't store (was copyinstr with NULL ptr)
+        stb            r0,-1(r4)
+copyinstr13:
+        bdnzf  cr0_eq,copyinstr12                              // loop if byte not 0 and more room in buffer
+        sub            r5,r5,r12                                               // substract the bytes copied
+        bne            cr0_eq,copyinstr1                               // branch to word loop
+
+        mr             r5,r12                                                  // Get the header bytes count
+        li             r12,0                                                   // Clear the header bytes count
+        mfctr  r6                                                              // get #bytes left in buffer
+        crmove kkZero,cr0_eq                                   // remember if 0 found or buffer filled
+        b              copyinstr10
+