apple/xnu.git blobdiff: osfmk/ppc/bcopy.s (against xnu-344.23)
index 389fe4b2fa45f8bcbd8a12ee3cf28ac66b17b6f5..1a18bf37a9a1e848c1d4012f92afdf4b5602eb82 100644
--- a/osfmk/ppc/bcopy.s
+++ b/osfmk/ppc/bcopy.s
@@ -1,24 +1,21 @@
 /*
- * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  * 
- * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License").  You may not use this file except in compliance with the
+ * License.  Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
  * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this
- * file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
  * 
  * @APPLE_LICENSE_HEADER_END@
  */
 ;
 #include <ppc/asm.h>
 #include <ppc/proc_reg.h>
-#include <assym.s>
 
 ;              Use CR5_lt to indicate non-cached
 #define noncache 20
-
 ;              Use CR5_gt to indicate that we need to turn data translation back on
 #define fixxlate 21
-
-;              Use CR5_eq to indicate that we need to invalidate bats (if 32-bit) or turn off
-;              64-bit mode (if 64-bit) before returning to our caller.  We overload the
-;              bit to reduce the number of conditional branches at bcopy exit.
-#define restorex 22
-
-;              Use CR5_so to indicate that we need to restore real-mode cachability
-;              Only needed on 64-bit machines
-#define flipcache 23
+;              Use CR5_eq to indicate that we need to invalidate bats
+#define killbats 22
 
 ;
 ; bcopy_nc(from, to, nbytes)
@@ -65,24 +53,19 @@ LEXT(bcopy_nc)
 ;      
 ; void bcopy_physvir(from, to, nbytes)
 ; Attempt to copy physically addressed memory with translation on if conditions are met.
-; Otherwise do a normal bcopy_phys.  This routine is used because some 32-bit processors 
-; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
-; for the passed phys addrs and do the copy with translation on.  
+; Otherwise do a normal bcopy_phys.
 ;
 ; Rules are: neither source nor destination can cross a page. 
+; No accesses above the 2GB line (I/O or ROM).
 ;
-; Interrupts must be disabled throughout the copy when this is called.
+; Interrupts must be disabled throughout the copy when this is called
+
 ; To do this, we build a
; 128KB DBAT for both the source and sink.  If both are the same, only one is
 ; loaded.  We do not touch the IBATs, so there is no issue if either physical page
 ; address is the same as the virtual address of the instructions we are executing.
 ;
-; At the end, we invalidate the used DBATs.
-;
-; Note that the address parameters are long longs.  We will transform these to 64-bit
-; values.  Note that on 32-bit architectures that this will ignore the high half of the
-; passed in value.  This should be ok since we can not have any bigger than 32 bit addresses
-; there anyhow.
+; At the end, we invalidate the used DBATs and reenable interrupts.
 ;
 ; Note, this one will not work in user state
 ; 
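The page-crossing rule above can be checked without extracting page numbers: XOR the addresses of the first and last byte of an operand and test whether any bit above the page offset differs, which is what the xor/rlwinm. pair in the code below does. A minimal C sketch of that test (the helper name is illustrative; the 4KB PAGE_MASK is inferred from the rlwinm mask of bits 0-19):

    #include <stdint.h>

    #define PAGE_MASK 0xFFFFF000u   /* bits above a 4KB page offset */

    /* Nonzero if [addr, addr + nbytes) crosses a page boundary. */
    static int crosses_page(uint32_t addr, uint32_t nbytes)
    {
        uint32_t last = addr + nbytes - 1;        /* last byte touched */
        return ((addr ^ last) & PAGE_MASK) != 0;  /* page bits differ => crossed */
    }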
@@ -92,32 +75,22 @@ LEXT(bcopy_nc)
 
 LEXT(bcopy_physvir)
 
-                       crclr   flipcache                                       ; (HACK) No cache flip needed
-            mfsprg     r8,2                                            ; get processor feature flags
-            rlwinm     r3,r3,0,1,0                                     ; Duplicate high half of long long paddr into top of reg
-                       addic.  r0,r7,-1                                        ; Get length - 1
-                       rlwimi  r3,r4,0,0,31                            ; Combine bottom of long long to full 64-bits
+                       addic.  r0,r5,-1                                        ; Get length - 1
                        add             r11,r3,r0                                       ; Point to last byte of sink
-                       rlwinm  r4,r5,0,1,0                                     ; Duplicate high half of long long paddr into top of reg
-            mtcrf      0x02,r8                                         ; move pf64Bit to cr6 so we can test
-            rlwimi     r4,r6,0,0,31                            ; Combine bottom of long long to full 64-bits
-                       mr              r5,r7                                           ; Get the length into the right register
-                       cmplw   cr1,r3,r4                                       ; Does source == sink?  
-            bt++       pf64Bitb,bcopy_phys1            ; if 64-bit processor, use standard routine (no BATs)
+                       cmplw   cr1,r3,r4                                       ; Does source == sink?                  
                        add             r12,r4,r0                                       ; Point to last byte of source
                        bltlr-                                                          ; Bail if length is 0 or way too big
                        xor             r7,r11,r3                                       ; See if we went to next page
                        xor             r8,r12,r4                                       ; See if we went to next page
                        or              r0,r7,r8                                        ; Combine wrap
                        
-//                     li              r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2) ; Set default attributes
-                       li              r9,((2<<3)|2)                           ; Set default attributes
+                       li              r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2) ; Set default attributes
                        rlwinm. r0,r0,0,0,19                            ; Did we overflow a page?
                        li              r7,2                                            ; Set validity flags
                        li              r8,2                                            ; Set validity flags
-                       bne-    bcopy_phys1                                     ; Overflowed page, do normal physical copy...
+                       bne-    EXT(bcopy_phys)                         ; Overflowed page, do normal physical copy...
 
-                       crset   restorex                                        ; Remember to trash BATs on the way out
+                       crset   killbats                                        ; Remember to trash BATs on the way out
                        rlwimi  r11,r9,0,15,31                          ; Set sink lower DBAT value
                        rlwimi  r12,r9,0,15,31                          ; Set source lower DBAT value
                        rlwimi  r7,r11,0,0,14                           ; Set sink upper DBAT value
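Because translation stays on, each operand page gets an identity-mapped 128KB DBAT: the lower word carries the physical block number plus the WIMG/PP attributes built in r9, and the upper word carries the matching effective block address with BL=0 (128KB block) and the supervisor-valid bit. A hedged C sketch of the two words being assembled (field layout per the classic 32-bit PowerPC BAT registers; the struct and helper are illustrative):

    #include <stdint.h>

    #define BAT_BLOCK_MASK 0xFFFE0000u   /* top 15 bits: 128KB block number */

    struct dbat { uint32_t upper, lower; };

    /* Identity-map the 128KB block containing phys: cached, memory-coherent
       (WIMG=0b0010), supervisor read/write (PP=0b10), Vs=1.  Mirrors the
       li/rlwimi sequence above. */
    static struct dbat make_identity_dbat(uint32_t phys)
    {
        uint32_t attrs = (2u << 3) | 2u;              /* WIMG | PP, as in r9 */
        struct dbat b;
        b.lower = (phys & BAT_BLOCK_MASK) | attrs;    /* BRPN | WIMG | PP */
        b.upper = (phys & BAT_BLOCK_MASK) | 2u;       /* BEPI = BRPN, BL=0, Vs=1 */
        return b;
    }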
@@ -136,123 +109,41 @@ LEXT(bcopy_physvir)
 
 bcpvsame:      mr              r6,r3                                           ; Set source
                        crclr   noncache                                        ; Set cached
-                       crclr   fixxlate                                        ; Set translation already ok
                        
-                       b               copyit32                                        ; Go copy it...
+                       b               copyit                                          ; Go copy it...
+
 
 ;      
 ; void bcopy_phys(from, to, nbytes)
 ; Turns off data translation before the copy.  Note, this one will
-; not work in user state.  This routine is used on 32 and 64-bit
-; machines.
-;
-; Note that the address parameters are long longs.  We will transform these to 64-bit
-; values.  Note that on 32-bit architectures that this will ignore the high half of the
-; passed in value.  This should be ok since we can not have any bigger than 32 bit addresses
-; there anyhow.
-;
-; Also note that you probably will not be happy if either the sink or source spans across the
-; boundary between RAM and I/O space.  Good chance of hanging the machine and this code 
-; will not check, so be careful.
+; not work in user state
 ;
 
                        .align  5
                        .globl  EXT(bcopy_phys)
 
 LEXT(bcopy_phys)
-                       crclr   flipcache                                       ; (HACK) No cache flip needed
-            rlwinm     r3,r3,0,1,0                                     ; Duplicate high half of long long paddr into top of reg
-            mfsprg     r8,2                                            ; get processor feature flags
-                       rlwimi  r3,r4,0,0,31                            ; Combine bottom of long long to full 64-bits
-                       rlwinm  r4,r5,0,1,0                                     ; Duplicate high half of long long paddr into top of reg
-                       mtcrf   0x02,r8                                         ; move pf64Bit to cr6 so we can test
-                       rlwimi  r4,r6,0,0,31                            ; Combine bottom of long long to full 64-bits
-                       mr              r5,r7                                           ; Get the length into the right register
-            
-bcopy_phys1:                                                                   ; enter from bcopy_physvir with pf64Bit already in cr6
+
                        mfmsr   r9                                                      ; Get the MSR
+
                        crclr   noncache                                        ; Set cached
-            bt++       pf64Bitb,bcopy_phys64           ; skip if 64-bit (only they take hint)
-
-; 32-bit CPUs
-            
-            sub.       r0,r3,r4                                        ; to==from?
-                       rlwinm  r8,r9,0,MSR_DR_BIT,MSR_DR_BIT   ; was translation on?
-            cmpwi      cr1,r8,0                                        ; set cr1 beq if translation was off
-                       oris    r8,r8,hi16(MASK(MSR_VEC))       ; Get vector enable
+                       rlwinm. r8,r9,0,MSR_DR_BIT,MSR_DR_BIT   ; Is data translation on?
+
+                       cmplw   cr1,r4,r3                                       ; Compare "to" and "from"
                        cmplwi  cr7,r5,0                                        ; Check if we have a 0 length
-            beqlr-                                                             ; bail if to==from
-                       ori             r8,r8,lo16(MASK(MSR_FP))        ; Get FP
                        mr              r6,r3                                           ; Set source
-                       andc    r9,r9,r8                                        ; Turn off translation if it is on (should be) and FP, VEC
+                       beqlr-  cr1                                                     ; Bail if "to" and "from" are the same  
+                       xor             r9,r9,r8                                        ; Turn off translation if it is on (should be)
                        beqlr-  cr7                                                     ; Bail if length is 0
                        
-                       crclr   restorex                                        ; Make sure we do not trash BATs on the way out
+                       rlwinm  r9,r9,0,MSR_FP_BIT+1,MSR_FP_BIT-1       ; Force floating point off
+                       crclr   killbats                                        ; Make sure we do not trash BATs on the way out
+                       rlwinm  r9,r9,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1     ; Force vectors off
                        mtmsr   r9                                                      ; Set DR translation off
                        isync                                                           ; Wait for it
                        
-                       crnot   fixxlate,cr1_eq                         ; Remember to turn on translation if it was
-                       b               copyit32                                        ; Go copy it...
-            
-; 64-bit: turn DR off and SF on, remember if we need to restore on way out.
-
-bcopy_phys64:                                                                  ; r9 = MSR
-
-                       srdi    r2,r3,31                                        ; (HACK) Get a 1 if source is in I/O memory
-            srdi.      r0,r9,63-MSR_SF_BIT                     ; set cr0 beq on if SF was off when we were called
-            rlwinm     r8,r9,MSR_DR_BIT+1,31,31        ; r8 <- DR bit right justified
-            cmpld      cr1,r3,r4                                       ; to==from?
-            li         r0,1                                            ; Note - we use this in a couple places below
-                       lis             r6,hi16(MASK(MSR_VEC))          ; Get vector enable
-            cmpwi      cr7,r5,0                                        ; length==0 ?
-            ori                r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR))   ; Add in FP and DR
-            beqlr--    cr1                                                     ; bail if to==from
-                       srdi    r10,r4,31                                       ; (HACK) Get a 1 if sink is in I/O memory
-            rldimi     r9,r0,63,MSR_SF_BIT                     ; set SF on
-            beqlr--    cr7                                                     ; bail if length==0
-            andc       r9,r9,r6                                        ; turn DR, VEC, FP off
-            cmpwi      cr1,r8,0                                        ; was DR on?
-            crmove     restorex,cr0_eq                         ; if SF was off, remember to turn back off before we return
-            mtmsrd     r9                                                      ; turn 64-bit addressing on, data translation off
-                       cmpldi  cr0,r2,1                                        ; (HACK) Is source in I/O memory?
-            isync                                                              ; wait for it to happen
-                       mr              r6,r3                                           ; Set source
-                       cmpldi  cr7,r10,1                                       ; (HACK) Is sink in I/O memory?
-            crnot      fixxlate,cr1_eq                         ; if DR was on, remember to turn back on before we return
-
-                       cror    flipcache,cr0_eq,cr7_eq         ; (HACK) See if either source or sink is in I/O area
-
-                       rlwinm  r10,r9,MSR_EE_BIT+1,31,31       ; (HACK GLORIOUS HACK) Isolate the EE bit
-                       sldi    r11,r0,31-MSR_EE_BIT            ; (HACK GLORIOUS HACK)) Get a mask for the EE bit
-                       sldi    r0,r0,32+8                                      ; (HACK) Get the right bit to turn off caching
-                       bf++    flipcache,copyit64                      ; (HACK) No need to mess with caching...
-                       
-;
-;                      HACK GLORIOUS HACK - when we force of caching, we need to also force off
-;                      interruptions.  We are out of CR bits, so we need to stash the entry EE
-;                      somewhere.  It is in the XER....  We NEED to change this!!!!
-;
-
-                       mtxer   r10                                                     ; (HACK GLORIOUS HACK) Remember EE
-                       andc    r9,r9,r11                                       ; (HACK GLORIOUS HACK) Turn off EE bit
-                       mfspr   r2,hid4                                         ; (HACK) Get HID4
-                       crset   noncache                                        ; (HACK) Set non-cached
-                       mtmsrd  r9                                                      ; (HACK GLORIOUS HACK) Force off EE
-                       or              r2,r2,r0                                        ; (HACK) Set bit to make real accesses cache-inhibited
-                       sync                                                            ; (HACK) Sync up
-                       li              r0,1
-                       mtspr   hid4,r2                                         ; (HACK) Make real accesses cache-inhibited
-                       isync                                                           ; (HACK) Toss prefetches
-
-                       lis             r12,0xE000                                      ; (HACK) Get the unlikeliest ESID possible
-                       srdi    r12,r12,1                                       ; (HACK) Make 0x7FFFFFFFF0000000
-                       slbie   r12                                                     ; (HACK) Make sure the ERAT is cleared 
-                       
-                       sync                                                            ; (HACK)
-                       isync                                                           ; (HACK)
-                       
-            b          copyit64
-            
+                       crnot   fixxlate,cr0_eq                         ; Remember to turn on translation if it was
+                       b               copyit                                          ; Go copy it...
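This is the usual recipe for dropping into real mode: capture the MSR, clear DR along with FP and VEC (so lazy floating-point or vector state cannot be touched while translation is off), and remember via fixxlate whether DR must be switched back on at bcpydone. A rough C rendering (bit masks per the 32-bit PowerPC MSR; the mfmsr_/mtmsr_sync wrappers are assumed helpers around mfmsr and mtmsr+isync):

    #include <stdint.h>

    #define MSR_DR  0x00000010u   /* data address translation */
    #define MSR_FP  0x00002000u   /* floating point available */
    #define MSR_VEC 0x02000000u   /* AltiVec available */

    extern uint32_t mfmsr_(void);
    extern void     mtmsr_sync(uint32_t msr);

    static void with_translation_off(void (*copy)(void))
    {
        uint32_t msr = mfmsr_();
        int dr_was_on = (msr & MSR_DR) != 0;             /* the rlwinm./crnot pair */
        mtmsr_sync(msr & ~(MSR_DR | MSR_FP | MSR_VEC));  /* DR, FP, VEC off */
        copy();
        if (dr_was_on)                                   /* fixxlate on exit */
            mtmsr_sync((msr | MSR_DR) & ~(MSR_FP | MSR_VEC));
    }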
 
 ;      
 ; void bcopy(from, to, nbytes)
@@ -265,19 +156,14 @@ LEXT(bcopy)
 
                        crclr   noncache                                        ; Set cached
 
-bcpswap:       
-                       crclr   flipcache                                       ; (HACK) No cache flip needed
-            mfsprg     r8,2                                            ; get processor feature flags
-            sub.       r0,r4,r3                                        ; test for to==from in mode-independent way
-            mtcrf      0x02,r8                                         ; move pf64Bit to cr6 so we can test
-                       cmpwi   cr1,r5,0                                        ; Check if we have a 0 length
-                       crclr   restorex                                        ; Make sure we do not trash BATs on the way out
+bcpswap:       cmplw   cr1,r4,r3                                       ; Compare "to" and "from"
+                       mr.             r5,r5                                           ; Check if we have a 0 length
                        mr              r6,r3                                           ; Set source
+                       crclr   killbats                                        ; Make sure we do not trash BATs on the way out
+                       beqlr-  cr1                                                     ; Bail if "to" and "from" are the same  
+                       beqlr-                                                          ; Bail if length is 0
                        crclr   fixxlate                                        ; Set translation already ok
-                       beqlr-                                                          ; Bail if "to" and "from" are the same  
-                       beqlr-  cr1                                                     ; Bail if length is 0
-            bt++       pf64Bitb,copyit64                       ; handle 64-bit processor
-                       b               copyit32                                        ; Go copy it...
+                       b               copyit                                          ; Go copy it...
 
 ;
 ;                      When we move the memory, forward overlays must be handled.  We
@@ -285,32 +171,19 @@ bcpswap:
 ;                      We need to preserve R3 because it needs to be returned for memcpy.
 ;                      We can be interrupted and lose control here.
 ;
-;                      There is no stack, so in order to use vectors, we would
-;                      need to take the vector exception. Any potential gains by using vectors 
+;                      There is no stack, so in order to use floating point, we would
+;                      need to take the FP exception. Any potential gains by using FP 
 ;                      would be more than eaten up by this.
 ;
-;                      NOTE: this code is called in three "modes":
-;                              - on 32-bit processors (32-byte cache line)
-;                              - on 64-bit processors running in 32-bit mode (128-byte cache line)
-;                              - on 64-bit processors running in 64-bit mode (128-byte cache line)
-;
-;                      ALSO NOTE: bcopy is called from copyin and copyout etc
-;                      with the "thread_recover" ptr set.  This means bcopy must not set up a
-;                      stack frame or touch non-volatile registers, and also means that it
-;                      cannot rely on turning off interrupts, because we expect to get DSIs
-;                      and have execution aborted by a "longjmp" to the thread_recover
-;                      routine.
+;                      Later, we should use Altivec for large moves.
 ;
        
                        .align  5
                        .globl  EXT(memcpy)
-            ; NB: memcpy is only called in 32-bit mode, albeit on both 32- and 64-bit
-            ; processors...
+
 LEXT(memcpy)
-                       crclr   flipcache                                       ; (HACK) No cache flip needed
-            mfsprg     r8,2                                            ; get processor feature flags
+
                        cmplw   cr1,r3,r4                                       ; "to" and "from" the same?
-            mtcrf      0x02,r8                                         ; move pf64Bit to cr6 so we can test
                        mr              r6,r4                                           ; Set the "from"
                        mr.             r5,r5                                           ; Length zero?
                        crclr   noncache                                        ; Set cached
@@ -318,10 +191,9 @@ LEXT(memcpy)
                        crclr   fixxlate                                        ; Set translation already ok
                        beqlr-  cr1                                                     ; "to" and "from" are the same
                        beqlr-                                                          ; Length is 0
-                       crclr   restorex                                        ; Make sure we do not trash BATs on the way out
-            bt++       pf64Bitb,copyit64                       ; handle 64-bit processors
+                       crclr   killbats                                        ; Make sure we do not trash BATs on the way out
                        
-copyit32:      sub             r12,r4,r6                                       ; Get potential overlap (negative if backward move)
+copyit:                sub             r12,r4,r6                                       ; Get potential overlap (negative if backward move)
                        lis             r8,0x7FFF                                       ; Start up a mask
                        srawi   r11,r12,31                                      ; Propagate the sign bit
                        dcbt    br0,r6                                          ; Touch in the first source line
@@ -334,7 +206,7 @@ copyit32:   sub             r12,r4,r6                                       ; Get potential overlap (negative if backward move)
                        cmplwi  cr7,r9,32                                       ; See if at least a line between  source and sink
                        dcbtst  br0,r4                                          ; Touch in the first sink line
                        cmplwi  cr1,r5,32                                       ; Are we moving more than a line?
-                       cror    noncache,noncache,cr7_lt        ; Set to not DCBZ output line if not enough space
+                       cror    noncache,noncache,28            ; Set to not DCBZ output line if not enough space
                        blt-    fwdovrlap                                       ; This is a forward overlapping area, handle it...
 
 ;
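The sink-minus-source value computed at copyit drives the blt- fwdovrlap dispatch above: the reverse path is taken only when the destination starts inside the source. In C, the classic equivalent folds overlap and direction into a single unsigned compare, as in this minimal sketch (the two helpers stand in for the forward loop below and for fwdovrlap):

    #include <stddef.h>
    #include <stdint.h>

    void copy_forward(char *dst, const char *src, size_t n);  /* assumed loops */
    void copy_reverse(char *dst, const char *src, size_t n);

    void copyit_sketch(char *dst, const char *src, size_t n)
    {
        /* dst below src wraps to a huge unsigned value, so only a destination
           that starts inside the source selects the reverse loop. */
        if ((uintptr_t)(dst - src) < (uintptr_t)n)
            copy_reverse(dst, src, n);
        else
            copy_forward(dst, src, n);
    }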
@@ -350,7 +222,6 @@ copyit32:   sub             r12,r4,r6                                       ; Get potential overlap (negative if backward move)
 ;                      We can not do this if noncache is set because we will take an 
 ;                      alignment exception.
 
-G4word:                                                                                        ; enter from 64-bit case with word aligned uncached operands
                        neg             r0,r4                                           ; Get the number of bytes to move to align to a line boundary
                        rlwinm. r0,r0,0,27,31                           ; Clean it up and test it
                        and             r0,r0,r8                                        ; limit to the maximum front end move
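The front-end count above is the negation trick: the number of bytes needed to reach the next 32-byte line boundary is (-dest) & 31, clamped so a short copy never moves more than its whole length (the code clamps with the mask built in r8 rather than a min, but the effect in the common case is the same). A small C sketch, with the 32-byte line size assumed from the cache geometry this path targets:

    #include <stddef.h>
    #include <stdint.h>

    /* Bytes to move singly before dst is 32-byte aligned, never more than len. */
    static size_t front_end_bytes(uintptr_t dst, size_t len)
    {
        size_t to_align = (size_t)((0 - dst) & 31);  /* neg + rlwinm. ...,27,31 */
        return to_align < len ? to_align : len;
    }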
@@ -487,45 +358,17 @@ nohalf:           bf              31,bcpydone                                     ; Leave cuz we are all done...
                        lbz             r7,0(r6)                                        ; Get the byte
                        stb             r7,0(r4)                                        ; Save the single
 
-bcpydone:      
+bcpydone:      bt-             killbats,bcclrbat                       ; Jump if we need to clear bats...
+                       bflr    fixxlate                                        ; Leave now if we do not need to fix translation...
                        mfmsr   r9                                                      ; Get the MSR
-                       bf++    flipcache,bcpydone0                     ; (HACK) No need to mess with caching...
-
-                       li              r0,1                                            ; (HACK) Get a 1
-                       mfxer   r10                                                     ; (HACK GLORIOUS HACK) Get the entry EE
-                       sldi    r0,r0,32+8                                      ; (HACK) Get the right bit to turn off caching
-                       mfspr   r2,hid4                                         ; (HACK) Get HID4
-                       rlwinm  r10,r10,31-MSR_EE_BIT,MSR_EE_BIT,MSR_EE_BIT     ; (HACK GLORIOUS HACK) Set the EE bit
-                       andc    r2,r2,r0                                        ; (HACK) Clear bit to make real accesses cache-inhibited
-                       or              r9,r9,r10                                       ; (HACK GLORIOUS HACK) Set the EE in MSR
-                       sync                                                            ; (HACK) Sync up
-                       mtspr   hid4,r2                                         ; (HACK) Make real accesses not cache-inhibited
-                       isync                                                           ; (HACK) Toss prefetches
-       
-                       lis             r12,0xE000                                      ; (HACK) Get the unlikeliest ESID possible
-                       srdi    r12,r12,1                                       ; (HACK) Make 0x7FFFFFFFF0000000
-                       slbie   r12                                                     ; (HACK) Make sure the ERAT is cleared 
-
-                       mtmsr   r9                                                      ; (HACK GLORIOUS HACK) Set EE properly
-
-bcpydone0:
-                       lis             r0,hi16(MASK(MSR_VEC))          ; Get the vector bit
-                       ori             r0,r0,lo16(MASK(MSR_FP))        ; Get the float bit
-                       bf++    fixxlate,bcpydone1                      ; skip if we do not need to fix translation...
                        ori             r9,r9,lo16(MASK(MSR_DR))        ; Turn data translation on
-                       andc    r9,r9,r0                                        ; Make sure that FP and VEC are off
+                       rlwinm  r9,r9,0,MSR_FP_BIT+1,MSR_FP_BIT-1       ; Force floating point off
+                       rlwinm  r9,r9,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1     ; Force vectors off
                        mtmsr   r9                                                      ; Just do it
                        isync                                                           ; Hang in there
-            
-bcpydone1:
-            bflr++     restorex                                        ; done if we do not have to fix up addressing
-            mfsprg     r8,2                                            ; get the feature flags again
-            mtcrf      0x02,r8                                         ; put pf64Bit where we can test it
-            bt++       pf64Bitb,bcpydone2                      ; skip if 64-bit processor
-            
-            ; 32-bit processor, so clear out the BATs we set up for bcopy_physvir
-            
-            li         r0,0                                            ; Get set to invalidate upper half
+                       blr                                                                     ; Leave cuz we are all done...                  
+
+bcclrbat:      li              r0,0                                            ; Get set to invalidate upper half
                        sync                                                            ; Make sure all is well
                        mtdbatu 0,r0                                            ; Clear sink upper DBAT
                        mtdbatu 1,r0                                            ; Clear source upper DBAT
@@ -533,16 +376,6 @@ bcpydone1:
                        isync                   
                        blr
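Clearing just the upper halves is enough to kill the mappings, since that is where the Vs/Vp valid bits live; the sync/isync bracket makes sure no access is still flowing through the BATs when they vanish and nothing fetched under them survives. A hedged C shape of the same sequence (the mtdbatu_/sync_/isync_ names are assumed wrappers around the corresponding instructions):

    #include <stdint.h>

    extern void mtdbatu_(int n, uint32_t val);   /* assumed mtspr wrappers */
    extern void sync_(void);
    extern void isync_(void);

    static void clear_copy_dbats(void)
    {
        sync_();          /* let in-flight accesses through the BATs drain */
        mtdbatu_(0, 0);   /* Vs=Vp=0: sink mapping invalid */
        mtdbatu_(1, 0);   /* source mapping invalid */
        sync_();
        isync_();         /* discard anything fetched under the old mappings */
    }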
 
-            ; 64-bit processor, so turn off 64-bit mode we turned on to do bcopy_phys
-            
-bcpydone2:
-            mfmsr      r9                                                      ; get MSR again
-                       andc    r9,r9,r0                                        ; Make sure that FP and VEC are off
-            rldicl     r9,r9,0,MSR_SF_BIT+1            ; clear SF
-            mtmsrd     r9
-            isync
-            blr
-
 
 ;
 ;                      0123456789ABCDEF0123456789ABCDEF
@@ -563,8 +396,7 @@ bcpydone2:
 ;                      and on in order.  That means that when we are at the second to last DW we
 ;                      have to wait until the whole line is in cache before we can proceed.
 ;
-
-G4reverseWord:                                                                 ; here from 64-bit code with word aligned uncached operands
+       
 fwdovrlap:     add             r4,r5,r4                                        ; Point past the last sink byte
                        add             r6,r5,r6                                        ; Point past the last source byte 
                        and             r0,r4,r8                                        ; Apply movement limit
@@ -711,306 +543,3 @@ bnohalf:  bflr    31                                                      ; Leave cuz we are all done...
                        stb             r7,-1(r4)                                       ; Save the single
                        
                        b               bcpydone                                        ; Go exit cuz we are all done...
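For the forward-overlap case the code points both registers one past the last byte and walks down, so the tail of the source is consumed before the destination tramples it. Reduced to byte granularity (the real loop also has line, word, and halfword stages), the shape is:

    #include <stddef.h>

    void copy_reverse(char *dst, const char *src, size_t n)
    {
        dst += n;               /* one past the last byte, like add r4,r5,r4 */
        src += n;
        while (n--)
            *--dst = *--src;    /* pre-decrement, like lbzu/stbu with -1 */
    }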
-
-
-// Here on 64-bit processors, which have a 128-byte cache line.  This can be
-// called either in 32 or 64-bit mode, which makes the test for reverse moves
-// a little tricky.  We've already filtered out the (sou==dest) and (len==0)
-// special cases.
-//
-// When entered:
-//             r4 = destination (32 or 64-bit ptr)
-//             r5 = length (always 32 bits)
-//             r6 = source (32 or 64-bit ptr)
-//             cr5 = noncache, fixxlate, flipcache, and restorex flags set
-
-        .align 5
-copyit64:
-        lis            r2,0x4000                       // r2 = 0x00000000 40000000
-        neg            r12,r4                          // start to compute #bytes to align dest
-               bt--    noncache,noncache1      // (HACK) Do not even try anything cached...
-        dcbt   0,r6                            // touch in 1st block of source
-noncache1:     
-        add.   r2,r2,r2                        // if 0x00000000 80000000 < 0, we are in 32-bit mode
-        cntlzw r9,r5                           // get highest power-of-2 in length
-        rlwinm r7,r12,0,25,31          // r7 <- bytes to 128-byte align dest
-               bt--    noncache,noncache2      // (HACK) Do not even try anything cached...
-        dcbtst 0,r4                            // touch in 1st destination cache block
-noncache2:
-        sraw   r2,r2,r9                        // get mask with 1s for leading 0s in length, plus 1 more 1-bit
-        bge            copyit64a                       // skip if we are running in 64-bit mode
-        rlwinm r4,r4,0,0,31            // running in 32-bit mode, so truncate ptrs and lengths to 32 bits
-        rlwinm r5,r5,0,0,31
-        rlwinm r6,r6,0,0,31
-copyit64a:                                                     // now we can use 64-bit compares even if running in 32-bit mode
-        sub            r8,r4,r6                        // get (dest-source)
-        andc   r7,r7,r2                        // limit bytes to align by operand length
-        cmpld  cr1,r8,r5                       // if (dest-source)<length, must move reverse
-        bt--   noncache,c64uncached    // skip if uncached
-        blt--  cr1,c64rdouble          // handle cached reverse moves        
-        
-        
-// Forward, cached or doubleword aligned uncached.  This is the common case.
-//   r4-r6 = dest, length, source (as above)
-//             r7 = #bytes 128-byte align dest (limited by copy length)
-//     cr5 = flags, as above
-
-c64double:
-        andi.  r8,r7,7                         // r8 <- #bytes to doubleword align
-        srwi   r9,r7,3                         // r9 <- #doublewords to 128-byte align
-        sub            r5,r5,r7                        // adjust length remaining
-        cmpwi  cr1,r9,0                        // any doublewords to move to cache align?
-        srwi   r10,r5,7                        // r10 <- 128-byte chunks to xfer after aligning dest
-        cmpwi  cr7,r10,0                       // set cr7 on chunk count
-        beq            c64double2                      // dest already doubleword aligned
-        mtctr  r8
-        b              c64double1
-        
-        .align 5                                       // align inner loops
-c64double1:                                                    // copy bytes until dest is doubleword aligned
-        lbz            r0,0(r6)
-        addi   r6,r6,1
-        stb            r0,0(r4)
-        addi   r4,r4,1
-        bdnz   c64double1
-
-c64double2:                                                    // r9/cr1=doublewords, r10=128-byte chunks, cr7=blt if r5==0
-        beq            cr1,c64double4          // no doublewords to xfer in order to cache align
-        mtctr  r9
-        b              c64double3
-
-        .align 5                                       // align inner loops
-c64double3:                                                    // copy doublewords until dest is 128-byte aligned
-        ld             r7,0(r6)
-        addi   r6,r6,8
-        std            r7,0(r4)
-        addi   r4,r4,8
-        bdnz   c64double3
-        
-// Here to xfer 128-byte chunks, if any.  Because the IBM 970 cannot issue two stores/cycle,
-// we pipeline the inner loop so we can pair loads and stores.  Since we only have 8 GPRs for
-// data (64 bytes), we load/store each twice per 128-byte chunk.
-
-c64double4:                                                    // r10/cr7=128-byte chunks
-        rlwinm r0,r5,29,28,31          // r0 <- count of leftover doublewords, after moving chunks
-        cmpwi  cr1,r0,0                        // set cr1 on leftover doublewords
-        beq            cr7,c64double7          // no 128-byte chunks
-        sub            r8,r6,r4                        // r8 <- (source - dest)
-        li             r9,128                          // start at next cache line (we've already touched in 1st line)
-        cmpldi cr7,r8,128                      // if (source-dest)<128, cannot use dcbz128 because of overlap
-        cror   noncache,cr7_lt,noncache        // turn on "noncache" flag if (source-dest)<128
-               bt--    noncache,noncache3      // (HACK) Skip cache touch if noncachable
-        dcbt128        r9,r6,1                         // start forward stream
-noncache3:
-        mtctr  r10
-        
-        ld             r0,0(r6)                        // start pipe: load 1st half-line
-        ld             r2,8(r6)
-        ld             r7,16(r6)
-        ld             r8,24(r6)
-        ld             r9,32(r6)
-        ld             r10,40(r6)
-        ld             r11,48(r6)
-        ld             r12,56(r6)
-               b               c64InnerLoopEntryPt
-        
-        .align 5                                       // align inner loop
-c64InnerLoop:                                          // loop copying 128-byte cache lines to 128-aligned destination
-        std            r0,64(r4)                       // store 2nd half of chunk n
-        ld             r0,0(r6)                        // load 1st half of chunk n+1
-        std            r2,72(r4)
-        ld             r2,8(r6)
-        std            r7,80(r4)
-        ld             r7,16(r6)
-        std            r8,88(r4)
-        ld             r8,24(r6)
-        std            r9,96(r4)
-        ld             r9,32(r6)
-        std            r10,104(r4)
-        ld             r10,40(r6)
-        std            r11,112(r4)
-        ld             r11,48(r6)
-        std            r12,120(r4)
-        ld             r12,56(r6)
-        addi   r4,r4,128                       // advance to next dest chunk
-c64InnerLoopEntryPt:                           // initial entry into loop, with 1st halfline loaded        
-        bt             noncache,c64InnerLoop1  // skip if uncached or overlap
-        dcbz128        0,r4                            // avoid prefetch of next cache line
-c64InnerLoop1:
-        std            r0,0(r4)                        // store 1st half of chunk n
-        ld             r0,64(r6)                       // load 2nd half of chunk n
-        std            r2,8(r4)
-        ld             r2,72(r6)
-        std            r7,16(r4)
-        ld             r7,80(r6)
-        std            r8,24(r4)
-        ld             r8,88(r6)
-        std            r9,32(r4)
-        ld             r9,96(r6)
-        std            r10,40(r4)
-        ld             r10,104(r6)
-        std            r11,48(r4)
-        ld             r11,112(r6)
-        std            r12,56(r4)
-        ld             r12,120(r6)
-        addi   r6,r6,128                       // advance to next source chunk if any
-        bdnz   c64InnerLoop            // loop if more chunks
-        
-        std            r0,64(r4)                       // store 2nd half of last chunk
-        std            r2,72(r4)
-        std            r7,80(r4)
-        std            r8,88(r4)
-        std            r9,96(r4)
-        std            r10,104(r4)
-        std            r11,112(r4)
-        std            r12,120(r4)
-        addi   r4,r4,128                       // advance to next dest chunk
-
-c64double7:                        // r5 <- leftover bytes, cr1 set on doubleword count
-        rlwinm r0,r5,29,28,31          // r0 <- count of leftover doublewords (0-15)
-        andi.  r5,r5,7                         // r5/cr0 <- count of leftover bytes (0-7)
-        beq            cr1,c64byte                     // no leftover doublewords
-        mtctr  r0
-        b              c64double8
-        
-        .align 5                                       // align inner loop
-c64double8:                                                    // loop copying leftover doublewords
-        ld             r0,0(r6)
-        addi   r6,r6,8
-        std            r0,0(r4)
-        addi   r4,r4,8
-        bdnz   c64double8
-
-
-// Forward byte loop.
-
-c64byte:                                                       // r5/cr0 <- byte count (can be big if unaligned uncached)
-               beq             bcpydone                        // done if no leftover bytes
-        mtctr  r5
-        b              c64byte1
-        
-        .align 5                                       // align inner loop
-c64byte1:
-        lbz            r0,0(r6)
-        addi   r6,r6,1
-        stb            r0,0(r4)
-        addi   r4,r4,1
-        bdnz   c64byte1
-
-        b              bcpydone
-
-
-// Uncached copies.  We must avoid unaligned accesses, since they always take alignment
-// exceptions on uncached memory on 64-bit processors.  This may mean we copy long operands
-// a byte at a time, but that is still much faster than alignment exceptions.
-//   r4-r6 = dest, length, source (as above)
-//             r2 = mask of 1s for leading 0s in length, plus 1 extra 1
-//             r7 = #bytes to copy to 128-byte align dest (limited by operand length)
-//        cr1 = blt if reverse move required
-
-c64uncached:
-        xor            r0,r6,r4                        // get relative alignment
-        rlwinm r10,r0,0,29,31          // relatively doubleword aligned?
-        rlwinm r11,r0,0,30,31          // relatively word aligned?
-        not            r8,r2                           // get mask to limit initial length of copy for G4word
-        blt            cr1,c64reverseUncached
-        
-        cmpwi  cr0,r10,0                       // set cr0 beq if doubleword aligned
-        cmpwi  cr1,r11,0                       // set cr1 beq if word aligned
-        beq            cr0,c64double           // doubleword aligned
-        beq            cr1,G4word                      // word aligned, use G3/G4 code
-        cmpwi  r5,0                            // set cr0 on byte count
-        b              c64byte                         // unaligned operands
-
-c64reverseUncached:
-        cmpwi  cr0,r10,0                       // set cr0 beq if doubleword aligned
-        cmpwi  cr1,r11,0                       // set cr1 beq if word aligned
-        beq            cr0,c64rdouble          // doubleword aligned so can use LD/STD
-        beq            cr1,G4reverseWord       // word aligned, use G3/G4 code
-        add            r6,r6,r5                        // point to (end+1) of source and dest
-        add            r4,r4,r5
-        cmpwi  r5,0                            // set cr0 on length
-        b              c64rbyte                        // copy a byte at a time
-        
-        
-
-// Reverse doubleword copies.  This is used for all cached copies, and doubleword
-// aligned uncached copies.
-//             r4 = destination (32 or 64-bit ptr)
-//             r5 = length (always 32 bits)
-//             r6 = source (32 or 64-bit ptr)
-//             cr5 = noncache, fixxlate, and restorex flags set
-
-c64rdouble:
-        add            r6,r6,r5                        // point to (end+1) of source and dest
-        add            r4,r4,r5
-        rlwinm.        r7,r4,0,29,31           // r7 <- #bytes to doubleword align dest
-        cmplw  cr1,r7,r5                       // operand long enough to doubleword align?
-        blt            cr1,c64rd0                      // yes
-        mr             r7,r5                           // no
-c64rd0:
-        sub            r5,r5,r7                        // adjust length
-        srwi   r8,r5,6                         // r8 <- 64-byte chunks to xfer
-        cmpwi  cr1,r8,0                        // any chunks?
-        beq            c64rd2                          // source already doubleword aligned
-        mtctr  r7
-
-c64rd1:                                                               // copy bytes until source doubleword aligned
-        lbzu   r0,-1(r6)
-        stbu   r0,-1(r4)
-        bdnz   c64rd1
-        
-c64rd2:                                                                // r8/cr1 <- count of 64-byte chunks
-        rlwinm r0,r5,29,29,31          // r0 <- count of leftover doublewords
-        andi.  r5,r5,7                         // r5/cr0 <- count of leftover bytes
-        cmpwi  cr7,r0,0                        // leftover doublewords?
-        beq            cr1,c64rd4                      // no chunks to xfer
-        li             r9,-128                         // start at next cache line
-        mtctr  r8
-        bt             noncache,c64rd3         // (HACK) Do not start a stream if noncachable...
-        dcbt128        r9,r6,3                         // start reverse stream
-        b              c64rd3
-        
-        .align 5                                       // align inner loop
-c64rd3:                                                                // loop copying 64-byte chunks
-        ld             r7,-8(r6)
-        ld             r8,-16(r6)
-        ld             r9,-24(r6)
-        ld             r10,-32(r6)
-        ld             r11,-40(r6)
-        ld             r12,-48(r6)
-        std            r7,-8(r4)
-        std            r8,-16(r4)
-        ld             r7,-56(r6)
-        ldu            r8,-64(r6)
-        std            r9,-24(r4)
-        std            r10,-32(r4)
-        std            r11,-40(r4)
-        std            r12,-48(r4)
-        std            r7,-56(r4)
-        stdu   r8,-64(r4)
-        bdnz   c64rd3
-
-c64rd4:                                                                // r0/cr7 = leftover doublewords  r5/cr0 = leftover bytes
-        beq            cr7,c64rbyte            // no leftover doublewords
-        mtctr  r0
-        
-c64rd5:                                                                // loop copying leftover doublewords
-        ldu            r0,-8(r6)
-        stdu   r0,-8(r4)
-        bdnz   c64rd5
-
-
-// Reverse byte loop.
-
-c64rbyte:                                                      // r5/cr0 <- byte count (can be big if unaligned uncached)
-        beq            bcpydone                        // done if no leftover bytes
-        mtctr  r5
-        
-c64rbyte1:
-        lbzu   r0,-1(r6)
-        stbu   r0,-1(r4)
-        bdnz   c64rbyte1
-
-        b              bcpydone
-
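The deleted 970 path is a textbook software-pipelined copy: the loads for chunk n+1 are interleaved with the stores for chunk n, so the single store port never stalls waiting on a fresh cache line. A hedged C restatement of that schedule at 64-byte granularity (here the compiler, not hand scheduling, picks the final instruction order):

    #include <stdint.h>
    #include <stddef.h>

    /* Copy nchunks * 64 bytes, loading chunk n+1 while storing chunk n,
       in the spirit of the removed c64InnerLoop. */
    static void copy_chunks64(uint64_t *dst, const uint64_t *src, size_t nchunks)
    {
        uint64_t a0, a1, a2, a3, a4, a5, a6, a7;
        if (nchunks == 0)
            return;
        a0 = src[0]; a1 = src[1]; a2 = src[2]; a3 = src[3];   /* prime the pipe */
        a4 = src[4]; a5 = src[5]; a6 = src[6]; a7 = src[7];
        while (--nchunks) {
            src += 8;                                     /* load chunk n+1 ... */
            uint64_t b0 = src[0], b1 = src[1], b2 = src[2], b3 = src[3];
            uint64_t b4 = src[4], b5 = src[5], b6 = src[6], b7 = src[7];
            dst[0] = a0; dst[1] = a1; dst[2] = a2; dst[3] = a3;  /* ... store n */
            dst[4] = a4; dst[5] = a5; dst[6] = a6; dst[7] = a7;
            dst += 8;
            a0 = b0; a1 = b1; a2 = b2; a3 = b3;
            a4 = b4; a5 = b5; a6 = b6; a7 = b7;
        }
        dst[0] = a0; dst[1] = a1; dst[2] = a2; dst[3] = a3;   /* drain the pipe */
        dst[4] = a4; dst[5] = a5; dst[6] = a6; dst[7] = a7;
    }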