X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/cf03f5cdc65293b4cb5eba3ed23fed26dad903c9..de355530ae67247cbd0da700edb3a2a1dae884c2:/osfmk/ppc/bcopy.s

diff --git a/osfmk/ppc/bcopy.s b/osfmk/ppc/bcopy.s
index 389fe4b2f..1a18bf37a 100644
--- a/osfmk/ppc/bcopy.s
+++ b/osfmk/ppc/bcopy.s
@@ -1,24 +1,21 @@
 /*
- * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  * 
- * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License").  You may not use this file except in compliance with the
+ * License.  Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
  * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this
- * file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
  * 
  * @APPLE_LICENSE_HEADER_END@
  */
@@ -30,22 +27,13 @@
 ;
 #include <ppc/asm.h>
 #include <ppc/proc_reg.h>
-#include <assym.s>
 
 ;		Use CR5_lt to indicate non-cached
 #define noncache 20
-
 ;		Use CR5_gt to indicate that we need to turn data translation back on
 #define fixxlate 21
-
-;		Use CR5_eq to indicate that we need to invalidate bats (if 32-bit) or turn off
-;		64-bit mode (if 64-bit) before returning to our caller.  We overload the
-;		bit to reduce the number of conditional branches at bcopy exit.
-#define restorex 22
-
-;		Use CR5_so to indicate that we need to restore real-mode cachability
-;		Only needed on 64-bit machines
-#define flipcache 23
+;		Use CR5_eq to indicate that we need to invalidate bats
+#define killbats 22
 
 ;
 ; bcopy_nc(from, to, nbytes)
@@ -65,24 +53,19 @@ LEXT(bcopy_nc)
 ;	
 ; void bcopy_physvir(from, to, nbytes)
 ; Attempt to copy physically addressed memory with translation on if conditions are met.
-; Otherwise do a normal bcopy_phys.  This routine is used because some 32-bit processors 
-; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
-; for the passed phys addrs and do the copy with translation on.  
+; Otherwise do a normal bcopy_phys.
 ;
 ; Rules are: neither source nor destination can cross a page. 
+; No accesses above the 2GB line (I/O or ROM).
 ;
-; Interrupts must be disabled throughout the copy when this is called.
+; Interrupts must be disabled throughout the copy when this is called.
+
 ; To do this, we build a
 ; 128 DBAT for both the source and sink.  If both are the same, only one is
 ; loaded.  We do not touch the IBATs, so there is no issue if either physical page
 ; address is the same as the virtual address of the instructions we are executing.
 ;
-; At the end, we invalidate the used DBATs.
-;
-; Note that the address parameters are long longs.  We will transform these to 64-bit
-; values.  Note that on 32-bit architectures that this will ignore the high half of the
-; passed in value.  This should be ok since we can not have any bigger than 32 bit addresses
-; there anyhow.
+; At the end, we invalidate the used DBATs and reenable interrupts.
 ;
 ; Note, this one will not work in user state
 ; 
@@ -92,32 +75,22 @@ LEXT(bcopy_nc)
 
 LEXT(bcopy_physvir)
 
-			crclr	flipcache					; (HACK) No cache flip needed
-            mfsprg	r8,2						; get processor feature flags
-            rlwinm	r3,r3,0,1,0					; Duplicate high half of long long paddr into top of reg
-			addic.	r0,r7,-1					; Get length - 1
-			rlwimi	r3,r4,0,0,31				; Combine bottom of long long to full 64-bits
+			addic.	r0,r5,-1					; Get length - 1
 			add		r11,r3,r0					; Point to last byte of sink
-			rlwinm	r4,r5,0,1,0					; Duplicate high half of long long paddr into top of reg
-            mtcrf	0x02,r8						; move pf64Bit to cr6 so we can test
-            rlwimi	r4,r6,0,0,31				; Combine bottom of long long to full 64-bits
-			mr		r5,r7						; Get the length into the right register
-			cmplw	cr1,r3,r4					; Does source == sink?	
-            bt++	pf64Bitb,bcopy_phys1		; if 64-bit processor, use standard routine (no BATs)
+			cmplw	cr1,r3,r4					; Does source == sink?			
 			add		r12,r4,r0					; Point to last byte of source
 			bltlr-								; Bail if length is 0 or way too big
 			xor		r7,r11,r3					; See if we went to next page
 			xor		r8,r12,r4					; See if we went to next page
 			or		r0,r7,r8					; Combine wrap
 			
-//			li		r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2)	; Set default attributes
-			li		r9,((2<<3)|2)				; Set default attributes
+			li		r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2)	; Set default attributes
 			rlwinm.	r0,r0,0,0,19				; Did we overflow a page?
 			li		r7,2						; Set validity flags
 			li		r8,2						; Set validity flags
-			bne-	bcopy_phys1					; Overflowed page, do normal physical copy...
+			bne-	EXT(bcopy_phys)				; Overflowed page, do normal physical copy...
 
-			crset	restorex					; Remember to trash BATs on the way out
+			crset	killbats					; Remember to trash BATs on the way out
 			rlwimi	r11,r9,0,15,31				; Set sink lower DBAT value
 			rlwimi	r12,r9,0,15,31				; Set source lower DBAT value
 			rlwimi	r7,r11,0,0,14				; Set sink upper DBAT value
@@ -136,123 +109,41 @@ LEXT(bcopy_physvir)
 
 bcpvsame:	mr		r6,r3						; Set source
 			crclr	noncache					; Set cached
-			crclr	fixxlate					; Set translation already ok
 			
-			b		copyit32					; Go copy it...
+			b		copyit						; Go copy it...
+
 
 ;	
 ; void bcopy_phys(from, to, nbytes)
 ; Turns off data translation before the copy.  Note, this one will
-; not work in user state.  This routine is used on 32 and 64-bit
-; machines.
-;
-; Note that the address parameters are long longs.  We will transform these to 64-bit
-; values.  Note that on 32-bit architectures that this will ignore the high half of the
-; passed in value.  This should be ok since we can not have any bigger than 32 bit addresses
-; there anyhow.
-;
-; Also note that you probably will not be happy if either the sink or source spans across the
-; boundary between RAM and I/O space.  Good chance of hanging the machine and this code 
-; will not check, so be careful.
+; not work in user state
 ;
 
 			.align	5
 			.globl	EXT(bcopy_phys)
 
 LEXT(bcopy_phys)
-			crclr	flipcache					; (HACK) No cache flip needed
-            rlwinm	r3,r3,0,1,0					; Duplicate high half of long long paddr into top of reg
-            mfsprg	r8,2						; get processor feature flags
-			rlwimi	r3,r4,0,0,31				; Combine bottom of long long to full 64-bits
-			rlwinm	r4,r5,0,1,0					; Duplicate high half of long long paddr into top of reg
-			mtcrf	0x02,r8						; move pf64Bit to cr6 so we can test
-			rlwimi	r4,r6,0,0,31				; Combine bottom of long long to full 64-bits
-			mr		r5,r7						; Get the length into the right register
-            
-bcopy_phys1:									; enter from bcopy_physvir with pf64Bit already in cr6
+
 			mfmsr	r9							; Get the MSR
+
 			crclr	noncache					; Set cached
-            bt++	pf64Bitb,bcopy_phys64		; skip if 64-bit (only they take hint)
-
-; 32-bit CPUs
-            
-            sub.	r0,r3,r4					; to==from?
-			rlwinm	r8,r9,0,MSR_DR_BIT,MSR_DR_BIT	; was translation on?
-            cmpwi	cr1,r8,0					; set cr1 beq if translation was off
-			oris	r8,r8,hi16(MASK(MSR_VEC))	; Get vector enable
+			rlwinm.	r8,r9,0,MSR_DR_BIT,MSR_DR_BIT	; Is data translation on?
+
+			cmplw	cr1,r4,r3					; Compare "to" and "from"
 			cmplwi	cr7,r5,0					; Check if we have a 0 length
-            beqlr-								; bail if to==from
-			ori		r8,r8,lo16(MASK(MSR_FP))	; Get FP
 			mr		r6,r3						; Set source
-			andc	r9,r9,r8					; Turn off translation if it is on (should be) and FP, VEC
+			beqlr-	cr1							; Bail if "to" and "from" are the same	
+			xor		r9,r9,r8					; Turn off translation if it is on (should be)
 			beqlr-	cr7							; Bail if length is 0
 			
-			crclr	restorex					; Make sure we do not trash BATs on the way out
+			rlwinm	r9,r9,0,MSR_FP_BIT+1,MSR_FP_BIT-1	; Force floating point off
+			crclr	killbats					; Make sure we do not trash BATs on the way out
+			rlwinm	r9,r9,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1	; Force vectors off
 			mtmsr	r9							; Set DR translation off
 			isync								; Wait for it
 			
-			crnot	fixxlate,cr1_eq				; Remember to turn on translation if it was
-			b		copyit32					; Go copy it...
-            
-; 64-bit: turn DR off and SF on, remember if we need to restore on way out.
-
-bcopy_phys64:									; r9 = MSR
-
-			srdi	r2,r3,31					; (HACK) Get a 1 if source is in I/O memory
-            srdi.	r0,r9,63-MSR_SF_BIT			; set cr0 beq on if SF was off when we were called
-            rlwinm	r8,r9,MSR_DR_BIT+1,31,31	; r8 <- DR bit right justified
-            cmpld	cr1,r3,r4					; to==from?
-            li		r0,1						; Note - we use this in a couple places below
-			lis		r6,hi16(MASK(MSR_VEC))		; Get vector enable
-            cmpwi	cr7,r5,0					; length==0 ?
-            ori		r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR))	; Add in FP and DR
-            beqlr--	cr1							; bail if to==from
-			srdi	r10,r4,31					; (HACK) Get a 1 if sink is in I/O memory
-            rldimi	r9,r0,63,MSR_SF_BIT			; set SF on
-            beqlr--	cr7							; bail if length==0
-            andc	r9,r9,r6					; turn DR, VEC, FP off
-            cmpwi	cr1,r8,0					; was DR on?
-            crmove	restorex,cr0_eq				; if SF was off, remember to turn back off before we return
-            mtmsrd	r9							; turn 64-bit addressing on, data translation off
-			cmpldi	cr0,r2,1					; (HACK) Is source in I/O memory?
-            isync								; wait for it to happen
-			mr		r6,r3						; Set source
-			cmpldi	cr7,r10,1					; (HACK) Is sink in I/O memory?
-            crnot	fixxlate,cr1_eq				; if DR was on, remember to turn back on before we return
-
-			cror	flipcache,cr0_eq,cr7_eq		; (HACK) See if either source or sink is in I/O area
-
-			rlwinm	r10,r9,MSR_EE_BIT+1,31,31	; (HACK GLORIOUS HACK) Isolate the EE bit
-			sldi	r11,r0,31-MSR_EE_BIT		; (HACK GLORIOUS HACK)) Get a mask for the EE bit
-			sldi	r0,r0,32+8					; (HACK) Get the right bit to turn off caching
-			bf++	flipcache,copyit64			; (HACK) No need to mess with caching...
-			
-;
-;			HACK GLORIOUS HACK - when we force of caching, we need to also force off
-;			interruptions.  We are out of CR bits, so we need to stash the entry EE
-;			somewheres.  It is in the XER....  We NEED to change this!!!!
-;
-
-			mtxer	r10							; (HACK GLORIOUS HACK) Remember EE
-			andc	r9,r9,r11					; (HACK GLORIOUS HACK) Turn off EE bit
-			mfspr	r2,hid4						; (HACK) Get HID4
-			crset	noncache					; (HACK) Set non-cached
-			mtmsrd	r9							; (HACK GLORIOUS HACK) Force off EE
-			or		r2,r2,r0					; (HACK) Set bit to make real accesses cache-inhibited
-			sync								; (HACK) Sync up
-			li		r0,1
-			mtspr	hid4,r2						; (HACK) Make real accesses cache-inhibited
-			isync								; (HACK) Toss prefetches
-
-			lis		r12,0xE000					; (HACK) Get the unlikeliest ESID possible
-			srdi	r12,r12,1					; (HACK) Make 0x7FFFFFFFF0000000
-			slbie	r12							; (HACK) Make sure the ERAT is cleared 
-			
-			sync								; (HACK)
-			isync								; (HACK)
-			
-            b		copyit64
-            
+			crnot	fixxlate,cr0_eq				; Remember to turn on translation if it was
+			b		copyit						; Go copy it...
 
 ;	
 ; void bcopy(from, to, nbytes)
@@ -265,19 +156,14 @@ LEXT(bcopy)
 
 			crclr	noncache					; Set cached
 
-bcpswap:	
-			crclr	flipcache					; (HACK) No cache flip needed
-            mfsprg	r8,2						; get processor feature flags
-            sub.	r0,r4,r3					; test for to==from in mode-independent way
-            mtcrf	0x02,r8						; move pf64Bit to cr6 so we can test
-			cmpwi	cr1,r5,0					; Check if we have a 0 length
-			crclr	restorex					; Make sure we do not trash BATs on the way out
+bcpswap:	cmplw	cr1,r4,r3					; Compare "to" and "from"
+			mr.		r5,r5						; Check if we have a 0 length
 			mr		r6,r3						; Set source
+			crclr	killbats					; Make sure we do not trash BATs on the way out
+			beqlr-	cr1							; Bail if "to" and "from" are the same	
+			beqlr-								; Bail if length is 0
 			crclr	fixxlate					; Set translation already ok
-			beqlr-								; Bail if "to" and "from" are the same	
-			beqlr-	cr1							; Bail if length is 0
-            bt++	pf64Bitb,copyit64			; handle 64-bit processor
-			b		copyit32					; Go copy it...
+			b		copyit						; Go copy it...
 
 ;
 ;			When we move the memory, forward overlays must be handled.  We
@@ -285,32 +171,19 @@ bcpswap:
 ;			We need to preserve R3 because it needs to be returned for memcpy.
 ;			We can be interrupted and lose control here.
 ;
-;			There is no stack, so in order to use vectors, we would
-;			need to take the vector exception. Any potential gains by using vectors 
+;			There is no stack, so in order to use floating point, we would
+;			need to take the FP exception. Any potential gains by using FP 
 ;			would be more than eaten up by this.
 ;
-;			NOTE: this code is called in three "modes":
-;				- on 32-bit processors (32-byte cache line)
-;				- on 64-bit processors running in 32-bit mode (128-byte cache line)
-;				- on 64-bit processors running in 64-bit mode (128-byte cache line)
-;
-;			ALSO NOTE: bcopy is called from copyin and copyout etc
-;			with the "thread_recover" ptr set.  This means bcopy must not set up a
-;			stack frame or touch non-volatile registers, and also means that it
-;			cannot rely on turning off interrupts, because we expect to get DSIs
-;			and have execution aborted by a "longjmp" to the thread_recover
-;			routine.
+;			Later, we should use Altivec for large moves.
 ;
 	
 			.align	5
 			.globl	EXT(memcpy)
-            ; NB: memcpy is only called in 32-bit mode, albeit on both 32- and 64-bit
-            ; processors...
+
 LEXT(memcpy)
-			crclr	flipcache					; (HACK) No cache flip needed
-            mfsprg	r8,2						; get processor feature flags
+
 			cmplw	cr1,r3,r4					; "to" and "from" the same?
-            mtcrf	0x02,r8						; move pf64Bit to cr6 so we can test
 			mr		r6,r4						; Set the "from"
 			mr.		r5,r5						; Length zero?
 			crclr	noncache					; Set cached
@@ -318,10 +191,9 @@ LEXT(memcpy)
 			crclr	fixxlate					; Set translation already ok
 			beqlr-	cr1							; "to" and "from" are the same
 			beqlr-								; Length is 0
-			crclr	restorex					; Make sure we do not trash BATs on the way out
-            bt++	pf64Bitb,copyit64			; handle 64-bit processors
+			crclr	killbats					; Make sure we do not trash BATs on the way out
 			
-copyit32:	sub		r12,r4,r6					; Get potential overlap (negative if backward move)
+copyit:		sub		r12,r4,r6					; Get potential overlap (negative if backward move)
 			lis		r8,0x7FFF					; Start up a mask
 			srawi	r11,r12,31					; Propagate the sign bit
 			dcbt	br0,r6						; Touch in the first source line
@@ -334,7 +206,7 @@ copyit32:	sub		r12,r4,r6					; Get potential overlap (negative if backward move)
 			cmplwi	cr7,r9,32					; See if at least a line between  source and sink
 			dcbtst	br0,r4						; Touch in the first sink line
 			cmplwi	cr1,r5,32					; Are we moving more than a line?
-			cror	noncache,noncache,cr7_lt	; Set to not DCBZ output line if not enough space
+			cror	noncache,noncache,28		; Set to not DCBZ output line if not enough space
 			blt-	fwdovrlap					; This is a forward overlapping area, handle it...
 
 ;
@@ -350,7 +222,6 @@ copyit32:	sub		r12,r4,r6					; Get potential overlap (negative if backward move)
 ;			We can not do this if noncache is set because we will take an 
 ;			alignment exception.
 
-G4word:											; enter from 64-bit case with word aligned uncached operands
 			neg		r0,r4						; Get the number of bytes to move to align to a line boundary
 			rlwinm.	r0,r0,0,27,31				; Clean it up and test it
 			and		r0,r0,r8					; limit to the maximum front end move
@@ -487,45 +358,17 @@ nohalf:		bf		31,bcpydone					; Leave cuz we are all done...
 			lbz		r7,0(r6)					; Get the byte
 			stb		r7,0(r4)					; Save the single
 
-bcpydone:	
+bcpydone:	bt-		killbats,bcclrbat			; Jump if we need to clear bats...
+			bflr	fixxlate					; Leave now if we do not need to fix translation...
 			mfmsr	r9							; Get the MSR
-			bf++	flipcache,bcpydone0			; (HACK) No need to mess with caching...
-
-			li		r0,1						; (HACK) Get a 1
-			mfxer	r10							; (HACK GLORIOUS HACK) Get the entry EE
-			sldi	r0,r0,32+8					; (HACK) Get the right bit to turn off caching
-			mfspr	r2,hid4						; (HACK) Get HID4
-			rlwinm	r10,r10,31-MSR_EE_BIT,MSR_EE_BIT,MSR_EE_BIT	; (HACK GLORIOUS HACK) Set the EE bit
-			andc	r2,r2,r0					; (HACK) Clear bit to make real accesses cache-inhibited
-			or		r9,r9,r10					; (HACK GLORIOUS HACK) Set the EE in MSR
-			sync								; (HACK) Sync up
-			mtspr	hid4,r2						; (HACK) Make real accesses not cache-inhibited
-			isync								; (HACK) Toss prefetches
-	
-			lis		r12,0xE000					; (HACK) Get the unlikeliest ESID possible
-			srdi	r12,r12,1					; (HACK) Make 0x7FFFFFFFF0000000
-			slbie	r12							; (HACK) Make sure the ERAT is cleared 
-
-			mtmsr	r9							; (HACK GLORIOUS HACK) Set EE properly
-
-bcpydone0:
-			lis		r0,hi16(MASK(MSR_VEC))		; Get the vector bit
-			ori		r0,r0,lo16(MASK(MSR_FP))	; Get the float bit
-			bf++	fixxlate,bcpydone1			; skip if we do not need to fix translation...
 			ori		r9,r9,lo16(MASK(MSR_DR))	; Turn data translation on
-			andc	r9,r9,r0					; Make sure that FP and VEC are off
+			rlwinm	r9,r9,0,MSR_FP_BIT+1,MSR_FP_BIT-1	; Force floating point off
+			rlwinm	r9,r9,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1	; Force vectors off
 			mtmsr	r9							; Just do it
 			isync								; Hang in there
-            
-bcpydone1:
-            bflr++	restorex					; done if we do not have to fix up addressing
-            mfsprg	r8,2						; get the feature flags again
-            mtcrf	0x02,r8						; put pf64Bit where we can test it
-            bt++	pf64Bitb,bcpydone2			; skip if 64-bit processor
-            
-            ; 32-bit processor, so clear out the BATs we set up for bcopy_physvir
-            
-            li		r0,0						; Get set to invalidate upper half
+			blr									; Leave cuz we are all done...			
+
+bcclrbat:	li		r0,0						; Get set to invalidate upper half
 			sync								; Make sure all is well
 			mtdbatu	0,r0						; Clear sink upper DBAT
 			mtdbatu	1,r0						; Clear source upper DBAT
@@ -533,16 +376,6 @@ bcpydone1:
 			isync			
 			blr
 
-            ; 64-bit processor, so turn off 64-bit mode we turned on to do bcopy_phys
-            
-bcpydone2:
-            mfmsr	r9							; get MSR again
-			andc	r9,r9,r0					; Make sure that FP and VEC are off
-            rldicl	r9,r9,0,MSR_SF_BIT+1		; clear SF
-            mtmsrd	r9
-            isync
-            blr
-
 
 ;
 ;			0123456789ABCDEF0123456789ABCDEF
@@ -563,8 +396,7 @@ bcpydone2:
 ;			and on in order.  That means that when we are at the second to last DW we
 ;			have to wait until the whole line is in cache before we can proceed.
 ;
-
-G4reverseWord:									; here from 64-bit code with word aligned uncached operands
+	
 fwdovrlap:	add		r4,r5,r4					; Point past the last sink byte
 			add		r6,r5,r6					; Point past the last source byte 
 			and		r0,r4,r8					; Apply movement limit
@@ -711,306 +543,3 @@ bnohalf:	bflr	31							; Leave cuz we are all done...
 			stb		r7,-1(r4)					; Save the single
 			
 			b		bcpydone					; Go exit cuz we are all done...
-
-
-// Here on 64-bit processors, which have a 128-byte cache line.  This can be
-// called either in 32 or 64-bit mode, which makes the test for reverse moves
-// a little tricky.  We've already filtered out the (sou==dest) and (len==0)
-// special cases.
-//
-// When entered:
-//		r4 = destination (32 or 64-bit ptr)
-//		r5 = length (always 32 bits)
-//		r6 = source (32 or 64-bit ptr)
-//		cr5 = noncache, fixxlate, flipcache, and restorex flags set
-
-        .align	5
-copyit64:
-        lis		r2,0x4000			// r2 = 0x00000000 40000000
-        neg		r12,r4				// start to compute #bytes to align dest
-		bt--	noncache,noncache1	// (HACK) Do not even try anything cached...
-        dcbt	0,r6				// touch in 1st block of source
-noncache1:     
-        add.	r2,r2,r2			// if 0x00000000 80000000 < 0, we are in 32-bit mode
-        cntlzw	r9,r5				// get highest power-of-2 in length
-        rlwinm	r7,r12,0,25,31		// r7 <- bytes to 128-byte align dest
-		bt--	noncache,noncache2	// (HACK) Do not even try anything cached...
-        dcbtst	0,r4				// touch in 1st destination cache block
-noncache2:
-        sraw	r2,r2,r9			// get mask with 1s for leading 0s in length, plus 1 more 1-bit
-        bge		copyit64a			// skip if we are running in 64-bit mode
-        rlwinm	r4,r4,0,0,31		// running in 32-bit mode, so truncate ptrs and lengths to 32 bits
-        rlwinm	r5,r5,0,0,31
-        rlwinm	r6,r6,0,0,31
-copyit64a:							// now we can use 64-bit compares even if running in 32-bit mode
-        sub		r8,r4,r6			// get (dest-source)
-        andc	r7,r7,r2			// limit bytes to align by operand length
-        cmpld	cr1,r8,r5			// if (dest-source)<length, must move reverse
-        bt--	noncache,c64uncached	// skip if uncached
-        blt--	cr1,c64rdouble		// handle cached reverse moves        
-        
-        
-// Forward, cached or doubleword aligned uncached.  This is the common case.
-//   r4-r6 = dest, length, source (as above)
-//		r7 = #bytes 128-byte align dest (limited by copy length)
-//     cr5 = flags, as above
-
-c64double:
-        andi.	r8,r7,7				// r8 <- #bytes to doubleword align
-        srwi	r9,r7,3				// r9 <- #doublewords to 128-byte align
-        sub		r5,r5,r7			// adjust length remaining
-        cmpwi	cr1,r9,0			// any doublewords to move to cache align?
-        srwi	r10,r5,7			// r10 <- 128-byte chunks to xfer after aligning dest
-        cmpwi	cr7,r10,0			// set cr7 on chunk count
-        beq		c64double2			// dest already doubleword aligned
-        mtctr	r8
-        b		c64double1
-        
-        .align	5					// align inner loops
-c64double1:							// copy bytes until dest is doubleword aligned
-        lbz		r0,0(r6)
-        addi	r6,r6,1
-        stb		r0,0(r4)
-        addi	r4,r4,1
-        bdnz	c64double1
-
-c64double2:							// r9/cr1=doublewords, r10=128-byte chunks, cr7=blt if r5==0
-        beq		cr1,c64double4		// no doublewords to xfer in order to cache align
-        mtctr	r9
-        b		c64double3
-
-        .align	5					// align inner loops
-c64double3:							// copy doublewords until dest is 128-byte aligned
-        ld		r7,0(r6)
-        addi	r6,r6,8
-        std		r7,0(r4)
-        addi	r4,r4,8
-        bdnz	c64double3
-        
-// Here to xfer 128-byte chunks, if any.  Because the IBM 970 cannot issue two stores/cycle,
-// we pipeline the inner loop so we can pair loads and stores.  Since we only have 8 GPRs for
-// data (64 bytes), we load/store each twice per 128-byte chunk.
-
-c64double4:							// r10/cr7=128-byte chunks
-        rlwinm	r0,r5,29,28,31		// r0 <- count of leftover doublewords, after moving chunks
-        cmpwi	cr1,r0,0			// set cr1 on leftover doublewords
-        beq		cr7,c64double7		// no 128-byte chunks
-        sub		r8,r6,r4			// r8 <- (source - dest)
-        li		r9,128				// start at next cache line (we've already touched in 1st line)
-        cmpldi	cr7,r8,128			// if (source-dest)<128, cannot use dcbz128 beacause of overlap
-        cror	noncache,cr7_lt,noncache	// turn on "noncache" flag if (source-dest)<128
-		bt--	noncache,noncache3	// (HACK) Skip cache touch if noncachable
-        dcbt128	r9,r6,1				// start forward stream
-noncache3:
-        mtctr	r10
-        
-        ld		r0,0(r6)			// start pipe: load 1st half-line
-        ld		r2,8(r6)
-        ld		r7,16(r6)
-        ld		r8,24(r6)
-        ld		r9,32(r6)
-        ld		r10,40(r6)
-        ld		r11,48(r6)
-        ld		r12,56(r6)
-		b		c64InnerLoopEntryPt
-        
-        .align	5					// align inner loop
-c64InnerLoop:						// loop copying 128-byte cache lines to 128-aligned destination
-        std		r0,64(r4)			// store 2nd half of chunk n
-        ld		r0,0(r6)			// load 1st half of chunk n+1
-        std		r2,72(r4)
-        ld		r2,8(r6)
-        std		r7,80(r4)
-        ld		r7,16(r6)
-        std		r8,88(r4)
-        ld		r8,24(r6)
-        std		r9,96(r4)
-        ld		r9,32(r6)
-        std		r10,104(r4)
-        ld		r10,40(r6)
-        std		r11,112(r4)
-        ld		r11,48(r6)
-        std		r12,120(r4)
-        ld		r12,56(r6)
-        addi	r4,r4,128			// advance to next dest chunk
-c64InnerLoopEntryPt:				// initial entry into loop, with 1st halfline loaded        
-        bt		noncache,c64InnerLoop1	// skip if uncached or overlap
-        dcbz128	0,r4				// avoid prefetch of next cache line
-c64InnerLoop1:
-        std		r0,0(r4)			// store 1st half of chunk n
-        ld		r0,64(r6)			// load 2nd half of chunk n
-        std		r2,8(r4)
-        ld		r2,72(r6)
-        std		r7,16(r4)
-        ld		r7,80(r6)
-        std		r8,24(r4)
-        ld		r8,88(r6)
-        std		r9,32(r4)
-        ld		r9,96(r6)
-        std		r10,40(r4)
-        ld		r10,104(r6)
-        std		r11,48(r4)
-        ld		r11,112(r6)
-        std		r12,56(r4)
-        ld		r12,120(r6)
-        addi	r6,r6,128			// advance to next source chunk if any
-        bdnz	c64InnerLoop		// loop if more chunks
-        
-        std		r0,64(r4)			// store 2nd half of last chunk
-        std		r2,72(r4)
-        std		r7,80(r4)
-        std		r8,88(r4)
-        std		r9,96(r4)
-        std		r10,104(r4)
-        std		r11,112(r4)
-        std		r12,120(r4)
-        addi	r4,r4,128			// advance to next dest chunk
-
-c64double7:         	            // r5 <- leftover bytes, cr1 set on doubleword count
-        rlwinm	r0,r5,29,28,31		// r0 <- count of leftover doublewords (0-15)
-        andi.	r5,r5,7				// r5/cr0 <- count of leftover bytes (0-7)
-        beq		cr1,c64byte			// no leftover doublewords
-        mtctr	r0
-        b		c64double8
-        
-        .align	5					// align inner loop
-c64double8:							// loop copying leftover doublewords
-        ld		r0,0(r6)
-        addi	r6,r6,8
-        std		r0,0(r4)
-        addi	r4,r4,8
-        bdnz	c64double8
-
-
-// Forward byte loop.
-
-c64byte:							// r5/cr0 <- byte count (can be big if unaligned uncached)
-		beq		bcpydone			// done if no leftover bytes
-        mtctr	r5
-        b		c64byte1
-        
-        .align	5					// align inner loop
-c64byte1:
-        lbz		r0,0(r6)
-        addi	r6,r6,1
-        stb		r0,0(r4)
-        addi	r4,r4,1
-        bdnz	c64byte1
-
-        b		bcpydone
-
-
-// Uncached copies.  We must avoid unaligned accesses, since they always take alignment
-// exceptions on uncached memory on 64-bit processors.  This may mean we copy long operands
-// a byte at a time, but that is still much faster than alignment exceptions.
-//   r4-r6 = dest, length, source (as above)
-//		r2 = mask of 1s for leading 0s in length, plus 1 extra 1
-//		r7 = #bytes to copy to 128-byte align dest (limited by operand length)
-//	   cr1 = blt if reverse move required
-
-c64uncached:
-        xor		r0,r6,r4			// get relative alignment
-        rlwinm	r10,r0,0,29,31		// relatively doubleword aligned?
-        rlwinm	r11,r0,0,30,31		// relatively word aligned?
-        not		r8,r2				// get mask to limit initial length of copy for G4word
-        blt		cr1,c64reverseUncached
-        
-        cmpwi	cr0,r10,0			// set cr0 beq if doubleword aligned
-        cmpwi	cr1,r11,0			// set cr1 beq if word aligned
-        beq		cr0,c64double		// doubleword aligned
-        beq		cr1,G4word			// word aligned, use G3/G4 code
-        cmpwi	r5,0				// set cr0 on byte count
-        b		c64byte				// unaligned operands
-
-c64reverseUncached:
-        cmpwi	cr0,r10,0			// set cr0 beq if doubleword aligned
-        cmpwi	cr1,r11,0			// set cr1 beq if word aligned
-        beq		cr0,c64rdouble		// doubleword aligned so can use LD/STD
-        beq		cr1,G4reverseWord	// word aligned, use G3/G4 code
-        add		r6,r6,r5			// point to (end+1) of source and dest
-        add		r4,r4,r5
-        cmpwi	r5,0				// set cr0 on length
-        b		c64rbyte			// copy a byte at a time
-        
-        
-
-// Reverse doubleword copies.  This is used for all cached copies, and doubleword
-// aligned uncached copies.
-//		r4 = destination (32 or 64-bit ptr)
-//		r5 = length (always 32 bits)
-//		r6 = source (32 or 64-bit ptr)
-//		cr5 = noncache, fixxlate, and restorex flags set
-
-c64rdouble:
-        add		r6,r6,r5			// point to (end+1) of source and dest
-        add		r4,r4,r5
-        rlwinm.	r7,r4,0,29,31		// r7 <- #bytes to doubleword align dest
-        cmplw	cr1,r7,r5			// operand long enough to doubleword align?
-        blt		cr1,c64rd0			// yes
-        mr		r7,r5				// no
-c64rd0:
-        sub		r5,r5,r7			// adjust length
-        srwi	r8,r5,6				// r8 <- 64-byte chunks to xfer
-        cmpwi	cr1,r8,0			// any chunks?
-        beq		c64rd2				// source already doubleword aligned
-        mtctr	r7
-
-c64rd1:								// copy bytes until source doublword aligned
-        lbzu	r0,-1(r6)
-        stbu	r0,-1(r4)
-        bdnz	c64rd1
-        
-c64rd2:								// r8/cr1 <- count of 64-byte chunks
-        rlwinm	r0,r5,29,29,31		// r0 <- count of leftover doublewords
-        andi.	r5,r5,7				// r5/cr0 <- count of leftover bytes
-        cmpwi	cr7,r0,0			// leftover doublewords?
-        beq		cr1,c64rd4			// no chunks to xfer
-        li		r9,-128				// start at next cache line
-        mtctr	r8
-        bt		noncache,c64rd3		// (HACK) Do not start a stream if noncachable...
-        dcbt128	r9,r6,3				// start reverse stream
-        b		c64rd3
-        
-        .align	5					// align inner loop
-c64rd3:								// loop copying 64-byte chunks
-        ld		r7,-8(r6)
-        ld		r8,-16(r6)
-        ld		r9,-24(r6)
-        ld		r10,-32(r6)
-        ld		r11,-40(r6)
-        ld		r12,-48(r6)
-        std		r7,-8(r4)
-        std		r8,-16(r4)
-        ld		r7,-56(r6)
-        ldu		r8,-64(r6)
-        std		r9,-24(r4)
-        std		r10,-32(r4)
-        std		r11,-40(r4)
-        std		r12,-48(r4)
-        std		r7,-56(r4)
-        stdu	r8,-64(r4)
-        bdnz	c64rd3
-
-c64rd4:								// r0/cr7 = leftover doublewords  r5/cr0 = leftover bytes
-        beq		cr7,c64rbyte		// no leftover doublewords
-        mtctr	r0
-        
-c64rd5:								// loop copying leftover doublewords
-        ldu		r0,-8(r6)
-        stdu	r0,-8(r4)
-        bdnz	c64rd5
-
-
-// Reverse byte loop.
-
-c64rbyte:							// r5/cr0 <- byte count (can be big if unaligned uncached)
-        beq		bcpydone			// done if no leftover bytes
-        mtctr	r5
-        
-c64rbyte1:
-        lbzu	r0,-1(r6)
-        stbu	r0,-1(r4)
-        bdnz	c64rbyte1
-
-        b		bcpydone
-