/*
- * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved.
*
- * @APPLE_LICENSE_HEADER_START@
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
- * The contents of this file constitute Original Code as defined in and
- * are subject to the Apple Public Source License Version 1.1 (the
- * "License"). You may not use this file except in compliance with the
- * License. Please obtain a copy of the License at
- * http://www.apple.com/publicsource and read it before using this file.
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
*
- * This Original Code and all software distributed under the License are
- * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
- * License for the specific language governing rights and limitations
- * under the License.
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
*
- * @APPLE_LICENSE_HEADER_END@
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
;
-; Copy bytes of data around. handles overlapped data.
-;
-; Change this to use Altivec later on, and maybe floating point.
+; Copy bytes of data around. Handles overlapped data.
;
;
#include <ppc/asm.h>
#include <ppc/proc_reg.h>
#include <assym.s>
-; Use CR5_lt to indicate non-cached
+; These routines use CR5 for certain flags:
+; Use CR5_lt to indicate non-cached (in bcopy and memcpy)
#define noncache 20
-; Use CR5_gt to indicate that we need to turn data translation back on
-#define fixxlate 21
-
-; Use CR5_eq to indicate that we need to invalidate bats (if 32-bit) or turn off
-; 64-bit mode (if 64-bit) before returning to our caller. We overload the
-; bit to reduce the number of conditional branches at bcopy exit.
-#define restorex 22
-; Use CR5_so to indicate that we need to restore real-mode cachability
-; Only needed on 64-bit machines
-#define flipcache 23
+; The bcopy_phys variants use a stack frame so they can call bcopy as a subroutine.
+#define BCOPY_SF_SIZE 32 // total size
+#define BCOPY_SF_MSR 16 // we save caller's MSR here (possibly minus VEC and FP)
-;
-; bcopy_nc(from, to, nbytes)
-;
-; bcopy_nc operates on non-cached memory so we can not use any kind
-; of cache instructions.
-;
- .align 5
- .globl EXT(bcopy_nc)
+#define kShort 32 // short operands are special cased
-LEXT(bcopy_nc)
-
- crset noncache ; Set non-cached
- b bcpswap
-;
-; void bcopy_physvir(from, to, nbytes)
+; void bcopy_physvir_32(from, to, nbytes)
+;
; Attempt to copy physically addressed memory with translation on if conditions are met.
; Otherwise do a normal bcopy_phys. This routine is used because some 32-bit processors
; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
; for the passed phys addrs and do the copy with translation on.
;
-; Rules are: neither source nor destination can cross a page.
+; Rules are: - neither source nor destination can cross a page.
+; - Interrupts must be disabled when this routine is called.
+; - Translation must be on when called.
;
-; Interrupts must be disabled throughout the copy when this is called.
-; To do this, we build a
-; 128 DBAT for both the source and sink. If both are the same, only one is
-; loaded. We do not touch the IBATs, so there is no issue if either physical page
+; To do the copy, we build a 128 DBAT for both the source and sink. If both are the same, only one
+; is loaded. We do not touch the IBATs, so there is no issue if either physical page
; address is the same as the virtual address of the instructions we are executing.
;
; At the end, we invalidate the used DBATs.
; passed in value. This should be ok since we can not have any bigger than 32 bit addresses
; there anyhow.
;
-; Note, this one will not work in user state
-;
+; Note also that this routine is used only on 32-bit machines. If you're contemplating use
+; on a 64-bit processor, use the physical memory window instead; please refer to copypv()
+; for an example of how this is done.
.align 5
- .globl EXT(bcopy_physvir)
-
-LEXT(bcopy_physvir)
+ .globl EXT(bcopy_physvir_32)
- crclr flipcache ; (HACK) No cache flip needed
- mfsprg r8,2 ; get processor feature flags
+LEXT(bcopy_physvir_32)
+ mflr r0 ; get return address
rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
- addic. r0,r7,-1 ; Get length - 1
+ mfsprg r8,2 ; get processor feature flags
+ stw r0,8(r1) ; save return address
rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
- add r11,r3,r0 ; Point to last byte of sink
- rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
+ stwu r1,-BCOPY_SF_SIZE(r1) ; push on a stack frame so we can call bcopy
mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
- rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
+ subi r0,r7,1 ; get length - 1
+ rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
+ add r11,r3,r0 ; Point to last byte of sink
mr r5,r7 ; Get the length into the right register
- cmplw cr1,r3,r4 ; Does source == sink?
- bt++ pf64Bitb,bcopy_phys1 ; if 64-bit processor, use standard routine (no BATs)
+ rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
+
+; This test for page overflow may not work if the length is negative. Negative lengths are invalid input
+; to bcopy_physvir() on 32-bit machines, and will result in a panic.
+
add r12,r4,r0 ; Point to last byte of source
- bltlr- ; Bail if length is 0 or way too big
xor r7,r11,r3 ; See if we went to next page
xor r8,r12,r4 ; See if we went to next page
or r0,r7,r8 ; Combine wrap
li r8,2 ; Set validity flags
bne- bcopy_phys1 ; Overflowed page, do normal physical copy...
- crset restorex ; Remember to trash BATs on the way out
rlwimi r11,r9,0,15,31 ; Set sink lower DBAT value
rlwimi r12,r9,0,15,31 ; Set source lower DBAT value
rlwimi r7,r11,0,0,14 ; Set sink upper DBAT value
mtdbatl 1,r12 ; Set source lower DBAT
mtdbatu 1,r8 ; Set source upper DBAT
-
+
bcpvsame:
- sync ; wait for BAT to stabilize
+ sync ; wait for the BATs to stabilize
isync
- mr r6,r3 ; Set source
- crclr noncache ; Set cached
- crclr fixxlate ; Set translation already ok
-
- b copyit32 ; Go copy it...
+
+ bl EXT(bcopy) ; BATs set up, args in r3-r5, so do the copy with DR on
+
+ li r0,0 ; Get set to invalidate upper half of BATs
+ sync ; Make sure all is well
+ mtdbatu 0,r0 ; Clear sink upper DBAT
+ mtdbatu 1,r0 ; Clear source upper DBAT
+ sync
+ isync
+
+ lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address
+ addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame
+ mtlr r0
+ blr
+
-;
; void bcopy_phys(from, to, nbytes)
-; Turns off data translation before the copy. Note, this one will
-; not work in user state. This routine is used on 32 and 64-bit
-; machines.
+;
+; Turns off data translation before the copy. This one will not work in user state.
+; This routine is used on 32 and 64-bit machines.
;
; Note that the address parameters are long longs. We will transform these to 64-bit
; values. Note that on 32-bit architectures that this will ignore the high half of the
; boundary between RAM and I/O space. Good chance of hanging the machine and this code
; will not check, so be careful.
;
+; NOTE: when called, translation must be on, and we must be in 32-bit mode.
+; Interrupts may or may not be disabled.
.align 5
.globl EXT(bcopy_phys)
LEXT(bcopy_phys)
- crclr flipcache ; (HACK) No cache flip needed
+ mflr r0 ; get return address
rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
+ stw r0,8(r1) ; save
mfsprg r8,2 ; get processor feature flags
+ stwu r1,-BCOPY_SF_SIZE(r1) ; push on a stack frame so we can call bcopy
rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
mr r5,r7 ; Get the length into the right register
-
-bcopy_phys1: ; enter from bcopy_physvir with pf64Bit already in cr6
+
+bcopy_phys1: ; enter from bcopy_physvir with pf64Bit in cr6 and parms in r3-r5
mfmsr r9 ; Get the MSR
- crclr noncache ; Set cached
+ lis r6,hi16(MASK(MSR_VEC)) ; Get vector enable
+ ori r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR)) ; Add in FP and DR
+ andc r9,r9,r6 ; unconditionally turn DR, VEC, and FP off
bt++ pf64Bitb,bcopy_phys64 ; skip if 64-bit (only they take hint)
; 32-bit CPUs
-
- sub. r0,r3,r4 ; to==from?
- rlwinm r8,r9,0,MSR_DR_BIT,MSR_DR_BIT ; was translation on?
- cmpwi cr1,r8,0 ; set cr1 beq if translation was off
- oris r8,r8,hi16(MASK(MSR_VEC)) ; Get vector enable
- cmplwi cr7,r5,0 ; Check if we have a 0 length
- beqlr- ; bail if to==from
- ori r8,r8,lo16(MASK(MSR_FP)) ; Get FP
- mr r6,r3 ; Set source
- andc r9,r9,r8 ; Turn off translation if it is on (should be) and FP, VEC
- beqlr- cr7 ; Bail if length is 0
-
- crclr restorex ; Make sure we do not trash BATs on the way out
- mtmsr r9 ; Set DR translation off
+
+ mtmsr r9 ; turn DR, FP, and VEC off
isync ; Wait for it
- crnot fixxlate,cr1_eq ; Remember to turn on translation if it was
- b copyit32 ; Go copy it...
+ bl EXT(bcopy) ; do the copy with translation off and caching on
-; 64-bit: turn DR off and SF on, remember if we need to restore on way out.
+ mfmsr r9 ; Get the MSR
+ ori r9,r9,lo16(MASK(MSR_DR)) ; turn translation back on (but leave VEC and FP off)
+ mtmsr r9 ; restore msr
+ isync ; wait for it to happen
+ lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address once translation is back on
+ mtlr r0
+ addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame
+ blr
-bcopy_phys64: ; r9 = MSR
+
+; 64-bit: turn DR off and SF on.
- srdi r2,r3,31 ; (HACK) Get a 1 if source is in I/O memory
- srdi. r0,r9,63-MSR_SF_BIT ; set cr0 beq on if SF was off when we were called
- rlwinm r8,r9,MSR_DR_BIT+1,31,31 ; r8 <- DR bit right justified
- cmpld cr1,r3,r4 ; to==from?
+bcopy_phys64: ; r9 = MSR with DP, VEC, and FP off
+ ori r8,r9,lo16(MASK(MSR_DR)) ; make a copy with DR back on... this is what we return to caller
+ srdi r2,r3,31 ; Get a 1 if source is in I/O memory
li r0,1 ; Note - we use this in a couple places below
- lis r6,hi16(MASK(MSR_VEC)) ; Get vector enable
- cmpwi cr7,r5,0 ; length==0 ?
- ori r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR)) ; Add in FP and DR
- beqlr-- cr1 ; bail if to==from
- srdi r10,r4,31 ; (HACK) Get a 1 if sink is in I/O memory
- rldimi r9,r0,63,MSR_SF_BIT ; set SF on
- beqlr-- cr7 ; bail if length==0
- andc r9,r9,r6 ; turn DR, VEC, FP off
- cmpwi cr1,r8,0 ; was DR on?
- crmove restorex,cr0_eq ; if SF was off, remember to turn back off before we return
+ srdi r10,r4,31 ; Get a 1 if sink is in I/O memory
+ std r8,BCOPY_SF_MSR(r1) ; save caller's MSR so we remember whether EE was on
+ rldimi r9,r0,63,MSR_SF_BIT ; set SF on in MSR we will copy with
+ cmpldi cr0,r2,1 ; Is source in I/O memory?
+ cmpldi cr7,r10,1 ; Is sink in I/O memory?
mtmsrd r9 ; turn 64-bit addressing on, data translation off
- cmpldi cr0,r2,1 ; (HACK) Is source in I/O memory?
isync ; wait for it to happen
- mr r6,r3 ; Set source
- cmpldi cr7,r10,1 ; (HACK) Is sink in I/O memory?
- crnot fixxlate,cr1_eq ; if DR was on, remember to turn back on before we return
-
- cror flipcache,cr0_eq,cr7_eq ; (HACK) See if either source or sink is in I/O area
-
- rlwinm r10,r9,MSR_EE_BIT+1,31,31 ; (HACK GLORIOUS HACK) Isolate the EE bit
- sldi r11,r0,31-MSR_EE_BIT ; (HACK GLORIOUS HACK)) Get a mask for the EE bit
- sldi r0,r0,32+8 ; (HACK) Get the right bit to turn off caching
- bf++ flipcache,copyit64 ; (HACK) No need to mess with caching...
-
-;
-; HACK GLORIOUS HACK - when we force of caching, we need to also force off
-; interruptions. We are out of CR bits, so we need to stash the entry EE
-; somewheres. It is in the XER.... We NEED to change this!!!!
-;
+ cror cr7_eq,cr0_eq,cr7_eq ; See if either source or sink is in I/O area
+ beq-- cr7,io_space_real_mode_copy ; an operand is in I/O space
+
+ bl EXT(bcopy) ; do copy with DR off and SF on, cache enabled
+
+bcopy_phys64x:
+ mfmsr r9 ; Get the MSR we used to copy
+ rldicl r9,r9,0,MSR_SF_BIT+1 ; clear SF
+ ori r9,r9,lo16(MASK(MSR_DR)) ; turn translation back on
+ mtmsrd r9 ; turn 64-bit mode off, translation back on
+ isync ; wait for it to happen
+ lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address once translation is back on
+ ld r8,BCOPY_SF_MSR(r1) ; get caller's MSR once translation is back on
+ mtlr r0
+ mtmsrd r8,1 ; turn EE back on if necessary
+ addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame
+ blr
- mtxer r10 ; (HACK GLORIOUS HACK) Remember EE
- andc r9,r9,r11 ; (HACK GLORIOUS HACK) Turn off EE bit
- mfspr r2,hid4 ; (HACK) Get HID4
- crset noncache ; (HACK) Set non-cached
- mtmsrd r9 ; (HACK GLORIOUS HACK) Force off EE
- or r2,r2,r0 ; (HACK) Set bit to make real accesses cache-inhibited
- sync ; (HACK) Sync up
- li r0,1
- mtspr hid4,r2 ; (HACK) Make real accesses cache-inhibited
- isync ; (HACK) Toss prefetches
-
- lis r12,0xE000 ; (HACK) Get the unlikeliest ESID possible
- srdi r12,r12,1 ; (HACK) Make 0x7FFFFFFFF0000000
- slbie r12 ; (HACK) Make sure the ERAT is cleared
+; We need to copy with DR off, but one of the operands is in I/O space. To avoid wedging U3,
+; which cannot handle a cache burst in I/O space, we must turn caching off for the real memory access.
+; This can only be done by setting bits in HID4. We cannot lose control and execute random code in
+; this state, so we have to disable interrupts as well. This is an unpleasant hack.
+
+io_space_real_mode_copy: ; r0=1, r9=MSR we want to copy with
+ sldi r11,r0,31-MSR_EE_BIT ; Get a mask for the EE bit
+ sldi r0,r0,32+8 ; Get the right bit to turn off caching
+ andc r9,r9,r11 ; Turn off EE bit
+ mfspr r2,hid4 ; Get HID4
+ mtmsrd r9,1 ; Force off EE
+ or r2,r2,r0 ; Set bit to make real accesses cache-inhibited
+ sync ; Sync up
+ mtspr hid4,r2 ; Make real accesses cache-inhibited
+ isync ; Toss prefetches
+
+ lis r12,0xE000 ; Get the unlikeliest ESID possible
+ srdi r12,r12,1 ; Make 0x7FFFFFFFF0000000
+ slbie r12 ; Make sure the ERAT is cleared
- sync ; (HACK)
- isync ; (HACK)
+ sync
+ isync
- b copyit64
+ bl EXT(bcopy_nc) ; copy with SF on and EE, DR, VEC, and FP off, cache inhibited
+ li r0,1 ; Get a 1
+ sldi r0,r0,32+8 ; Get the right bit to turn off caching
+ mfspr r2,hid4 ; Get HID4
+ andc r2,r2,r0 ; Clear bit to make real accesses cache-inhibited
+ sync ; Sync up
+ mtspr hid4,r2 ; Make real accesses not cache-inhibited
+ isync ; Toss prefetches
+
+ lis r12,0xE000 ; Get the unlikeliest ESID possible
+ srdi r12,r12,1 ; Make 0x7FFFFFFFF0000000
+ slbie r12 ; Make sure the ERAT is cleared
+ b bcopy_phys64x
+
+;
+; shortcopy
+;
+; Special case short operands (<32 bytes), which are very common. Note that the check for
+; reverse vs normal moves isn't quite correct in 64-bit mode; in rare cases we will move in
+; reverse when it wasn't necessary to do so. This is OK, since performance of the two cases
+; is similar. We do get the direction right when it counts (ie, when the operands overlap.)
+; Also note that we use the G3/G4 "backend" code, even on G5. This is OK too, since G5 has
+; plenty of load/store dispatch bandwidth in this case, the extra ops are hidden by latency,
+; and using word instead of doubleword moves reduces the possibility of unaligned accesses,
+; which cost about 20 cycles if they cross a 32-byte boundary on G5. Finally, because we
+; might do unaligned accesses this code cannot be called from bcopy_nc().
+; r4 = destination
+; r5 = length (<32)
+; r6 = source
+; r12 = (dest - source)
+
+ .align 5
+shortcopy:
+ cmplw r12,r5 ; must move reverse if (dest-source)<length
+ mtcrf 2,r5 ; move length to cr6 and cr7 one at a time...
+ mtcrf 1,r5 ; ...which is faster on G4 and G5
+ bge++ backend ; handle forward moves (most common case)
+ add r6,r6,r5 ; point one past end of operands in reverse moves
+ add r4,r4,r5
+ b bbackend ; handle reverse moves
+
;
; void bcopy(from, to, nbytes)
;
+; NOTE: bcopy is called from copyin and copyout etc with the "thread_recover" ptr set.
+; This means bcopy must not set up a stack frame or touch non-volatile registers, and also means that it
+; cannot rely on turning off interrupts, because we expect to get DSIs and have execution aborted by a "longjmp"
+; to the thread_recover routine. What this means is that it would be hard to use vector or floating point
+; registers to accelerate the copy.
+;
+; NOTE: this code can be called in any of three "modes":
+; - on 32-bit processors (32-byte cache line)
+; - on 64-bit processors running in 32-bit mode (128-byte cache line)
+; - on 64-bit processors running in 64-bit mode (128-byte cache line)
.align 5
.globl EXT(bcopy)
+ .globl EXT(bcopy_nop_if_32bit)
LEXT(bcopy)
-
+ cmplwi cr1,r5,kShort ; less than 32 bytes?
+ sub. r12,r4,r3 ; test for to==from in mode-independent way, start fwd/rev check
+ mr r6,r3 ; Set source (must preserve r3 for memcopy return)
+ blt cr1,shortcopy ; special case short operands
crclr noncache ; Set cached
+LEXT(bcopy_nop_if_32bit)
+ bne++ copyit64 ; handle 64-bit processor (patched to NOP if 32-bit processor)
+ bne+ copyit32 ; handle 32-bit processor
+ blr ; to==from so nothing to do
+
+;
+; bcopy_nc(from, to, nbytes)
+;
+; bcopy_nc() operates on non-cached memory so we can not use any kind of cache instructions.
+; Furthermore, we must avoid all unaligned accesses on 64-bit machines, since they take
+; alignment exceptions. Thus we cannot use "shortcopy", which could do unaligned lwz/stw.
+; Like bcopy(), bcopy_nc() can be called both in 32- and 64-bit mode.
-bcpswap:
- crclr flipcache ; (HACK) No cache flip needed
- mfsprg r8,2 ; get processor feature flags
- sub. r0,r4,r3 ; test for to==from in mode-independent way
- mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
+ .align 5
+ .globl EXT(bcopy_nc)
+ .globl EXT(bcopy_nc_nop_if_32bit)
+
+LEXT(bcopy_nc)
cmpwi cr1,r5,0 ; Check if we have a 0 length
- crclr restorex ; Make sure we do not trash BATs on the way out
- mr r6,r3 ; Set source
- crclr fixxlate ; Set translation already ok
- beqlr- ; Bail if "to" and "from" are the same
- beqlr- cr1 ; Bail if length is 0
- bt++ pf64Bitb,copyit64 ; handle 64-bit processor
- b copyit32 ; Go copy it...
+ sub. r12,r4,r3 ; test for to==from in mode-independent way, start fwd/rev check
+ mr r6,r3 ; Set source (must preserve r3 for memcopy return)
+ crset noncache ; Set non-cached
+ cror cr0_eq,cr1_eq,cr0_eq ; set cr0 beq if either length zero or to==from
+LEXT(bcopy_nc_nop_if_32bit)
+ bne++ copyit64 ; handle 64-bit processor (patched to NOP if 32-bit processor)
+ bne+ copyit32 ; handle 32-bit processor
+ blr ; either zero length or to==from
;
-; When we move the memory, forward overlays must be handled. We
-; also can not use the cache instructions if we are from bcopy_nc.
-; We need to preserve R3 because it needs to be returned for memcpy.
-; We can be interrupted and lose control here.
-;
-; There is no stack, so in order to use vectors, we would
-; need to take the vector exception. Any potential gains by using vectors
-; would be more than eaten up by this.
+; void* memcpy(to, from, nbytes)
+; void* memmove(to, from, nbytes)
;
-; NOTE: this code is called in three "modes":
-; - on 32-bit processors (32-byte cache line)
-; - on 64-bit processors running in 32-bit mode (128-byte cache line)
-; - on 64-bit processors running in 64-bit mode (128-byte cache line)
-;
-; ALSO NOTE: bcopy is called from copyin and copyout etc
-; with the "thread_recover" ptr set. This means bcopy must not set up a
-; stack frame or touch non-volatile registers, and also means that it
-; cannot rely on turning off interrupts, because we expect to get DSIs
-; and have execution aborted by a "longjmp" to the thread_recover
-; routine.
-;
-
+; memcpy() and memmove() are only called in 32-bit mode, albeit on both 32- and 64-bit processors.
+; However, they would work correctly if called in 64-bit mode.
+
.align 5
.globl EXT(memcpy)
- ; NB: memcpy is only called in 32-bit mode, albeit on both 32- and 64-bit
- ; processors...
+ .globl EXT(memmove)
+ .globl EXT(memcpy_nop_if_32bit)
+
LEXT(memcpy)
- crclr flipcache ; (HACK) No cache flip needed
- mfsprg r8,2 ; get processor feature flags
- cmplw cr1,r3,r4 ; "to" and "from" the same?
- mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
- mr r6,r4 ; Set the "from"
- mr. r5,r5 ; Length zero?
+LEXT(memmove)
+ cmplwi cr1,r5,kShort ; less than 32 bytes?
+ sub. r12,r3,r4 ; test for to==from in mode-independent way, start fwd/rev check
+ mr r6,r4 ; Set source
+ mr r4,r3 ; Set the "to" (must preserve r3 for return value)
+ blt cr1,shortcopy ; special case short operands
crclr noncache ; Set cached
- mr r4,r3 ; Set the "to"
- crclr fixxlate ; Set translation already ok
- beqlr- cr1 ; "to" and "from" are the same
- beqlr- ; Length is 0
- crclr restorex ; Make sure we do not trash BATs on the way out
- bt++ pf64Bitb,copyit64 ; handle 64-bit processors
-
-copyit32: sub r12,r4,r6 ; Get potential overlap (negative if backward move)
- lis r8,0x7FFF ; Start up a mask
- srawi r11,r12,31 ; Propagate the sign bit
- dcbt br0,r6 ; Touch in the first source line
- cntlzw r7,r5 ; Get the highest power of 2 factor of the length
- ori r8,r8,0xFFFF ; Make limit 0x7FFFFFFF
- xor r9,r12,r11 ; If sink - source was negative, invert bits
- srw r8,r8,r7 ; Get move length limitation
- sub r9,r9,r11 ; If sink - source was negative, add 1 and get absolute value
- cmplw r12,r5 ; See if we actually forward overlap
- cmplwi cr7,r9,32 ; See if at least a line between source and sink
- dcbtst br0,r4 ; Touch in the first sink line
- cmplwi cr1,r5,32 ; Are we moving more than a line?
- cror noncache,noncache,cr7_lt ; Set to not DCBZ output line if not enough space
- blt- fwdovrlap ; This is a forward overlapping area, handle it...
+LEXT(memcpy_nop_if_32bit)
+ bne++ copyit64 ; handle 64-bit processor (patched to NOP if 32-bit processor)
+ beqlr- ; exit if to==from
+
+; Here to copy on 32-bit processors.
;
-; R4 = sink
-; R5 = length
-; R6 = source
-;
-
+; When we move the memory, forward overlays must be handled. We
+; also can not use the cache instructions if we are from bcopy_nc.
+; We need to preserve R3 because it needs to be returned for memcpy.
+; We can be interrupted and lose control here.
;
-; Here we figure out how much we have to move to get the sink onto a
-; cache boundary. If we can, and there are still more that 32 bytes
-; left to move, we can really speed things up by DCBZing the sink line.
-; We can not do this if noncache is set because we will take an
-; alignment exception.
-
-G4word: ; enter from 64-bit case with word aligned uncached operands
- neg r0,r4 ; Get the number of bytes to move to align to a line boundary
- rlwinm. r0,r0,0,27,31 ; Clean it up and test it
- and r0,r0,r8 ; limit to the maximum front end move
- mtcrf 3,r0 ; Make branch mask for partial moves
- sub r5,r5,r0 ; Set the length left to move
+; When entered:
+; r4 = destination
+; r5 = length (>0)
+; r6 = source
+; r12 = (dest - source)
+; cr5 = noncache flag
+
+copyit32: ; WARNING! can drop down to this label
+ cmplw cr1,r12,r5 ; must move reverse if (dest-source)<length
+ cntlzw r11,r5 ; get magnitude of length
+ dcbt 0,r6 ; start to touch in source
+ lis r10,hi16(0x80000000) ; get 0x80000000
+ neg r9,r4 ; start to get alignment for destination
+ dcbtst 0,r4 ; start to touch in destination
+ sraw r8,r10,r11 ; get mask based on operand length, to limit alignment
+ blt- cr1,reverse32bit ; reverse move required
+
+; Forward moves on 32-bit machines, also word aligned uncached ops on 64-bit machines.
+; NOTE: we never do an unaligned access if the source and destination are "relatively"
+; word aligned. We depend on this in the uncached case on 64-bit processors.
+; r4 = destination
+; r5 = length (>0)
+; r6 = source
+; r8 = inverse of largest mask smaller than operand length
+; r9 = neg(dest), used to compute alignment
+; cr5 = noncache flag
+
+forward32bit: ; enter from 64-bit CPUs with word aligned uncached operands
+ rlwinm r7,r9,0,0x1F ; get bytes to 32-byte-align destination
+ andc. r0,r7,r8 ; limit to the maximum front end move
+ mtcrf 0x01,r0 ; move length to cr6 and cr7 one cr at a time...
beq alline ; Already on a line...
+ mtcrf 0x02,r0 ; ...since moving more than one is slower on G4 and G5
+ sub r5,r5,r0 ; Set the length left to move
+
bf 31,alhalf ; No single byte to do...
lbz r7,0(r6) ; Get the byte
addi r6,r6,1 ; Point to the next
; Sink is line aligned here
alline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move
- mtcrf 3,r5 ; Make branch mask for backend partial moves
- rlwinm r11,r5,0,0,26 ; Get number of bytes we are going to move
+ mtcrf 0x02,r5 ; move length to cr6 and cr7 one cr at a time...
+ mtcrf 0x01,r5 ; ...since moving more than one is slower on G4 and G5
beq- backend ; No full lines to move
-
- sub r5,r5,r11 ; Calculate the residual
- li r10,96 ; Stride for touch ahead
-
-nxtline: subic. r0,r0,1 ; Account for the line now
-
+
+ mtctr r0 ; set up loop count
+ li r0,96 ; Stride for touch ahead
+ b nxtline
+
+ .align 4
+nxtline:
+ lwz r2,0(r6) ; Get the first word
+ lwz r5,4(r6) ; Get the second word
+ lwz r7,8(r6) ; Get the third word
+ lwz r8,12(r6) ; Get the fourth word
+ lwz r9,16(r6) ; Get the fifth word
+ lwz r10,20(r6) ; Get the sixth word
+ lwz r11,24(r6) ; Get the seventh word
+ lwz r12,28(r6) ; Get the eighth word
bt- noncache,skipz ; Skip if we are not cached...
- dcbz br0,r4 ; Blow away the whole line because we are replacing it
- dcbt r6,r10 ; Touch ahead a bit
-
-skipz: lwz r7,0(r6) ; Get the first word
- lwz r8,4(r6) ; Get the second word
- lwz r9,8(r6) ; Get the third word
- stw r7,0(r4) ; Save the first word
- lwz r11,12(r6) ; Get the fourth word
- stw r8,4(r4) ; Save the second word
- lwz r7,16(r6) ; Get the fifth word
- stw r9,8(r4) ; Save the third word
- lwz r8,20(r6) ; Get the sixth word
- stw r11,12(r4) ; Save the fourth word
- lwz r9,24(r6) ; Get the seventh word
- stw r7,16(r4) ; Save the fifth word
- lwz r11,28(r6) ; Get the eighth word
+ dcbz 0,r4 ; Blow away the whole line because we are replacing it
+ dcbt r6,r0 ; Touch ahead a bit
+skipz:
addi r6,r6,32 ; Point to the next
- stw r8,20(r4) ; Save the sixth word
- stw r9,24(r4) ; Save the seventh word
- stw r11,28(r4) ; Save the eighth word
+ stw r2,0(r4) ; Save the first word
+ stw r5,4(r4) ; Save the second word
+ stw r7,8(r4) ; Save the third word
+ stw r8,12(r4) ; Save the fourth word
+ stw r9,16(r4) ; Save the fifth word
+ stw r10,20(r4) ; Save the sixth word
+ stw r11,24(r4) ; Save the seventh word
+ stw r12,28(r4) ; Save the eighth word
addi r4,r4,32 ; Bump sink
- bgt+ nxtline ; Do the next line, if any...
+ bdnz+ nxtline ; Do the next line, if any...
; Move backend quadword
-backend: bf 27,noquad ; No quad to do...
+backend: ; Join here from "shortcopy" for forward moves <32 bytes
+ bf 27,noquad ; No quad to do...
lwz r7,0(r6) ; Get the first word
lwz r8,4(r6) ; Get the second word
lwz r9,8(r6) ; Get the third word
; Move backend byte
-nohalf: bf 31,bcpydone ; Leave cuz we are all done...
+nohalf: bflr 31 ; Leave cuz we are all done...
lbz r7,0(r6) ; Get the byte
stb r7,0(r4) ; Save the single
-
-bcpydone:
- mfmsr r9 ; Get the MSR
- bf++ flipcache,bcpydone0 ; (HACK) No need to mess with caching...
-
- li r0,1 ; (HACK) Get a 1
- mfxer r10 ; (HACK GLORIOUS HACK) Get the entry EE
- sldi r0,r0,32+8 ; (HACK) Get the right bit to turn off caching
- mfspr r2,hid4 ; (HACK) Get HID4
- rlwinm r10,r10,31-MSR_EE_BIT,MSR_EE_BIT,MSR_EE_BIT ; (HACK GLORIOUS HACK) Set the EE bit
- andc r2,r2,r0 ; (HACK) Clear bit to make real accesses cache-inhibited
- or r9,r9,r10 ; (HACK GLORIOUS HACK) Set the EE in MSR
- sync ; (HACK) Sync up
- mtspr hid4,r2 ; (HACK) Make real accesses not cache-inhibited
- isync ; (HACK) Toss prefetches
-
- lis r12,0xE000 ; (HACK) Get the unlikeliest ESID possible
- srdi r12,r12,1 ; (HACK) Make 0x7FFFFFFFF0000000
- slbie r12 ; (HACK) Make sure the ERAT is cleared
-
- mtmsr r9 ; (HACK GLORIOUS HACK) Set EE properly
-
-bcpydone0:
- lis r0,hi16(MASK(MSR_VEC)) ; Get the vector bit
- ori r0,r0,lo16(MASK(MSR_FP)) ; Get the float bit
- bf++ fixxlate,bcpydone1 ; skip if we do not need to fix translation...
- ori r9,r9,lo16(MASK(MSR_DR)) ; Turn data translation on
- andc r9,r9,r0 ; Make sure that FP and VEC are off
- mtmsr r9 ; Just do it
- isync ; Hang in there
-
-bcpydone1:
- bflr++ restorex ; done if we do not have to fix up addressing
- mfsprg r8,2 ; get the feature flags again
- mtcrf 0x02,r8 ; put pf64Bit where we can test it
- bt++ pf64Bitb,bcpydone2 ; skip if 64-bit processor
-
- ; 32-bit processor, so clear out the BATs we set up for bcopy_physvir
-
- li r0,0 ; Get set to invalidate upper half
- sync ; Make sure all is well
- mtdbatu 0,r0 ; Clear sink upper DBAT
- mtdbatu 1,r0 ; Clear source upper DBAT
- sync
- isync
- blr
-
- ; 64-bit processor, so turn off 64-bit mode we turned on to do bcopy_phys
-
-bcpydone2:
- mfmsr r9 ; get MSR again
- andc r9,r9,r0 ; Make sure that FP and VEC are off
- rldicl r9,r9,0,MSR_SF_BIT+1 ; clear SF
- mtmsrd r9
- isync
blr
-;
-; 0123456789ABCDEF0123456789ABCDEF
-; 0123456789ABCDEF0123456789ABCDEF
-; F
-; DE
-; 9ABC
-; 12345678
-; 123456789ABCDEF0
-; 0
+; Reverse moves on 32-bit machines, also reverse word aligned uncached moves on 64-bit machines.
+; NOTE: we never do an unaligned access if the source and destination are "relatively"
+; word aligned. We depend on this in the uncached case on 64-bit processors.
+; These are slower because we don't bother with dcbz. Fortunately, reverse moves are uncommon.
+; r4 = destination
+; r5 = length (>0)
+; r6 = source
+; r8 = inverse of largest mask smaller than operand length
+; cr5 = noncache flag (but we don't dcbz anyway)
-;
-; Here is where we handle a forward overlapping move. These will be slow
-; because we can not kill the cache of the destination until after we have
-; loaded/saved the source area. Also, because reading memory backwards is
-; slower when the cache line needs to be loaded because the critical
-; doubleword is loaded first, i.e., the last, then it goes back to the first,
-; and on in order. That means that when we are at the second to last DW we
-; have to wait until the whole line is in cache before we can proceed.
-;
-
-G4reverseWord: ; here from 64-bit code with word aligned uncached operands
-fwdovrlap: add r4,r5,r4 ; Point past the last sink byte
+reverse32bit: ; here from 64-bit code with word aligned uncached operands
+ add r4,r5,r4 ; Point past the last sink byte
add r6,r5,r6 ; Point past the last source byte
- and r0,r4,r8 ; Apply movement limit
- li r12,-1 ; Make sure we touch in the actual line
- mtcrf 3,r0 ; Figure out the best way to move backwards
+ rlwinm r7,r4,0,0x1F ; Calculate the length to align dest on cache boundary
+ li r12,-1 ; Make sure we touch in the actual line
+ andc. r0,r7,r8 ; Apply movement limit
dcbt r12,r6 ; Touch in the last line of source
- rlwinm. r0,r0,0,27,31 ; Calculate the length to adjust to cache boundary
+ mtcrf 0x01,r0 ; move length to cr6 and cr7 one cr at a time...
dcbtst r12,r4 ; Touch in the last line of the sink
- beq- balline ; Aready on cache line boundary
+ mtcrf 0x02,r0 ; ...since moving more than one is slower on G4 and G5
+ beq- balline ; Aready on cache line boundary (or too short to bother)
sub r5,r5,r0 ; Precaculate move length left after alignment
; Sink is line aligned here
balline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move
- mtcrf 3,r5 ; Make branch mask for backend partial moves
+ mtcrf 0x02,r5 ; move length to cr6 and cr7 one cr at a time...
+ mtcrf 0x01,r5 ; ...since moving more than one is slower on G4 and G5
beq- bbackend ; No full lines to move
-
-
-; Registers in use: R0, R1, R3, R4, R5, R6
-; Registers not in use: R2, R7, R8, R9, R10, R11, R12 - Ok, we can make another free for 8 of them
+ mtctr r0 ; set up loop count
+ b bnxtline
-bnxtline: subic. r0,r0,1 ; Account for the line now
-
+ .align 4
+bnxtline:
lwz r7,-32(r6) ; Get the first word
lwz r5,-28(r6) ; Get the second word
lwz r2,-24(r6) ; Get the third word
subi r6,r6,32 ; Point to the next
stw r7,-32(r4) ; Get the first word
- ble- bnotouch ; Last time, skip touch of source...
- dcbt br0,r6 ; Touch in next source line
-
-bnotouch: stw r5,-28(r4) ; Get the second word
+ stw r5,-28(r4) ; Get the second word
stw r2,-24(r4) ; Get the third word
stw r12,-20(r4) ; Get the third word
stw r11,-16(r4) ; Get the fifth word
stw r8,-4(r4) ; Get the eighth word
subi r4,r4,32 ; Bump sink
- bgt+ bnxtline ; Do the next line, if any...
+ bdnz+ bnxtline ; Do the next line, if any...
;
; Note: We touched these lines in at the beginning
; Move backend quadword
-bbackend: bf 27,bnoquad ; No quad to do...
+bbackend: ; Join here from "shortcopy" for reverse moves of <32 bytes
+ bf 27,bnoquad ; No quad to do...
lwz r7,-16(r6) ; Get the first word
lwz r8,-12(r6) ; Get the second word
lwz r9,-8(r6) ; Get the third word
; Move backend byte
-bnohalf: bf 31,bcpydone ; Leave cuz we are all done...
+bnohalf: bflr 31 ; Leave cuz we are all done...
lbz r7,-1(r6) ; Get the byte
stb r7,-1(r4) ; Save the single
-
- b bcpydone ; Go exit cuz we are all done...
+ blr
// Here on 64-bit processors, which have a 128-byte cache line. This can be
// r4 = destination (32 or 64-bit ptr)
// r5 = length (always 32 bits)
// r6 = source (32 or 64-bit ptr)
-// cr5 = noncache, fixxlate, flipcache, and restorex flags set
+// r12 = (dest - source), reverse move required if (dest-source)<length
+// cr5 = noncache flag
.align 5
copyit64:
- lis r2,0x4000 // r2 = 0x00000000 40000000
- neg r12,r4 // start to compute #bytes to align dest
- bt-- noncache,noncache1 // (HACK) Do not even try anything cached...
+ rlwinm r7,r5,0,0,31 // truncate length to 32-bit, in case we're running in 64-bit mode
+ cntlzw r11,r5 // get magnitude of length
dcbt 0,r6 // touch in 1st block of source
-noncache1:
- add. r2,r2,r2 // if 0x00000000 80000000 < 0, we are in 32-bit mode
- cntlzw r9,r5 // get highest power-of-2 in length
- rlwinm r7,r12,0,25,31 // r7 <- bytes to 128-byte align dest
- bt-- noncache,noncache2 // (HACK) Do not even try anything cached...
dcbtst 0,r4 // touch in 1st destination cache block
-noncache2:
- sraw r2,r2,r9 // get mask with 1s for leading 0s in length, plus 1 more 1-bit
- bge copyit64a // skip if we are running in 64-bit mode
- rlwinm r4,r4,0,0,31 // running in 32-bit mode, so truncate ptrs and lengths to 32 bits
- rlwinm r5,r5,0,0,31
- rlwinm r6,r6,0,0,31
-copyit64a: // now we can use 64-bit compares even if running in 32-bit mode
- sub r8,r4,r6 // get (dest-source)
- andc r7,r7,r2 // limit bytes to align by operand length
- cmpld cr1,r8,r5 // if (dest-source)<length, must move reverse
- bt-- noncache,c64uncached // skip if uncached
- blt-- cr1,c64rdouble // handle cached reverse moves
-
+ subc r7,r12,r7 // set Carry if (dest-source)>=length, in mode-independent way
+ li r0,0 // get a 0
+ lis r10,hi16(0x80000000)// get 0x80000000
+ addze. r0,r0 // set cr0 on carry bit (beq if reverse move required)
+ neg r9,r4 // start to get alignment for destination
+ sraw r8,r10,r11 // get mask based on operand length, to limit alignment
+ bt-- noncache,c64uncached// skip if uncached
+ beq-- c64rdouble // handle cached reverse moves
+
// Forward, cached or doubleword aligned uncached. This is the common case.
-// r4-r6 = dest, length, source (as above)
-// r7 = #bytes 128-byte align dest (limited by copy length)
-// cr5 = flags, as above
+// NOTE: we never do an unaligned access if the source and destination are "relatively"
+// doubleword aligned. We depend on this in the uncached case.
+// r4 = destination
+// r5 = length (>0)
+// r6 = source
+// r8 = inverse of largest mask smaller than operand length
+// r9 = neg(dest), used to compute alignment
+// cr5 = noncache flag
c64double:
+ rlwinm r7,r9,0,0x7F // get #bytes to 128-byte align destination
+ andc r7,r7,r8 // limit by operand length
andi. r8,r7,7 // r8 <- #bytes to doubleword align
srwi r9,r7,3 // r9 <- #doublewords to 128-byte align
sub r5,r5,r7 // adjust length remaining
addi r4,r4,1
bdnz c64double1
-c64double2: // r9/cr1=doublewords, r10=128-byte chunks, cr7=blt if r5==0
+c64double2: // r9/cr1=doublewords, r10/cr7=128-byte chunks
beq cr1,c64double4 // no doublewords to xfer in order to cache align
mtctr r9
b c64double3
addi r4,r4,8
bdnz c64double3
-// Here to xfer 128-byte chunks, if any. Because the IBM 970 cannot issue two stores/cycle,
-// we pipeline the inner loop so we can pair loads and stores. Since we only have 8 GPRs for
+// Here to xfer 128-byte chunks, if any. Since we only have 8 GPRs for
// data (64 bytes), we load/store each twice per 128-byte chunk.
c64double4: // r10/cr7=128-byte chunks
rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords, after moving chunks
cmpwi cr1,r0,0 // set cr1 on leftover doublewords
beq cr7,c64double7 // no 128-byte chunks
- sub r8,r6,r4 // r8 <- (source - dest)
- li r9,128 // start at next cache line (we've already touched in 1st line)
- cmpldi cr7,r8,128 // if (source-dest)<128, cannot use dcbz128 beacause of overlap
- cror noncache,cr7_lt,noncache // turn on "noncache" flag if (source-dest)<128
- bt-- noncache,noncache3 // (HACK) Skip cache touch if noncachable
- dcbt128 r9,r6,1 // start forward stream
-noncache3:
- mtctr r10
- ld r0,0(r6) // start pipe: load 1st half-line
- ld r2,8(r6)
- ld r7,16(r6)
- ld r8,24(r6)
- ld r9,32(r6)
- ld r10,40(r6)
- ld r11,48(r6)
- ld r12,56(r6)
- b c64InnerLoopEntryPt
+ ; We must check for (source-dest)<128 in a mode-independent way. If within 128 bytes,
+ ; turn on "noncache" because we cannot use dcbz128 even if operands are cacheable.
+ sub r8,r6,r4 // r8 <- (source - dest)
+ rldicr. r0,r8,0,63-7 // zero low 7 bits and check for 0, mode independent
+ cror noncache,cr0_eq,noncache // turn on "noncache" flag if (source-dest)<128
+ mtctr r10
+ b c64InnerLoop
+
.align 5 // align inner loop
c64InnerLoop: // loop copying 128-byte cache lines to 128-aligned destination
- std r0,64(r4) // store 2nd half of chunk n
- ld r0,0(r6) // load 1st half of chunk n+1
- std r2,72(r4)
+ ld r0,0(r6) // start pipe: load 1st half-line
ld r2,8(r6)
- std r7,80(r4)
ld r7,16(r6)
- std r8,88(r4)
ld r8,24(r6)
- std r9,96(r4)
ld r9,32(r6)
- std r10,104(r4)
ld r10,40(r6)
- std r11,112(r4)
ld r11,48(r6)
- std r12,120(r4)
ld r12,56(r6)
- addi r4,r4,128 // advance to next dest chunk
-c64InnerLoopEntryPt: // initial entry into loop, with 1st halfline loaded
bt noncache,c64InnerLoop1 // skip if uncached or overlap
dcbz128 0,r4 // avoid prefetch of next cache line
c64InnerLoop1:
- std r0,0(r4) // store 1st half of chunk n
- ld r0,64(r6) // load 2nd half of chunk n
+
+ std r0,0(r4)
std r2,8(r4)
- ld r2,72(r6)
std r7,16(r4)
- ld r7,80(r6)
std r8,24(r4)
- ld r8,88(r6)
std r9,32(r4)
- ld r9,96(r6)
std r10,40(r4)
- ld r10,104(r6)
std r11,48(r4)
- ld r11,112(r6)
std r12,56(r4)
- ld r12,120(r6)
- addi r6,r6,128 // advance to next source chunk if any
- bdnz c64InnerLoop // loop if more chunks
- std r0,64(r4) // store 2nd half of last chunk
+ ld r0,64(r6) // load 2nd half of chunk
+ ld r2,72(r6)
+ ld r7,80(r6)
+ ld r8,88(r6)
+ ld r9,96(r6)
+ ld r10,104(r6)
+ ld r11,112(r6)
+ ld r12,120(r6)
+ addi r6,r6,128
+
+ std r0,64(r4)
std r2,72(r4)
std r7,80(r4)
std r8,88(r4)
std r12,120(r4)
addi r4,r4,128 // advance to next dest chunk
+ bdnz c64InnerLoop // loop if more chunks
+
+
c64double7: // r5 <- leftover bytes, cr1 set on doubleword count
rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords (0-15)
andi. r5,r5,7 // r5/cr0 <- count of leftover bytes (0-7)
// Forward byte loop.
c64byte: // r5/cr0 <- byte count (can be big if unaligned uncached)
- beq bcpydone // done if no leftover bytes
+ beqlr // done if no leftover bytes
mtctr r5
b c64byte1
addi r4,r4,1
bdnz c64byte1
- b bcpydone
+ blr
// Uncached copies. We must avoid unaligned accesses, since they always take alignment
// exceptions on uncached memory on 64-bit processors. This may mean we copy long operands
// a byte at a time, but that is still much faster than alignment exceptions.
-// r4-r6 = dest, length, source (as above)
-// r2 = mask of 1s for leading 0s in length, plus 1 extra 1
-// r7 = #bytes to copy to 128-byte align dest (limited by operand length)
-// cr1 = blt if reverse move required
+// r4 = destination
+// r5 = length (>0)
+// r6 = source
+// r8 = inverse of largest mask smaller than operand length
+// r9 = neg(dest), used to compute alignment
+// r12 = (dest-source), used to test relative alignment
+// cr0 = beq if reverse move required
+// cr5 = noncache flag
c64uncached:
- xor r0,r6,r4 // get relative alignment
- rlwinm r10,r0,0,29,31 // relatively doubleword aligned?
- rlwinm r11,r0,0,30,31 // relatively word aligned?
- not r8,r2 // get mask to limit initial length of copy for G4word
- blt cr1,c64reverseUncached
-
- cmpwi cr0,r10,0 // set cr0 beq if doubleword aligned
+ rlwinm r10,r12,0,29,31 // relatively doubleword aligned?
+ rlwinm r11,r12,0,30,31 // relatively word aligned?
+ cmpwi cr7,r10,0 // set cr7 beq if doubleword aligned
cmpwi cr1,r11,0 // set cr1 beq if word aligned
- beq cr0,c64double // doubleword aligned
- beq cr1,G4word // word aligned, use G3/G4 code
+ beq-- c64reverseUncached
+
+ beq cr7,c64double // doubleword aligned
+ beq cr1,forward32bit // word aligned, use G3/G4 code
cmpwi r5,0 // set cr0 on byte count
b c64byte // unaligned operands
c64reverseUncached:
- cmpwi cr0,r10,0 // set cr0 beq if doubleword aligned
- cmpwi cr1,r11,0 // set cr1 beq if word aligned
- beq cr0,c64rdouble // doubleword aligned so can use LD/STD
- beq cr1,G4reverseWord // word aligned, use G3/G4 code
+ beq cr7,c64rdouble // doubleword aligned so can use LD/STD
+ beq cr1,reverse32bit // word aligned, use G3/G4 code
add r6,r6,r5 // point to (end+1) of source and dest
add r4,r4,r5
cmpwi r5,0 // set cr0 on length
// Reverse doubleword copies. This is used for all cached copies, and doubleword
// aligned uncached copies.
-// r4 = destination (32 or 64-bit ptr)
-// r5 = length (always 32 bits)
-// r6 = source (32 or 64-bit ptr)
-// cr5 = noncache, fixxlate, and restorex flags set
+// r4 = destination
+// r5 = length (>0)
+// r6 = source
+// r8 = inverse of largest mask of low-order 1s smaller than operand length
+// cr5 = noncache flag
c64rdouble:
add r6,r6,r5 // point to (end+1) of source and dest
add r4,r4,r5
- rlwinm. r7,r4,0,29,31 // r7 <- #bytes to doubleword align dest
- cmplw cr1,r7,r5 // operand long enough to doubleword align?
- blt cr1,c64rd0 // yes
- mr r7,r5 // no
-c64rd0:
+ rlwinm r7,r4,0,29,31 // r7 <- #bytes to doubleword align dest
+ andc. r7,r7,r8 // limit by operand length
sub r5,r5,r7 // adjust length
srwi r8,r5,6 // r8 <- 64-byte chunks to xfer
cmpwi cr1,r8,0 // any chunks?
andi. r5,r5,7 // r5/cr0 <- count of leftover bytes
cmpwi cr7,r0,0 // leftover doublewords?
beq cr1,c64rd4 // no chunks to xfer
- li r9,-128 // start at next cache line
mtctr r8
- bt noncache,c64rd3 // (HACK) Do not start a stream if noncachable...
- dcbt128 r9,r6,3 // start reverse stream
b c64rd3
.align 5 // align inner loop
// Reverse byte loop.
c64rbyte: // r5/cr0 <- byte count (can be big if unaligned uncached)
- beq bcpydone // done if no leftover bytes
+ beqlr // done if no leftover bytes
mtctr r5
c64rbyte1:
stbu r0,-1(r4)
bdnz c64rbyte1
- b bcpydone
+ blr