/*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved.
*
- * @APPLE_LICENSE_HEADER_START@
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
- * The contents of this file constitute Original Code as defined in and
- * are subject to the Apple Public Source License Version 1.1 (the
- * "License"). You may not use this file except in compliance with the
- * License. Please obtain a copy of the License at
- * http://www.apple.com/publicsource and read it before using this file.
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
*
- * This Original Code and all software distributed under the License are
- * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
- * License for the specific language governing rights and limitations
- * under the License.
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
*
- * @APPLE_LICENSE_HEADER_END@
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
;
-; Copy bytes of data around. handles overlapped data.
-;
-; Change this to use Altivec later on, and maybe floating point.
+; Copy bytes of data around. Handles overlapped data.
;
;
#include <ppc/asm.h>
#include <ppc/proc_reg.h>
+#include <assym.s>
-; Use CR5_lt to indicate non-cached
+; These routines use CR5 for certain flags:
+; Use CR5_lt to indicate non-cached (in bcopy and memcpy)
#define noncache 20
-; Use CR5_gt to indicate that we need to turn data translation back on
-#define fixxlate 21
-; Use CR5_eq to indicate that we need to invalidate bats
-#define killbats 22
-;
-; bcopy_nc(from, to, nbytes)
-;
-; bcopy_nc operates on non-cached memory so we can not use any kind
-; of cache instructions.
-;
- .align 5
- .globl EXT(bcopy_nc)
+; The bcopy_phys variants use a stack frame so they can call bcopy as a subroutine.
+#define BCOPY_SF_SIZE 32 // total size
+#define BCOPY_SF_MSR 16 // we save caller's MSR here (possibly minus VEC and FP)
-LEXT(bcopy_nc)
-
- crset noncache ; Set non-cached
- b bcpswap
-;
-; void bcopy_physvir(from, to, nbytes)
+#define kShort 32 // short operands are special cased
+
+
+; void bcopy_physvir_32(from, to, nbytes)
+;
; Attempt to copy physically addressed memory with translation on if conditions are met.
-; Otherwise do a normal bcopy_phys.
+; Otherwise do a normal bcopy_phys. This routine is used because some 32-bit processors
+; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
+; for the passed phys addrs and do the copy with translation on.
;
-; Rules are: neither source nor destination can cross a page.
-; No accesses above the 2GB line (I/O or ROM).
+; Rules are: - neither source nor destination can cross a page.
+; - Interrupts must be disabled when this routine is called.
+; - Translation must be on when called.
;
-; Interrupts must be disabled throughout the copy when this is called
-
-; To do this, we build a
-; 128 DBAT for both the source and sink. If both are the same, only one is
-; loaded. We do not touch the IBATs, so there is no issue if either physical page
+; To do the copy, we build a 128KB DBAT for both the source and sink. If both are the same, only one
+; is loaded. We do not touch the IBATs, so there is no issue if either physical page
; address is the same as the virtual address of the instructions we are executing.
;
-; At the end, we invalidate the used DBATs and reenable interrupts.
+; At the end, we invalidate the used DBATs.
;
-; Note, this one will not work in user state
-;
+; Note that the address parameters are long longs. We will transform these to 64-bit
+; values. On 32-bit architectures this ignores the high half of the passed-in value,
+; which should be OK since we cannot have addresses bigger than 32 bits there anyhow.
+;
+; Note also that this routine is used only on 32-bit machines. If you're contemplating use
+; on a 64-bit processor, use the physical memory window instead; please refer to copypv()
+; for an example of how this is done.
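+;
+; Roughly, in C-like pseudocode (an illustrative sketch only; crosses_page(), setup_dbat()
+; and clear_dbats() are hypothetical stand-ins for the mtdbatl/mtdbatu sequences below,
+; not real routines):
+;
+;	void bcopy_physvir_32(addr64_t src, addr64_t dst, unsigned int len) {
+;		if (crosses_page(src, len) || crosses_page(dst, len)) {
+;			bcopy_phys(src, dst, len);	// fall back to a real-mode copy
+;			return;
+;		}
+;		setup_dbat(1, src);			// temporary 128KB DBAT over the source page
+;		setup_dbat(0, dst);			// and another over the sink (skipped if same page)
+;		sync(); isync();
+;		bcopy((void *)src, (void *)dst, len);	// copy with data translation on
+;		clear_dbats();				// invalidate the upper halves of DBATs 0 and 1
+;	}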
.align 5
- .globl EXT(bcopy_physvir)
-
-LEXT(bcopy_physvir)
-
- addic. r0,r5,-1 ; Get length - 1
+ .globl EXT(bcopy_physvir_32)
+
+LEXT(bcopy_physvir_32)
+ mflr r0 ; get return address
+ rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
+ mfsprg r8,2 ; get processor feature flags
+ stw r0,8(r1) ; save return address
+ rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
+ stwu r1,-BCOPY_SF_SIZE(r1) ; push on a stack frame so we can call bcopy
+ mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
+ subi r0,r7,1 ; get length - 1
+ rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
add r11,r3,r0 ; Point to last byte of sink
- cmplw cr1,r3,r4 ; Does source == sink?
+ mr r5,r7 ; Get the length into the right register
+ rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
+
+; This test for page overflow may not work if the length is negative. Negative lengths are invalid input
+; to bcopy_physvir_32() on 32-bit machines, and will result in a panic.
+
add r12,r4,r0 ; Point to last byte of source
- bltlr- ; Bail if length is 0 or way too big
xor r7,r11,r3 ; See if we went to next page
xor r8,r12,r4 ; See if we went to next page
or r0,r7,r8 ; Combine wrap
- li r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2) ; Set default attributes
+// li r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2) ; Set default attributes
+ li r9,((2<<3)|2) ; Set default attributes
rlwinm. r0,r0,0,0,19 ; Did we overflow a page?
li r7,2 ; Set validity flags
li r8,2 ; Set validity flags
- bne- EXT(bcopy_phys) ; Overflowed page, do normal physical copy...
+ bne- bcopy_phys1 ; Overflowed page, do normal physical copy...
- crset killbats ; Remember to trash BATs on the way out
rlwimi r11,r9,0,15,31 ; Set sink lower DBAT value
rlwimi r12,r9,0,15,31 ; Set source lower DBAT value
rlwimi r7,r11,0,0,14 ; Set sink upper DBAT value
mtdbatl 1,r12 ; Set source lower DBAT
mtdbatu 1,r8 ; Set source upper DBAT
-
-bcpvsame: mr r6,r3 ; Set source
- crclr noncache ; Set cached
-
- b copyit ; Go copy it...
+
+bcpvsame:
+ sync ; wait for the BATs to stabilize
+ isync
+
+ bl EXT(bcopy) ; BATs set up, args in r3-r5, so do the copy with DR on
+
+ li r0,0 ; Get set to invalidate upper half of BATs
+ sync ; Make sure all is well
+ mtdbatu 0,r0 ; Clear sink upper DBAT
+ mtdbatu 1,r0 ; Clear source upper DBAT
+ sync
+ isync
+
+ lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address
+ addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame
+ mtlr r0
+ blr
-;
; void bcopy_phys(from, to, nbytes)
-; Turns off data translation before the copy. Note, this one will
-; not work in user state
;
+; Turns off data translation before the copy. This one will not work in user state.
+; This routine is used on 32 and 64-bit machines.
+;
+; Note that the address parameters are long longs. We will transform these to 64-bit
+; values. On 32-bit architectures this ignores the high half of the passed-in value,
+; which should be OK since we cannot have addresses bigger than 32 bits there anyhow.
+;
+; Also note that you probably will not be happy if either the sink or source spans across the
+; boundary between RAM and I/O space. There is a good chance of hanging the machine, and this
+; code does not check for it, so be careful.
+;
+; NOTE: when called, translation must be on, and we must be in 32-bit mode.
+; Interrupts may or may not be disabled.
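+;
+; In outline (an illustrative sketch only; the msr_*() helpers are hypothetical stand-ins
+; for the mfmsr/mtmsr/mtmsrd sequences below):
+;
+;	void bcopy_phys(addr64_t src, addr64_t dst, unsigned int len) {
+;		unsigned long long saved = msr_get();		// caller's MSR
+;		msr_clear(MSR_DR | MSR_VEC | MSR_FP);		// translation, vectors, FP off
+;		if (cpu_is_64bit())
+;			msr_set(MSR_SF);			// 64-bit addressing for the copy
+;		bcopy((void *)src, (void *)dst, len);		// on 64-bit CPUs, bcopy_nc() if an operand is in I/O space
+;		msr_restore(saved);				// translation back on before returning
+;	}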
.align 5
.globl EXT(bcopy_phys)
LEXT(bcopy_phys)
-
+ mflr r0 ; get return address
+ rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
+ stw r0,8(r1) ; save
+ mfsprg r8,2 ; get processor feature flags
+ stwu r1,-BCOPY_SF_SIZE(r1) ; push on a stack frame so we can call bcopy
+ rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
+ rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
+ mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
+ rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
+ mr r5,r7 ; Get the length into the right register
+
+bcopy_phys1: ; enter from bcopy_physvir with pf64Bit in cr6 and parms in r3-r5
mfmsr r9 ; Get the MSR
+ lis r6,hi16(MASK(MSR_VEC)) ; Get vector enable
+ ori r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR)) ; Add in FP and DR
+ andc r9,r9,r6 ; unconditionally turn DR, VEC, and FP off
+ bt++ pf64Bitb,bcopy_phys64 ; skip if 64-bit (only they take hint)
- crclr noncache ; Set cached
- rlwinm. r8,r9,0,MSR_DR_BIT,MSR_DR_BIT ; Is data translation on?
-
- cmplw cr1,r4,r3 ; Compare "to" and "from"
- cmplwi cr7,r5,0 ; Check if we have a 0 length
- mr r6,r3 ; Set source
- beqlr- cr1 ; Bail if "to" and "from" are the same
- xor r9,r9,r8 ; Turn off translation if it is on (should be)
- beqlr- cr7 ; Bail if length is 0
-
- rlwinm r9,r9,0,MSR_FP_BIT+1,MSR_FP_BIT-1 ; Force floating point off
- crclr killbats ; Make sure we do not trash BATs on the way out
- rlwinm r9,r9,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1 ; Force vectors off
- mtmsr r9 ; Set DR translation off
+; 32-bit CPUs
+
+ mtmsr r9 ; turn DR, FP, and VEC off
isync ; Wait for it
- crnot fixxlate,cr0_eq ; Remember to turn on translation if it was
- b copyit ; Go copy it...
+ bl EXT(bcopy) ; do the copy with translation off and caching on
+
+ mfmsr r9 ; Get the MSR
+ ori r9,r9,lo16(MASK(MSR_DR)) ; turn translation back on (but leave VEC and FP off)
+ mtmsr r9 ; restore msr
+ isync ; wait for it to happen
+ lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address once translation is back on
+ mtlr r0
+ addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame
+ blr
+
+
+; 64-bit: turn DR off and SF on.
+
+bcopy_phys64: ; r9 = MSR with DR, VEC, and FP off
+ ori r8,r9,lo16(MASK(MSR_DR)) ; make a copy with DR back on... this is what we return to caller
+ srdi r2,r3,31 ; Get a 1 if source is in I/O memory
+ li r0,1 ; Note - we use this in a couple places below
+ srdi r10,r4,31 ; Get a 1 if sink is in I/O memory
+ std r8,BCOPY_SF_MSR(r1) ; save caller's MSR so we remember whether EE was on
+ rldimi r9,r0,63,MSR_SF_BIT ; set SF on in MSR we will copy with
+ cmpldi cr0,r2,1 ; Is source in I/O memory?
+ cmpldi cr7,r10,1 ; Is sink in I/O memory?
+ mtmsrd r9 ; turn 64-bit addressing on, data translation off
+ isync ; wait for it to happen
+ cror cr7_eq,cr0_eq,cr7_eq ; See if either source or sink is in I/O area
+ beq-- cr7,io_space_real_mode_copy ; an operand is in I/O space
+
+ bl EXT(bcopy) ; do copy with DR off and SF on, cache enabled
+
+bcopy_phys64x:
+ mfmsr r9 ; Get the MSR we used to copy
+ rldicl r9,r9,0,MSR_SF_BIT+1 ; clear SF
+ ori r9,r9,lo16(MASK(MSR_DR)) ; turn translation back on
+ mtmsrd r9 ; turn 64-bit mode off, translation back on
+ isync ; wait for it to happen
+ lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address once translation is back on
+ ld r8,BCOPY_SF_MSR(r1) ; get caller's MSR once translation is back on
+ mtlr r0
+ mtmsrd r8,1 ; turn EE back on if necessary
+ addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame
+ blr
+
+; We need to copy with DR off, but one of the operands is in I/O space. To avoid wedging U3,
+; which cannot handle a cache burst in I/O space, we must turn caching off for the real memory access.
+; This can only be done by setting bits in HID4. We cannot lose control and execute random code in
+; this state, so we have to disable interrupts as well. This is an unpleasant hack.
+
+io_space_real_mode_copy: ; r0=1, r9=MSR we want to copy with
+ sldi r11,r0,31-MSR_EE_BIT ; Get a mask for the EE bit
+ sldi r0,r0,32+8 ; Get the right bit to turn off caching
+ andc r9,r9,r11 ; Turn off EE bit
+ mfspr r2,hid4 ; Get HID4
+ mtmsrd r9,1 ; Force off EE
+ or r2,r2,r0 ; Set bit to make real accesses cache-inhibited
+ sync ; Sync up
+ mtspr hid4,r2 ; Make real accesses cache-inhibited
+ isync ; Toss prefetches
+
+ lis r12,0xE000 ; Get the unlikeliest ESID possible
+ srdi r12,r12,1 ; Make 0x7FFFFFFFF0000000
+ slbie r12 ; Make sure the ERAT is cleared
+
+ sync
+ isync
+
+ bl EXT(bcopy_nc) ; copy with SF on and EE, DR, VEC, and FP off, cache inhibited
+
+ li r0,1 ; Get a 1
+ sldi r0,r0,32+8 ; Get the right bit to turn off caching
+ mfspr r2,hid4 ; Get HID4
+ andc r2,r2,r0 ; Clear bit so real accesses are no longer cache-inhibited
+ sync ; Sync up
+ mtspr hid4,r2 ; Make real accesses not cache-inhibited
+ isync ; Toss prefetches
+
+ lis r12,0xE000 ; Get the unlikeliest ESID possible
+ srdi r12,r12,1 ; Make 0x7FFFFFFFF0000000
+ slbie r12 ; Make sure the ERAT is cleared
+ b bcopy_phys64x
+
+;
+; shortcopy
+;
+; Special case short operands (<32 bytes), which are very common. Note that the check for
+; reverse vs normal moves isn't quite correct in 64-bit mode; in rare cases we will move in
+; reverse when it wasn't necessary to do so. This is OK, since performance of the two cases
+; is similar. We do get the direction right when it counts (ie, when the operands overlap.)
+; Also note that we use the G3/G4 "backend" code, even on G5. This is OK too, since G5 has
+; plenty of load/store dispatch bandwidth in this case, the extra ops are hidden by latency,
+; and using word instead of doubleword moves reduces the possibility of unaligned accesses,
+; which cost about 20 cycles if they cross a 32-byte boundary on G5. Finally, because we
+; might do unaligned accesses this code cannot be called from bcopy_nc().
+; r4 = destination
+; r5 = length (<32)
+; r6 = source
+; r12 = (dest - source)
+
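+; The "reverse if (dest-source) < length" test works because the subtraction is unsigned:
+; a forward (ascending) copy is only unsafe when the destination starts inside the source,
+; i.e. when 0 <= (dest - source) < length, and treating the difference as unsigned folds
+; both conditions into a single compare. An illustrative C model (not the kernel code):
+;
+;	void copy_model(char *dst, const char *src, unsigned long len) {
+;		if ((unsigned long)(dst - src) < len) {		// dst starts inside src
+;			while (len--)
+;				dst[len] = src[len];		// copy descending
+;		} else {
+;			unsigned long i;
+;			for (i = 0; i < len; i++)
+;				dst[i] = src[i];		// copy ascending
+;		}
+;	}
+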
+ .align 5
+shortcopy:
+ cmplw r12,r5 ; must move reverse if (dest-source)<length
+ mtcrf 2,r5 ; move length to cr6 and cr7 one at a time...
+ mtcrf 1,r5 ; ...which is faster on G4 and G5
+ bge++ backend ; handle forward moves (most common case)
+ add r6,r6,r5 ; point one past end of operands in reverse moves
+ add r4,r4,r5
+ b bbackend ; handle reverse moves
+
;
; void bcopy(from, to, nbytes)
;
+; NOTE: bcopy is called from copyin and copyout etc with the "thread_recover" ptr set.
+; This means bcopy must not set up a stack frame or touch non-volatile registers, and also means that it
+; cannot rely on turning off interrupts, because we expect to get DSIs and have execution aborted by a "longjmp"
+; to the thread_recover routine. What this means is that it would be hard to use vector or floating point
+; registers to accelerate the copy.
+;
+; NOTE: this code can be called in any of three "modes":
+; - on 32-bit processors (32-byte cache line)
+; - on 64-bit processors running in 32-bit mode (128-byte cache line)
+; - on 64-bit processors running in 64-bit mode (128-byte cache line)
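+;
+; The dispatch at the entry point, in outline (illustrative only; the 64-bit test is done by
+; patching the first bne++ below to a nop on 32-bit processors):
+;
+;	if (len < kShort)	goto shortcopy;		// operands under 32 bytes
+;	if (dst == src)		return;
+;	if (cpu_is_64bit())	goto copyit64;
+;	goto copyit32;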
.align 5
.globl EXT(bcopy)
+ .globl EXT(bcopy_nop_if_32bit)
LEXT(bcopy)
-
+ cmplwi cr1,r5,kShort ; less than 32 bytes?
+ sub. r12,r4,r3 ; test for to==from in mode-independent way, start fwd/rev check
+ mr r6,r3 ; Set source (must preserve r3 for memcpy return)
+ blt cr1,shortcopy ; special case short operands
crclr noncache ; Set cached
-
-bcpswap: cmplw cr1,r4,r3 ; Compare "to" and "from"
- mr. r5,r5 ; Check if we have a 0 length
- mr r6,r3 ; Set source
- crclr killbats ; Make sure we do not trash BATs on the way out
- beqlr- cr1 ; Bail if "to" and "from" are the same
- beqlr- ; Bail if length is 0
- crclr fixxlate ; Set translation already ok
- b copyit ; Go copy it...
-
+LEXT(bcopy_nop_if_32bit)
+ bne++ copyit64 ; handle 64-bit processor (patched to NOP if 32-bit processor)
+ bne+ copyit32 ; handle 32-bit processor
+ blr ; to==from so nothing to do
+
;
-; When we move the memory, forward overlays must be handled. We
-; also can not use the cache instructions if we are from bcopy_nc.
-; We need to preserve R3 because it needs to be returned for memcpy.
-; We can be interrupted and lose control here.
+; bcopy_nc(from, to, nbytes)
;
-; There is no stack, so in order to used floating point, we would
-; need to take the FP exception. Any potential gains by using FP
-; would be more than eaten up by this.
+; bcopy_nc() operates on non-cached memory so we can not use any kind of cache instructions.
+; Furthermore, we must avoid all unaligned accesses on 64-bit machines, since they take
+; alignment exceptions. Thus we cannot use "shortcopy", which could do unaligned lwz/stw.
+; Like bcopy(), bcopy_nc() can be called both in 32- and 64-bit mode.
+
+ .align 5
+ .globl EXT(bcopy_nc)
+ .globl EXT(bcopy_nc_nop_if_32bit)
+
+LEXT(bcopy_nc)
+ cmpwi cr1,r5,0 ; Check if we have a 0 length
+ sub. r12,r4,r3 ; test for to==from in mode-independent way, start fwd/rev check
+ mr r6,r3 ; Set source (must preserve r3 for memcpy return)
+ crset noncache ; Set non-cached
+ cror cr0_eq,cr1_eq,cr0_eq ; set cr0 beq if either length zero or to==from
+LEXT(bcopy_nc_nop_if_32bit)
+ bne++ copyit64 ; handle 64-bit processor (patched to NOP if 32-bit processor)
+ bne+ copyit32 ; handle 32-bit processor
+ blr ; either zero length or to==from
+
;
-; Later, we should used Altivec for large moves.
+; void* memcpy(to, from, nbytes)
+; void* memmove(to, from, nbytes)
;
-
+; memcpy() and memmove() are only called in 32-bit mode, albeit on both 32- and 64-bit processors.
+; However, they would work correctly if called in 64-bit mode.
+
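+; In effect (illustrative only):
+;
+;	void *memcpy(void *dst, const void *src, unsigned long len) {
+;		bcopy(src, dst, len);		// same copy engine, operands swapped
+;		return dst;			// r3 is preserved through the copy
+;	}
+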
.align 5
.globl EXT(memcpy)
+ .globl EXT(memmove)
+ .globl EXT(memcpy_nop_if_32bit)
LEXT(memcpy)
-
- cmplw cr1,r3,r4 ; "to" and "from" the same?
- mr r6,r4 ; Set the "from"
- mr. r5,r5 ; Length zero?
+LEXT(memmove)
+ cmplwi cr1,r5,kShort ; less than 32 bytes?
+ sub. r12,r3,r4 ; test for to==from in mode-independent way, start fwd/rev check
+ mr r6,r4 ; Set source
+ mr r4,r3 ; Set the "to" (must preserve r3 for return value)
+ blt cr1,shortcopy ; special case short operands
crclr noncache ; Set cached
- mr r4,r3 ; Set the "to"
- crclr fixxlate ; Set translation already ok
- beqlr- cr1 ; "to" and "from" are the same
- beqlr- ; Length is 0
- crclr killbats ; Make sure we do not trash BATs on the way out
-
-copyit: sub r12,r4,r6 ; Get potential overlap (negative if backward move)
- lis r8,0x7FFF ; Start up a mask
- srawi r11,r12,31 ; Propagate the sign bit
- dcbt br0,r6 ; Touch in the first source line
- cntlzw r7,r5 ; Get the highest power of 2 factor of the length
- ori r8,r8,0xFFFF ; Make limit 0x7FFFFFFF
- xor r9,r12,r11 ; If sink - source was negative, invert bits
- srw r8,r8,r7 ; Get move length limitation
- sub r9,r9,r11 ; If sink - source was negative, add 1 and get absolute value
- cmplw r12,r5 ; See if we actually forward overlap
- cmplwi cr7,r9,32 ; See if at least a line between source and sink
- dcbtst br0,r4 ; Touch in the first sink line
- cmplwi cr1,r5,32 ; Are we moving more than a line?
- cror noncache,noncache,28 ; Set to not DCBZ output line if not enough space
- blt- fwdovrlap ; This is a forward overlapping area, handle it...
+LEXT(memcpy_nop_if_32bit)
+ bne++ copyit64 ; handle 64-bit processor (patched to NOP if 32-bit processor)
+ beqlr- ; exit if to==from
+
+; Here to copy on 32-bit processors.
;
-; R4 = sink
-; R5 = length
-; R6 = source
-;
-
+; When we move the memory, overlapping forward moves must be handled. We
+; also cannot use the cache instructions if we came from bcopy_nc().
+; We need to preserve R3 because it must be returned for memcpy().
+; We can be interrupted and lose control here.
;
-; Here we figure out how much we have to move to get the sink onto a
-; cache boundary. If we can, and there are still more that 32 bytes
-; left to move, we can really speed things up by DCBZing the sink line.
-; We can not do this if noncache is set because we will take an
-; alignment exception.
-
- neg r0,r4 ; Get the number of bytes to move to align to a line boundary
- rlwinm. r0,r0,0,27,31 ; Clean it up and test it
- and r0,r0,r8 ; limit to the maximum front end move
- mtcrf 3,r0 ; Make branch mask for partial moves
- sub r5,r5,r0 ; Set the length left to move
+; When entered:
+; r4 = destination
+; r5 = length (>0)
+; r6 = source
+; r12 = (dest - source)
+; cr5 = noncache flag
+
+copyit32: ; WARNING! can drop down to this label
+ cmplw cr1,r12,r5 ; must move reverse if (dest-source)<length
+ cntlzw r11,r5 ; get magnitude of length
+ dcbt 0,r6 ; start to touch in source
+ lis r10,hi16(0x80000000) ; get 0x80000000
+ neg r9,r4 ; start to get alignment for destination
+ dcbtst 0,r4 ; start to touch in destination
+ sraw r8,r10,r11 ; get mask based on operand length, to limit alignment
+ blt- cr1,reverse32bit ; reverse move required
+
+; Forward moves on 32-bit machines, also word aligned uncached ops on 64-bit machines.
+; NOTE: we never do an unaligned access if the source and destination are "relatively"
+; word aligned. We depend on this in the uncached case on 64-bit processors.
+; r4 = destination
+; r5 = length (>0)
+; r6 = source
+; r8 = inverse of largest mask smaller than operand length
+; r9 = neg(dest), used to compute alignment
+; cr5 = noncache flag
+
+forward32bit: ; enter from 64-bit CPUs with word aligned uncached operands
+ rlwinm r7,r9,0,0x1F ; get bytes to 32-byte-align destination
+ andc. r0,r7,r8 ; limit to the maximum front end move
+ mtcrf 0x01,r0 ; move length to cr6 and cr7 one cr at a time...
beq alline ; Already on a line...
+ mtcrf 0x02,r0 ; ...since moving more than one is slower on G4 and G5
+ sub r5,r5,r0 ; Set the length left to move
+
bf 31,alhalf ; No single byte to do...
lbz r7,0(r6) ; Get the byte
addi r6,r6,1 ; Point to the next
; Sink is line aligned here
alline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move
- mtcrf 3,r5 ; Make branch mask for backend partial moves
- rlwinm r11,r5,0,0,26 ; Get number of bytes we are going to move
+ mtcrf 0x02,r5 ; move length to cr6 and cr7 one cr at a time...
+ mtcrf 0x01,r5 ; ...since moving more than one is slower on G4 and G5
beq- backend ; No full lines to move
-
- sub r5,r5,r11 ; Calculate the residual
- li r10,96 ; Stride for touch ahead
-
-nxtline: subic. r0,r0,1 ; Account for the line now
-
+
+ mtctr r0 ; set up loop count
+ li r0,96 ; Stride for touch ahead
+ b nxtline
+
+ .align 4
+nxtline:
+ lwz r2,0(r6) ; Get the first word
+ lwz r5,4(r6) ; Get the second word
+ lwz r7,8(r6) ; Get the third word
+ lwz r8,12(r6) ; Get the fourth word
+ lwz r9,16(r6) ; Get the fifth word
+ lwz r10,20(r6) ; Get the sixth word
+ lwz r11,24(r6) ; Get the seventh word
+ lwz r12,28(r6) ; Get the eighth word
bt- noncache,skipz ; Skip if we are not cached...
- dcbz br0,r4 ; Blow away the whole line because we are replacing it
- dcbt r6,r10 ; Touch ahead a bit
-
-skipz: lwz r7,0(r6) ; Get the first word
- lwz r8,4(r6) ; Get the second word
- lwz r9,8(r6) ; Get the third word
- stw r7,0(r4) ; Save the first word
- lwz r11,12(r6) ; Get the fourth word
- stw r8,4(r4) ; Save the second word
- lwz r7,16(r6) ; Get the fifth word
- stw r9,8(r4) ; Save the third word
- lwz r8,20(r6) ; Get the sixth word
- stw r11,12(r4) ; Save the fourth word
- lwz r9,24(r6) ; Get the seventh word
- stw r7,16(r4) ; Save the fifth word
- lwz r11,28(r6) ; Get the eighth word
+ dcbz 0,r4 ; Blow away the whole line because we are replacing it
+ dcbt r6,r0 ; Touch ahead a bit
+skipz:
addi r6,r6,32 ; Point to the next
- stw r8,20(r4) ; Save the sixth word
- stw r9,24(r4) ; Save the seventh word
- stw r11,28(r4) ; Save the eighth word
+ stw r2,0(r4) ; Save the first word
+ stw r5,4(r4) ; Save the second word
+ stw r7,8(r4) ; Save the third word
+ stw r8,12(r4) ; Save the fourth word
+ stw r9,16(r4) ; Save the fifth word
+ stw r10,20(r4) ; Save the sixth word
+ stw r11,24(r4) ; Save the seventh word
+ stw r12,28(r4) ; Save the eighth word
addi r4,r4,32 ; Bump sink
- bgt+ nxtline ; Do the next line, if any...
+ bdnz+ nxtline ; Do the next line, if any...
; Move backend quadword
-backend: bf 27,noquad ; No quad to do...
+backend: ; Join here from "shortcopy" for forward moves <32 bytes
+ bf 27,noquad ; No quad to do...
lwz r7,0(r6) ; Get the first word
lwz r8,4(r6) ; Get the second word
lwz r9,8(r6) ; Get the third word
; Move backend byte
-nohalf: bf 31,bcpydone ; Leave cuz we are all done...
+nohalf: bflr 31 ; Leave cuz we are all done...
lbz r7,0(r6) ; Get the byte
stb r7,0(r4) ; Save the single
-
-bcpydone: bt- killbats,bcclrbat ; Jump if we need to clear bats...
- bflr fixxlate ; Leave now if we do not need to fix translation...
- mfmsr r9 ; Get the MSR
- ori r9,r9,lo16(MASK(MSR_DR)) ; Turn data translation on
- rlwinm r9,r9,0,MSR_FP_BIT+1,MSR_FP_BIT-1 ; Force floating point off
- rlwinm r9,r9,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1 ; Force vectors off
- mtmsr r9 ; Just do it
- isync ; Hang in there
- blr ; Leave cuz we are all done...
-
-bcclrbat: li r0,0 ; Get set to invalidate upper half
- sync ; Make sure all is well
- mtdbatu 0,r0 ; Clear sink upper DBAT
- mtdbatu 1,r0 ; Clear source upper DBAT
- sync
- isync
- blr
+ blr
-;
-; 0123456789ABCDEF0123456789ABCDEF
-; 0123456789ABCDEF0123456789ABCDEF
-; F
-; DE
-; 9ABC
-; 12345678
-; 123456789ABCDEF0
-; 0
+; Reverse moves on 32-bit machines, also reverse word aligned uncached moves on 64-bit machines.
+; NOTE: we never do an unaligned access if the source and destination are "relatively"
+; word aligned. We depend on this in the uncached case on 64-bit processors.
+; These are slower because we don't bother with dcbz. Fortunately, reverse moves are uncommon.
+; r4 = destination
+; r5 = length (>0)
+; r6 = source
+; r8 = inverse of largest mask smaller than operand length
+; cr5 = noncache flag (but we don't dcbz anyway)
-;
-; Here is where we handle a forward overlapping move. These will be slow
-; because we can not kill the cache of the destination until after we have
-; loaded/saved the source area. Also, because reading memory backwards is
-; slower when the cache line needs to be loaded because the critical
-; doubleword is loaded first, i.e., the last, then it goes back to the first,
-; and on in order. That means that when we are at the second to last DW we
-; have to wait until the whole line is in cache before we can proceed.
-;
-
-fwdovrlap: add r4,r5,r4 ; Point past the last sink byte
+reverse32bit: ; here from 64-bit code with word aligned uncached operands
+ add r4,r5,r4 ; Point past the last sink byte
add r6,r5,r6 ; Point past the last source byte
- and r0,r4,r8 ; Apply movement limit
- li r12,-1 ; Make sure we touch in the actual line
- mtcrf 3,r0 ; Figure out the best way to move backwards
+ rlwinm r7,r4,0,0x1F ; Calculate the length to align dest on cache boundary
+ li r12,-1 ; Make sure we touch in the actual line
+ andc. r0,r7,r8 ; Apply movement limit
dcbt r12,r6 ; Touch in the last line of source
- rlwinm. r0,r0,0,27,31 ; Calculate the length to adjust to cache boundary
+ mtcrf 0x01,r0 ; move length to cr6 and cr7 one cr at a time...
dcbtst r12,r4 ; Touch in the last line of the sink
- beq- balline ; Aready on cache line boundary
+ mtcrf 0x02,r0 ; ...since moving more than one is slower on G4 and G5
+ beq- balline ; Already on cache line boundary (or too short to bother)
sub r5,r5,r0 ; Precalculate move length left after alignment
; Sink is line aligned here
balline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move
- mtcrf 3,r5 ; Make branch mask for backend partial moves
+ mtcrf 0x02,r5 ; move length to cr6 and cr7 one cr at a time...
+ mtcrf 0x01,r5 ; ...since moving more than one is slower on G4 and G5
beq- bbackend ; No full lines to move
-
-
-; Registers in use: R0, R1, R3, R4, R5, R6
-; Registers not in use: R2, R7, R8, R9, R10, R11, R12 - Ok, we can make another free for 8 of them
+ mtctr r0 ; set up loop count
+ b bnxtline
-bnxtline: subic. r0,r0,1 ; Account for the line now
-
+ .align 4
+bnxtline:
lwz r7,-32(r6) ; Get the first word
lwz r5,-28(r6) ; Get the second word
lwz r2,-24(r6) ; Get the third word
subi r6,r6,32 ; Point to the next
stw r7,-32(r4) ; Save the first word
- ble- bnotouch ; Last time, skip touch of source...
- dcbt br0,r6 ; Touch in next source line
-
-bnotouch: stw r5,-28(r4) ; Get the second word
+ stw r5,-28(r4) ; Save the second word
stw r2,-24(r4) ; Save the third word
stw r12,-20(r4) ; Save the fourth word
stw r11,-16(r4) ; Save the fifth word
stw r8,-4(r4) ; Save the eighth word
subi r4,r4,32 ; Bump sink
- bgt+ bnxtline ; Do the next line, if any...
+ bdnz+ bnxtline ; Do the next line, if any...
;
; Note: We touched these lines in at the beginning
; Move backend quadword
-bbackend: bf 27,bnoquad ; No quad to do...
+bbackend: ; Join here from "shortcopy" for reverse moves of <32 bytes
+ bf 27,bnoquad ; No quad to do...
lwz r7,-16(r6) ; Get the first word
lwz r8,-12(r6) ; Get the second word
lwz r9,-8(r6) ; Get the third word
; Move backend byte
-bnohalf: bflr 31 ; Leave cuz we are all done...
+bnohalf: bflr 31 ; Leave cuz we are all done...
lbz r7,-1(r6) ; Get the byte
stb r7,-1(r4) ; Save the single
-
- b bcpydone ; Go exit cuz we are all done...
+ blr
+
+
+// Here on 64-bit processors, which have a 128-byte cache line. This can be
+// called either in 32 or 64-bit mode, which makes the test for reverse moves
+// a little tricky. We've already filtered out the (sou==dest) and (len==0)
+// special cases.
+//
+// When entered:
+// r4 = destination (32 or 64-bit ptr)
+// r5 = length (always 32 bits)
+// r6 = source (32 or 64-bit ptr)
+// r12 = (dest - source), reverse move required if (dest-source)<length
+// cr5 = noncache flag
+
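+// The direction test below is done with a carry trick so that it works identically in
+// 32- and 64-bit mode: "subc r7,r12,r7" computes (dest-source) - length and sets CA iff
+// (dest-source) >= length as an unsigned compare, then "addze. r0,r0" copies CA into cr0,
+// so beq means a reverse move is required (dest starts inside source). In C terms
+// (illustrative only):
+//
+//	reverse_needed = ((unsigned long long)(dest - source) < (unsigned long long)length);
+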
+ .align 5
+copyit64:
+ rlwinm r7,r5,0,0,31 // truncate length to 32-bit, in case we're running in 64-bit mode
+ cntlzw r11,r5 // get magnitude of length
+ dcbt 0,r6 // touch in 1st block of source
+ dcbtst 0,r4 // touch in 1st destination cache block
+ subc r7,r12,r7 // set Carry if (dest-source)>=length, in mode-independent way
+ li r0,0 // get a 0
+ lis r10,hi16(0x80000000)// get 0x80000000
+ addze. r0,r0 // set cr0 on carry bit (beq if reverse move required)
+ neg r9,r4 // start to get alignment for destination
+ sraw r8,r10,r11 // get mask based on operand length, to limit alignment
+ bt-- noncache,c64uncached// skip if uncached
+ beq-- c64rdouble // handle cached reverse moves
+
+
+// Forward, cached or doubleword aligned uncached. This is the common case.
+// NOTE: we never do an unaligned access if the source and destination are "relatively"
+// doubleword aligned. We depend on this in the uncached case.
+// r4 = destination
+// r5 = length (>0)
+// r6 = source
+// r8 = inverse of largest mask smaller than operand length
+// r9 = neg(dest), used to compute alignment
+// cr5 = noncache flag
+
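+// The copy is staged as: single bytes until the destination is doubleword aligned, then
+// doublewords until it is 128-byte aligned, then full 128-byte chunks (using dcbz128 on
+// the destination when that is safe), then leftover doublewords, then leftover bytes.
+// Roughly (illustrative C, with copy_* standing in for the loops below):
+//
+//	copy_bytes(n1);		// n1 = bytes to doubleword-align dest	      (c64double1)
+//	copy_doublewords(n2);	// n2 = doublewords to 128-byte-align dest    (c64double3)
+//	copy_chunks128(n3);	// n3 = remaining length / 128		      (c64InnerLoop)
+//	copy_doublewords(n4);	// n4 = remaining doublewords, 0-15	      (c64double8)
+//	copy_bytes(n5);		// n5 = remaining bytes, 0-7		      (c64byte1)
+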
+c64double:
+ rlwinm r7,r9,0,0x7F // get #bytes to 128-byte align destination
+ andc r7,r7,r8 // limit by operand length
+ andi. r8,r7,7 // r8 <- #bytes to doubleword align
+ srwi r9,r7,3 // r9 <- #doublewords to 128-byte align
+ sub r5,r5,r7 // adjust length remaining
+ cmpwi cr1,r9,0 // any doublewords to move to cache align?
+ srwi r10,r5,7 // r10 <- 128-byte chunks to xfer after aligning dest
+ cmpwi cr7,r10,0 // set cr7 on chunk count
+ beq c64double2 // dest already doubleword aligned
+ mtctr r8
+ b c64double1
+
+ .align 5 // align inner loops
+c64double1: // copy bytes until dest is doubleword aligned
+ lbz r0,0(r6)
+ addi r6,r6,1
+ stb r0,0(r4)
+ addi r4,r4,1
+ bdnz c64double1
+
+c64double2: // r9/cr1=doublewords, r10/cr7=128-byte chunks
+ beq cr1,c64double4 // no doublewords to xfer in order to cache align
+ mtctr r9
+ b c64double3
+
+ .align 5 // align inner loops
+c64double3: // copy doublewords until dest is 128-byte aligned
+ ld r7,0(r6)
+ addi r6,r6,8
+ std r7,0(r4)
+ addi r4,r4,8
+ bdnz c64double3
+
+// Here to xfer 128-byte chunks, if any. Since we only have 8 GPRs for
+// data (64 bytes), we load/store each twice per 128-byte chunk.
+
+c64double4: // r10/cr7=128-byte chunks
+ rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords, after moving chunks
+ cmpwi cr1,r0,0 // set cr1 on leftover doublewords
+ beq cr7,c64double7 // no 128-byte chunks
+
+ ; We must check for (source-dest)<128 in a mode-independent way. If within 128 bytes,
+ ; turn on "noncache" because we cannot use dcbz128 even if operands are cacheable.
+
+ sub r8,r6,r4 // r8 <- (source - dest)
+ rldicr. r0,r8,0,63-7 // zero low 7 bits and check for 0, mode independent
+ cror noncache,cr0_eq,noncache // turn on "noncache" flag if (source-dest)<128
+ mtctr r10
+ b c64InnerLoop
+
+ .align 5 // align inner loop
+c64InnerLoop: // loop copying 128-byte cache lines to 128-aligned destination
+ ld r0,0(r6) // start pipe: load 1st half-line
+ ld r2,8(r6)
+ ld r7,16(r6)
+ ld r8,24(r6)
+ ld r9,32(r6)
+ ld r10,40(r6)
+ ld r11,48(r6)
+ ld r12,56(r6)
+ bt noncache,c64InnerLoop1 // skip if uncached or overlap
+ dcbz128 0,r4 // avoid prefetch of next cache line
+c64InnerLoop1:
+
+ std r0,0(r4)
+ std r2,8(r4)
+ std r7,16(r4)
+ std r8,24(r4)
+ std r9,32(r4)
+ std r10,40(r4)
+ std r11,48(r4)
+ std r12,56(r4)
+
+ ld r0,64(r6) // load 2nd half of chunk
+ ld r2,72(r6)
+ ld r7,80(r6)
+ ld r8,88(r6)
+ ld r9,96(r6)
+ ld r10,104(r6)
+ ld r11,112(r6)
+ ld r12,120(r6)
+ addi r6,r6,128
+
+ std r0,64(r4)
+ std r2,72(r4)
+ std r7,80(r4)
+ std r8,88(r4)
+ std r9,96(r4)
+ std r10,104(r4)
+ std r11,112(r4)
+ std r12,120(r4)
+ addi r4,r4,128 // advance to next dest chunk
+
+ bdnz c64InnerLoop // loop if more chunks
+
+
+c64double7: // r5 <- leftover bytes, cr1 set on doubleword count
+ rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords (0-15)
+ andi. r5,r5,7 // r5/cr0 <- count of leftover bytes (0-7)
+ beq cr1,c64byte // no leftover doublewords
+ mtctr r0
+ b c64double8
+
+ .align 5 // align inner loop
+c64double8: // loop copying leftover doublewords
+ ld r0,0(r6)
+ addi r6,r6,8
+ std r0,0(r4)
+ addi r4,r4,8
+ bdnz c64double8
+
+
+// Forward byte loop.
+
+c64byte: // r5/cr0 <- byte count (can be big if unaligned uncached)
+ beqlr // done if no leftover bytes
+ mtctr r5
+ b c64byte1
+
+ .align 5 // align inner loop
+c64byte1:
+ lbz r0,0(r6)
+ addi r6,r6,1
+ stb r0,0(r4)
+ addi r4,r4,1
+ bdnz c64byte1
+
+ blr
+
+
+// Uncached copies. We must avoid unaligned accesses, since they always take alignment
+// exceptions on uncached memory on 64-bit processors. This may mean we copy long operands
+// a byte at a time, but that is still much faster than alignment exceptions.
+// r4 = destination
+// r5 = length (>0)
+// r6 = source
+// r8 = inverse of largest mask smaller than operand length
+// r9 = neg(dest), used to compute alignment
+// r12 = (dest-source), used to test relative alignment
+// cr0 = beq if reverse move required
+// cr5 = noncache flag
+
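+// The dispatch below keys off the *relative* alignment of the operands, since only the low
+// bits of (dest - source) matter: if they agree modulo 8 the doubleword loops can be used,
+// if they agree modulo 4 the word-based G3/G4 code can be used, and otherwise the copy must
+// be done a byte at a time to avoid alignment exceptions on uncached memory. Roughly
+// (illustrative C, with the functions standing in for the labels named in the comments):
+//
+//	unsigned int delta = (unsigned int)(dest - source);
+//	if ((delta & 7) == 0)
+//		copy_doublewords_uncached();	// c64double / c64rdouble
+//	else if ((delta & 3) == 0)
+//		copy_words_uncached();		// forward32bit / reverse32bit
+//	else
+//		copy_bytes_uncached();		// c64byte / c64rbyte
+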
+c64uncached:
+ rlwinm r10,r12,0,29,31 // relatively doubleword aligned?
+ rlwinm r11,r12,0,30,31 // relatively word aligned?
+ cmpwi cr7,r10,0 // set cr7 beq if doubleword aligned
+ cmpwi cr1,r11,0 // set cr1 beq if word aligned
+ beq-- c64reverseUncached
+
+ beq cr7,c64double // doubleword aligned
+ beq cr1,forward32bit // word aligned, use G3/G4 code
+ cmpwi r5,0 // set cr0 on byte count
+ b c64byte // unaligned operands
+
+c64reverseUncached:
+ beq cr7,c64rdouble // doubleword aligned so can use LD/STD
+ beq cr1,reverse32bit // word aligned, use G3/G4 code
+ add r6,r6,r5 // point to (end+1) of source and dest
+ add r4,r4,r5
+ cmpwi r5,0 // set cr0 on length
+ b c64rbyte // copy a byte at a time
+
+
+
+// Reverse doubleword copies. This is used for all cached copies, and doubleword
+// aligned uncached copies.
+// r4 = destination
+// r5 = length (>0)
+// r6 = source
+// r8 = inverse of largest mask of low-order 1s smaller than operand length
+// cr5 = noncache flag
+
+c64rdouble:
+ add r6,r6,r5 // point to (end+1) of source and dest
+ add r4,r4,r5
+ rlwinm r7,r4,0,29,31 // r7 <- #bytes to doubleword align dest
+ andc. r7,r7,r8 // limit by operand length
+ sub r5,r5,r7 // adjust length
+ srwi r8,r5,6 // r8 <- 64-byte chunks to xfer
+ cmpwi cr1,r8,0 // any chunks?
+ beq c64rd2 // dest already doubleword aligned
+ mtctr r7
+
+c64rd1: // copy bytes until dest is doubleword aligned
+ lbzu r0,-1(r6)
+ stbu r0,-1(r4)
+ bdnz c64rd1
+
+c64rd2: // r8/cr1 <- count of 64-byte chunks
+ rlwinm r0,r5,29,29,31 // r0 <- count of leftover doublewords
+ andi. r5,r5,7 // r5/cr0 <- count of leftover bytes
+ cmpwi cr7,r0,0 // leftover doublewords?
+ beq cr1,c64rd4 // no chunks to xfer
+ mtctr r8
+ b c64rd3
+
+ .align 5 // align inner loop
+c64rd3: // loop copying 64-byte chunks
+ ld r7,-8(r6)
+ ld r8,-16(r6)
+ ld r9,-24(r6)
+ ld r10,-32(r6)
+ ld r11,-40(r6)
+ ld r12,-48(r6)
+ std r7,-8(r4)
+ std r8,-16(r4)
+ ld r7,-56(r6)
+ ldu r8,-64(r6)
+ std r9,-24(r4)
+ std r10,-32(r4)
+ std r11,-40(r4)
+ std r12,-48(r4)
+ std r7,-56(r4)
+ stdu r8,-64(r4)
+ bdnz c64rd3
+
+c64rd4: // r0/cr7 = leftover doublewords r5/cr0 = leftover bytes
+ beq cr7,c64rbyte // no leftover doublewords
+ mtctr r0
+
+c64rd5: // loop copying leftover doublewords
+ ldu r0,-8(r6)
+ stdu r0,-8(r4)
+ bdnz c64rd5
+
+
+// Reverse byte loop.
+
+c64rbyte: // r5/cr0 <- byte count (can be big if unaligned uncached)
+ beqlr // done if no leftover bytes
+ mtctr r5
+
+c64rbyte1:
+ lbzu r0,-1(r6)
+ stbu r0,-1(r4)
+ bdnz c64rbyte1
+
+ blr
+