/*
- * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
- * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License"). You may not use this file except in compliance with the
+ * License. Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
*
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this
- * file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
*
* @APPLE_LICENSE_HEADER_END@
*/
;
#include <ppc/asm.h>
#include <ppc/proc_reg.h>
-#include <assym.s>
; Use CR5_lt to indicate non-cached
#define noncache 20
-
; Use CR5_gt to indicate that we need to turn data translation back on
#define fixxlate 21
-
-; Use CR5_eq to indicate that we need to invalidate bats (if 32-bit) or turn off
-; 64-bit mode (if 64-bit) before returning to our caller. We overload the
-; bit to reduce the number of conditional branches at bcopy exit.
-#define restorex 22
-
-; Use CR5_so to indicate that we need to restore real-mode cachability
-; Only needed on 64-bit machines
-#define flipcache 23
+; Use CR5_eq to indicate that we need to invalidate BATs
+#define killbats 22
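+;
+; These are absolute condition-register bit numbers: CR field n holds bits
+; 4n through 4n+3 (lt, gt, eq, so), so CR5 starts at bit 20. A tiny C sketch
+; of that arithmetic (cr_bit is a hypothetical name, not part of this file):
+;
+;     /* absolute CR bit index: field crf (0-7); lt=0, gt=1, eq=2, so=3 */
+;     static int cr_bit(int crf, int bit) { return 4 * crf + bit; }
+;     /* cr_bit(5,0)==20 noncache, cr_bit(5,1)==21 fixxlate,
+;        cr_bit(5,2)==22 killbats */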
;
; bcopy_nc(from, to, nbytes)
;
; void bcopy_physvir(from, to, nbytes)
; Attempt to copy physically addressed memory with translation on if conditions are met.
-; Otherwise do a normal bcopy_phys. This routine is used because some 32-bit processors
-; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
-; for the passed phys addrs and do the copy with translation on.
+; Otherwise do a normal bcopy_phys.
;
; Rules are: neither source nor destination can cross a page.
+; No accesses above the 2GB line (I/O or ROM).
;
-; Interrupts must be disabled throughout the copy when this is called.
+; Interrupts must be disabled throughout the copy when this is called.
+;
; To do this, we build a
; 128KB DBAT for both the source and sink. If both are the same, only one is
; loaded. We do not touch the IBATs, so there is no issue if either physical page
; address is the same as the virtual address of the instructions we are executing.
;
-; At the end, we invalidate the used DBATs.
-;
-; Note that the address parameters are long longs. We will transform these to 64-bit
-; values. Note that on 32-bit architectures that this will ignore the high half of the
-; passed in value. This should be ok since we can not have any bigger than 32 bit addresses
-; there anyhow.
+; At the end, we invalidate the used DBATs; reenabling interrupts is left to the caller.
;
; Note, this one will not work in user state
;
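+;
+; Reader's note: the page-crossing test and the 128KB DBAT encoding used
+; below, sketched in C. The code identity-maps each physical page (EA == PA).
+; Field names (BRPN/BEPI, WIMG, PP, Vs/Vp) follow the classic 32-bit PowerPC
+; BAT layout; the helper names are illustrative and 4KB pages are assumed:
+;
+;     #include <stdint.h>
+;     #include <stdbool.h>
+;
+;     /* true if [addr, addr+len) stays within one 4KB page (len >= 1) */
+;     static bool within_page(uint32_t addr, uint32_t len) {
+;         return ((addr ^ (addr + len - 1)) & ~0xFFFu) == 0;
+;     }
+;
+;     /* lower DBAT: block phys number | (WIMG << 3) | PP = r/w */
+;     static uint32_t dbat_lower(uint32_t pa, uint32_t wimg) {
+;         return (pa & 0xFFFE0000u) | (wimg << 3) | 0x2u;
+;     }
+;
+;     /* upper DBAT: block eff. number | BL=0 (128KB) | Vs=1, Vp=0 */
+;     static uint32_t dbat_upper(uint32_t ea) {
+;         return (ea & 0xFFFE0000u) | 0x2u;
+;     }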
LEXT(bcopy_physvir)
- crclr flipcache ; (HACK) No cache flip needed
- mfsprg r8,2 ; get processor feature flags
- rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
- addic. r0,r7,-1 ; Get length - 1
- rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
+ addic. r0,r5,-1 ; Get length - 1
add r11,r3,r0 ; Point to last byte of sink
- rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
- mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
- rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
- mr r5,r7 ; Get the length into the right register
- cmplw cr1,r3,r4 ; Does source == sink?
- bt++ pf64Bitb,bcopy_phys1 ; if 64-bit processor, use standard routine (no BATs)
+ cmplw cr1,r3,r4 ; Does source == sink?
add r12,r4,r0 ; Point to last byte of source
bltlr- ; Bail if length is 0 or way too big
xor r7,r11,r3 ; See if we went to next page
xor r8,r12,r4 ; See if we went to next page
or r0,r7,r8 ; Combine wrap
-// li r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2) ; Set default attributes
- li r9,((2<<3)|2) ; Set default attributes
+ li r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2) ; Set default attributes
rlwinm. r0,r0,0,0,19 ; Did we overflow a page?
li r7,2 ; Set validity flags
li r8,2 ; Set validity flags
- bne- bcopy_phys1 ; Overflowed page, do normal physical copy...
+ bne- EXT(bcopy_phys) ; Overflowed page, do normal physical copy...
- crset restorex ; Remember to trash BATs on the way out
+ crset killbats ; Remember to trash BATs on the way out
rlwimi r11,r9,0,15,31 ; Set sink lower DBAT value
rlwimi r12,r9,0,15,31 ; Set source lower DBAT value
rlwimi r7,r11,0,0,14 ; Set sink upper DBAT value
bcpvsame: mr r6,r3 ; Set source
crclr noncache ; Set cached
- crclr fixxlate ; Set translation already ok
- b copyit32 ; Go copy it...
+ b copyit ; Go copy it...
+
;
; void bcopy_phys(from, to, nbytes)
; Turns off data translation before the copy. Note, this one will
-; not work in user state. This routine is used on 32 and 64-bit
-; machines.
-;
-; Note that the address parameters are long longs. We will transform these to 64-bit
-; values. Note that on 32-bit architectures that this will ignore the high half of the
-; passed in value. This should be ok since we can not have any bigger than 32 bit addresses
-; there anyhow.
-;
-; Also note that you probably will not be happy if either the sink or source spans across the
-; boundary between RAM and I/O space. Good chance of hanging the machine and this code
-; will not check, so be careful.
+; not work in user state
;
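+;
+; Reader's note: the MSR manipulation below, sketched in C. The rlwinm with
+; mask (BIT+1, BIT-1) is a wrap-around mask that clears exactly one bit.
+; Bit numbers are PowerPC big-endian (bit 0 is the MSB); the values shown
+; match the usual <ppc/proc_reg.h> definitions but are assumptions here, as
+; is the helper name:
+;
+;     #include <stdint.h>
+;
+;     #define BE_BIT(n)   (1u << (31 - (n)))  /* big-endian bit n as mask */
+;     #define MSR_VEC_BIT 6
+;     #define MSR_FP_BIT  18
+;     #define MSR_DR_BIT  27
+;
+;     /* what bcopy_phys feeds mtmsr: translation, FP, vectors all off */
+;     static uint32_t msr_for_copy(uint32_t msr) {
+;         msr &= ~BE_BIT(MSR_DR_BIT);   /* data translation off */
+;         msr &= ~BE_BIT(MSR_FP_BIT);   /* floating point off */
+;         msr &= ~BE_BIT(MSR_VEC_BIT);  /* vectors off */
+;         return msr;
+;     }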
.align 5
.globl EXT(bcopy_phys)
LEXT(bcopy_phys)
- crclr flipcache ; (HACK) No cache flip needed
- rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
- mfsprg r8,2 ; get processor feature flags
- rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
- rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
- mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
- rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
- mr r5,r7 ; Get the length into the right register
-
-bcopy_phys1: ; enter from bcopy_physvir with pf64Bit already in cr6
+
mfmsr r9 ; Get the MSR
+
crclr noncache ; Set cached
- bt++ pf64Bitb,bcopy_phys64 ; skip if 64-bit (only they take hint)
-
-; 32-bit CPUs
-
- sub. r0,r3,r4 ; to==from?
- rlwinm r8,r9,0,MSR_DR_BIT,MSR_DR_BIT ; was translation on?
- cmpwi cr1,r8,0 ; set cr1 beq if translation was off
- oris r8,r8,hi16(MASK(MSR_VEC)) ; Get vector enable
+ rlwinm. r8,r9,0,MSR_DR_BIT,MSR_DR_BIT ; Is data translation on?
+
+ cmplw cr1,r4,r3 ; Compare "to" and "from"
cmplwi cr7,r5,0 ; Check if we have a 0 length
- beqlr- ; bail if to==from
- ori r8,r8,lo16(MASK(MSR_FP)) ; Get FP
mr r6,r3 ; Set source
- andc r9,r9,r8 ; Turn off translation if it is on (should be) and FP, VEC
+ beqlr- cr1 ; Bail if "to" and "from" are the same
+ xor r9,r9,r8 ; Turn off translation if it is on (should be)
beqlr- cr7 ; Bail if length is 0
- crclr restorex ; Make sure we do not trash BATs on the way out
+ rlwinm r9,r9,0,MSR_FP_BIT+1,MSR_FP_BIT-1 ; Force floating point off
+ crclr killbats ; Make sure we do not trash BATs on the way out
+ rlwinm r9,r9,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1 ; Force vectors off
mtmsr r9 ; Set DR translation off
isync ; Wait for it
- crnot fixxlate,cr1_eq ; Remember to turn on translation if it was
- b copyit32 ; Go copy it...
-
-; 64-bit: turn DR off and SF on, remember if we need to restore on way out.
-
-bcopy_phys64: ; r9 = MSR
-
- srdi r2,r3,31 ; (HACK) Get a 1 if source is in I/O memory
- srdi. r0,r9,63-MSR_SF_BIT ; set cr0 beq on if SF was off when we were called
- rlwinm r8,r9,MSR_DR_BIT+1,31,31 ; r8 <- DR bit right justified
- cmpld cr1,r3,r4 ; to==from?
- li r0,1 ; Note - we use this in a couple places below
- lis r6,hi16(MASK(MSR_VEC)) ; Get vector enable
- cmpwi cr7,r5,0 ; length==0 ?
- ori r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR)) ; Add in FP and DR
- beqlr-- cr1 ; bail if to==from
- srdi r10,r4,31 ; (HACK) Get a 1 if sink is in I/O memory
- rldimi r9,r0,63,MSR_SF_BIT ; set SF on
- beqlr-- cr7 ; bail if length==0
- andc r9,r9,r6 ; turn DR, VEC, FP off
- cmpwi cr1,r8,0 ; was DR on?
- crmove restorex,cr0_eq ; if SF was off, remember to turn back off before we return
- mtmsrd r9 ; turn 64-bit addressing on, data translation off
- cmpldi cr0,r2,1 ; (HACK) Is source in I/O memory?
- isync ; wait for it to happen
- mr r6,r3 ; Set source
- cmpldi cr7,r10,1 ; (HACK) Is sink in I/O memory?
- crnot fixxlate,cr1_eq ; if DR was on, remember to turn back on before we return
-
- cror flipcache,cr0_eq,cr7_eq ; (HACK) See if either source or sink is in I/O area
-
- rlwinm r10,r9,MSR_EE_BIT+1,31,31 ; (HACK GLORIOUS HACK) Isolate the EE bit
- sldi r11,r0,31-MSR_EE_BIT ; (HACK GLORIOUS HACK)) Get a mask for the EE bit
- sldi r0,r0,32+8 ; (HACK) Get the right bit to turn off caching
- bf++ flipcache,copyit64 ; (HACK) No need to mess with caching...
-
-;
-; HACK GLORIOUS HACK - when we force of caching, we need to also force off
-; interruptions. We are out of CR bits, so we need to stash the entry EE
-; somewheres. It is in the XER.... We NEED to change this!!!!
-;
-
- mtxer r10 ; (HACK GLORIOUS HACK) Remember EE
- andc r9,r9,r11 ; (HACK GLORIOUS HACK) Turn off EE bit
- mfspr r2,hid4 ; (HACK) Get HID4
- crset noncache ; (HACK) Set non-cached
- mtmsrd r9 ; (HACK GLORIOUS HACK) Force off EE
- or r2,r2,r0 ; (HACK) Set bit to make real accesses cache-inhibited
- sync ; (HACK) Sync up
- li r0,1
- mtspr hid4,r2 ; (HACK) Make real accesses cache-inhibited
- isync ; (HACK) Toss prefetches
-
- lis r12,0xE000 ; (HACK) Get the unlikeliest ESID possible
- srdi r12,r12,1 ; (HACK) Make 0x7FFFFFFFF0000000
- slbie r12 ; (HACK) Make sure the ERAT is cleared
-
- sync ; (HACK)
- isync ; (HACK)
-
- b copyit64
-
+ crnot fixxlate,cr0_eq ; Remember to turn on translation if it was
+ b copyit ; Go copy it...
;
; void bcopy(from, to, nbytes)
crclr noncache ; Set cached
-bcpswap:
- crclr flipcache ; (HACK) No cache flip needed
- mfsprg r8,2 ; get processor feature flags
- sub. r0,r4,r3 ; test for to==from in mode-independent way
- mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
- cmpwi cr1,r5,0 ; Check if we have a 0 length
- crclr restorex ; Make sure we do not trash BATs on the way out
+bcpswap: cmplw cr1,r4,r3 ; Compare "to" and "from"
+ mr. r5,r5 ; Check if we have a 0 length
mr r6,r3 ; Set source
+ crclr killbats ; Make sure we do not trash BATs on the way out
+ beqlr- cr1 ; Bail if "to" and "from" are the same
+ beqlr- ; Bail if length is 0
crclr fixxlate ; Set translation already ok
- beqlr- ; Bail if "to" and "from" are the same
- beqlr- cr1 ; Bail if length is 0
- bt++ pf64Bitb,copyit64 ; handle 64-bit processor
- b copyit32 ; Go copy it...
+ b copyit ; Go copy it...
;
; When we move the memory, forward overlaps must be handled.
; We need to preserve R3 because it needs to be returned for memcpy.
; We can be interrupted and lose control here.
;
-; There is no stack, so in order to use vectors, we would
-; need to take the vector exception. Any potential gains by using vectors
+; There is no stack, so in order to use floating point, we would
+; need to take the FP exception. Any potential gains by using FP
; would be more than eaten up by this.
;
-; NOTE: this code is called in three "modes":
-; - on 32-bit processors (32-byte cache line)
-; - on 64-bit processors running in 32-bit mode (128-byte cache line)
-; - on 64-bit processors running in 64-bit mode (128-byte cache line)
-;
-; ALSO NOTE: bcopy is called from copyin and copyout etc
-; with the "thread_recover" ptr set. This means bcopy must not set up a
-; stack frame or touch non-volatile registers, and also means that it
-; cannot rely on turning off interrupts, because we expect to get DSIs
-; and have execution aborted by a "longjmp" to the thread_recover
-; routine.
+; Later, we should use Altivec for large moves.
;
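+;
+; Reader's note: the direction test used by the copy core is the classic
+; unsigned-compare trick: if (dest - source), taken as an unsigned value, is
+; smaller than the length, a forward copy would overwrite source bytes
+; before reading them, so the move must run backwards. Illustrative
+; byte-at-a-time C sketch (copy_sketch is a hypothetical name):
+;
+;     #include <stddef.h>
+;     #include <stdint.h>
+;
+;     static void *copy_sketch(void *dst, const void *src, size_t n) {
+;         unsigned char *d = dst;
+;         const unsigned char *s = src;
+;         if ((uintptr_t)d - (uintptr_t)s < n) {   /* forward overlap */
+;             while (n) { n--; d[n] = s[n]; }      /* copy backwards */
+;         } else {
+;             for (size_t i = 0; i < n; i++) d[i] = s[i];
+;         }
+;         return dst;              /* memcpy must return dst (R3) */
+;     }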
.align 5
.globl EXT(memcpy)
- ; NB: memcpy is only called in 32-bit mode, albeit on both 32- and 64-bit
- ; processors...
+
LEXT(memcpy)
- crclr flipcache ; (HACK) No cache flip needed
- mfsprg r8,2 ; get processor feature flags
+
cmplw cr1,r3,r4 ; "to" and "from" the same?
- mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
mr r6,r4 ; Set the "from"
mr. r5,r5 ; Length zero?
crclr noncache ; Set cached
crclr fixxlate ; Set translation already ok
beqlr- cr1 ; "to" and "from" are the same
beqlr- ; Length is 0
- crclr restorex ; Make sure we do not trash BATs on the way out
- bt++ pf64Bitb,copyit64 ; handle 64-bit processors
+ crclr killbats ; Make sure we do not trash BATs on the way out
-copyit32: sub r12,r4,r6 ; Get potential overlap (negative if backward move)
+copyit: sub r12,r4,r6 ; Get potential overlap (negative if backward move)
lis r8,0x7FFF ; Start up a mask
srawi r11,r12,31 ; Propagate the sign bit
dcbt br0,r6 ; Touch in the first source line
cmplwi cr7,r9,32 ; See if at least a line between source and sink
dcbtst br0,r4 ; Touch in the first sink line
cmplwi cr1,r5,32 ; Are we moving more than a line?
- cror noncache,noncache,cr7_lt ; Set to not DCBZ output line if not enough space
+ cror noncache,noncache,28 ; Set to not DCBZ output line if not enough space (bit 28 = cr7_lt)
blt- fwdovrlap ; This is a forward overlapping area, handle it...
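+;
+; The srawi above propagates the sign of (sink - source) into a 0 / -1 mask
+; so the absolute distance can be formed without a branch, as in this C
+; sketch (hypothetical name; arithmetic right shift assumed, as on PPC):
+;
+;     static uint32_t abs_distance(int32_t delta) {
+;         int32_t sign = delta >> 31;             /* 0 or -1 */
+;         return (uint32_t)((delta ^ sign) - sign);
+;     }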
;
; We can not do this if noncache is set because we will take an
; alignment exception.
-G4word: ; enter from 64-bit case with word aligned uncached operands
neg r0,r4 ; Get the number of bytes to move to align to a line boundary
rlwinm. r0,r0,0,27,31 ; Clean it up and test it
and r0,r0,r8 ; limit to the maximum front end move
lbz r7,0(r6) ; Get the byte
stb r7,0(r4) ; Save the single
-bcpydone:
+bcpydone: bt- killbats,bcclrbat ; Jump if we need to clear BATs...
+ bflr fixxlate ; Leave now if we do not need to fix translation...
mfmsr r9 ; Get the MSR
- bf++ flipcache,bcpydone0 ; (HACK) No need to mess with caching...
-
- li r0,1 ; (HACK) Get a 1
- mfxer r10 ; (HACK GLORIOUS HACK) Get the entry EE
- sldi r0,r0,32+8 ; (HACK) Get the right bit to turn off caching
- mfspr r2,hid4 ; (HACK) Get HID4
- rlwinm r10,r10,31-MSR_EE_BIT,MSR_EE_BIT,MSR_EE_BIT ; (HACK GLORIOUS HACK) Set the EE bit
- andc r2,r2,r0 ; (HACK) Clear bit to make real accesses cache-inhibited
- or r9,r9,r10 ; (HACK GLORIOUS HACK) Set the EE in MSR
- sync ; (HACK) Sync up
- mtspr hid4,r2 ; (HACK) Make real accesses not cache-inhibited
- isync ; (HACK) Toss prefetches
-
- lis r12,0xE000 ; (HACK) Get the unlikeliest ESID possible
- srdi r12,r12,1 ; (HACK) Make 0x7FFFFFFFF0000000
- slbie r12 ; (HACK) Make sure the ERAT is cleared
-
- mtmsr r9 ; (HACK GLORIOUS HACK) Set EE properly
-
-bcpydone0:
- lis r0,hi16(MASK(MSR_VEC)) ; Get the vector bit
- ori r0,r0,lo16(MASK(MSR_FP)) ; Get the float bit
- bf++ fixxlate,bcpydone1 ; skip if we do not need to fix translation...
ori r9,r9,lo16(MASK(MSR_DR)) ; Turn data translation on
- andc r9,r9,r0 ; Make sure that FP and VEC are off
+ rlwinm r9,r9,0,MSR_FP_BIT+1,MSR_FP_BIT-1 ; Force floating point off
+ rlwinm r9,r9,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1 ; Force vectors off
mtmsr r9 ; Just do it
isync ; Hang in there
-
-bcpydone1:
- bflr++ restorex ; done if we do not have to fix up addressing
- mfsprg r8,2 ; get the feature flags again
- mtcrf 0x02,r8 ; put pf64Bit where we can test it
- bt++ pf64Bitb,bcpydone2 ; skip if 64-bit processor
-
- ; 32-bit processor, so clear out the BATs we set up for bcopy_physvir
-
- li r0,0 ; Get set to invalidate upper half
+ blr ; Leave cuz we are all done...
+
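+; The BAT valid bits (Vs/Vp) live in the low-order bits of the upper BAT
+; word, so zeroing just the upper DBATs is enough to invalidate both
+; mappings set up by bcopy_physvir.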
+bcclrbat: li r0,0 ; Get set to invalidate upper half
sync ; Make sure all is well
mtdbatu 0,r0 ; Clear sink upper DBAT
mtdbatu 1,r0 ; Clear source upper DBAT
isync
blr
- ; 64-bit processor, so turn off 64-bit mode we turned on to do bcopy_phys
-
-bcpydone2:
- mfmsr r9 ; get MSR again
- andc r9,r9,r0 ; Make sure that FP and VEC are off
- rldicl r9,r9,0,MSR_SF_BIT+1 ; clear SF
- mtmsrd r9
- isync
- blr
-
;
; Here is where we handle a forward overlapping move. Reading memory
; backwards is slower when a cache line must be loaded: within a 32-byte
; line (bytes 0123456789ABCDEF0123456789ABCDEF), the critical doubleword is
; loaded first, i.e., the last one we need, then the line fills from byte 0
; and on in order. That means that when we are at the second to last DW we
; have to wait until the whole line is in cache before we can proceed.
;
-
-G4reverseWord: ; here from 64-bit code with word aligned uncached operands
+
fwdovrlap: add r4,r5,r4 ; Point past the last sink byte
add r6,r5,r6 ; Point past the last source byte
and r0,r4,r8 ; Apply movement limit
stb r7,-1(r4) ; Save the single
b bcpydone ; Go exit cuz we are all done...
-
-
-// Here on 64-bit processors, which have a 128-byte cache line. This can be
-// called either in 32 or 64-bit mode, which makes the test for reverse moves
-// a little tricky. We've already filtered out the (sou==dest) and (len==0)
-// special cases.
-//
-// When entered:
-// r4 = destination (32 or 64-bit ptr)
-// r5 = length (always 32 bits)
-// r6 = source (32 or 64-bit ptr)
-// cr5 = noncache, fixxlate, flipcache, and restorex flags set
-
- .align 5
-copyit64:
- lis r2,0x4000 // r2 = 0x00000000 40000000
- neg r12,r4 // start to compute #bytes to align dest
- bt-- noncache,noncache1 // (HACK) Do not even try anything cached...
- dcbt 0,r6 // touch in 1st block of source
-noncache1:
- add. r2,r2,r2 // if 0x00000000 80000000 < 0, we are in 32-bit mode
- cntlzw r9,r5 // get highest power-of-2 in length
- rlwinm r7,r12,0,25,31 // r7 <- bytes to 128-byte align dest
- bt-- noncache,noncache2 // (HACK) Do not even try anything cached...
- dcbtst 0,r4 // touch in 1st destination cache block
-noncache2:
- sraw r2,r2,r9 // get mask with 1s for leading 0s in length, plus 1 more 1-bit
- bge copyit64a // skip if we are running in 64-bit mode
- rlwinm r4,r4,0,0,31 // running in 32-bit mode, so truncate ptrs and lengths to 32 bits
- rlwinm r5,r5,0,0,31
- rlwinm r6,r6,0,0,31
-copyit64a: // now we can use 64-bit compares even if running in 32-bit mode
- sub r8,r4,r6 // get (dest-source)
- andc r7,r7,r2 // limit bytes to align by operand length
- cmpld cr1,r8,r5 // if (dest-source)<length, must move reverse
- bt-- noncache,c64uncached // skip if uncached
- blt-- cr1,c64rdouble // handle cached reverse moves
-
-
-// Forward, cached or doubleword aligned uncached. This is the common case.
-// r4-r6 = dest, length, source (as above)
-// r7 = #bytes 128-byte align dest (limited by copy length)
-// cr5 = flags, as above
-
-c64double:
- andi. r8,r7,7 // r8 <- #bytes to doubleword align
- srwi r9,r7,3 // r9 <- #doublewords to 128-byte align
- sub r5,r5,r7 // adjust length remaining
- cmpwi cr1,r9,0 // any doublewords to move to cache align?
- srwi r10,r5,7 // r10 <- 128-byte chunks to xfer after aligning dest
- cmpwi cr7,r10,0 // set cr7 on chunk count
- beq c64double2 // dest already doubleword aligned
- mtctr r8
- b c64double1
-
- .align 5 // align inner loops
-c64double1: // copy bytes until dest is doubleword aligned
- lbz r0,0(r6)
- addi r6,r6,1
- stb r0,0(r4)
- addi r4,r4,1
- bdnz c64double1
-
-c64double2: // r9/cr1=doublewords, r10=128-byte chunks, cr7=blt if r5==0
- beq cr1,c64double4 // no doublewords to xfer in order to cache align
- mtctr r9
- b c64double3
-
- .align 5 // align inner loops
-c64double3: // copy doublewords until dest is 128-byte aligned
- ld r7,0(r6)
- addi r6,r6,8
- std r7,0(r4)
- addi r4,r4,8
- bdnz c64double3
-
-// Here to xfer 128-byte chunks, if any. Because the IBM 970 cannot issue two stores/cycle,
-// we pipeline the inner loop so we can pair loads and stores. Since we only have 8 GPRs for
-// data (64 bytes), we load/store each twice per 128-byte chunk.
-
-c64double4: // r10/cr7=128-byte chunks
- rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords, after moving chunks
- cmpwi cr1,r0,0 // set cr1 on leftover doublewords
- beq cr7,c64double7 // no 128-byte chunks
- sub r8,r6,r4 // r8 <- (source - dest)
- li r9,128 // start at next cache line (we've already touched in 1st line)
- cmpldi cr7,r8,128 // if (source-dest)<128, cannot use dcbz128 beacause of overlap
- cror noncache,cr7_lt,noncache // turn on "noncache" flag if (source-dest)<128
- bt-- noncache,noncache3 // (HACK) Skip cache touch if noncachable
- dcbt128 r9,r6,1 // start forward stream
-noncache3:
- mtctr r10
-
- ld r0,0(r6) // start pipe: load 1st half-line
- ld r2,8(r6)
- ld r7,16(r6)
- ld r8,24(r6)
- ld r9,32(r6)
- ld r10,40(r6)
- ld r11,48(r6)
- ld r12,56(r6)
- b c64InnerLoopEntryPt
-
- .align 5 // align inner loop
-c64InnerLoop: // loop copying 128-byte cache lines to 128-aligned destination
- std r0,64(r4) // store 2nd half of chunk n
- ld r0,0(r6) // load 1st half of chunk n+1
- std r2,72(r4)
- ld r2,8(r6)
- std r7,80(r4)
- ld r7,16(r6)
- std r8,88(r4)
- ld r8,24(r6)
- std r9,96(r4)
- ld r9,32(r6)
- std r10,104(r4)
- ld r10,40(r6)
- std r11,112(r4)
- ld r11,48(r6)
- std r12,120(r4)
- ld r12,56(r6)
- addi r4,r4,128 // advance to next dest chunk
-c64InnerLoopEntryPt: // initial entry into loop, with 1st halfline loaded
- bt noncache,c64InnerLoop1 // skip if uncached or overlap
- dcbz128 0,r4 // avoid prefetch of next cache line
-c64InnerLoop1:
- std r0,0(r4) // store 1st half of chunk n
- ld r0,64(r6) // load 2nd half of chunk n
- std r2,8(r4)
- ld r2,72(r6)
- std r7,16(r4)
- ld r7,80(r6)
- std r8,24(r4)
- ld r8,88(r6)
- std r9,32(r4)
- ld r9,96(r6)
- std r10,40(r4)
- ld r10,104(r6)
- std r11,48(r4)
- ld r11,112(r6)
- std r12,56(r4)
- ld r12,120(r6)
- addi r6,r6,128 // advance to next source chunk if any
- bdnz c64InnerLoop // loop if more chunks
-
- std r0,64(r4) // store 2nd half of last chunk
- std r2,72(r4)
- std r7,80(r4)
- std r8,88(r4)
- std r9,96(r4)
- std r10,104(r4)
- std r11,112(r4)
- std r12,120(r4)
- addi r4,r4,128 // advance to next dest chunk
-
-c64double7: // r5 <- leftover bytes, cr1 set on doubleword count
- rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords (0-15)
- andi. r5,r5,7 // r5/cr0 <- count of leftover bytes (0-7)
- beq cr1,c64byte // no leftover doublewords
- mtctr r0
- b c64double8
-
- .align 5 // align inner loop
-c64double8: // loop copying leftover doublewords
- ld r0,0(r6)
- addi r6,r6,8
- std r0,0(r4)
- addi r4,r4,8
- bdnz c64double8
-
-
-// Forward byte loop.
-
-c64byte: // r5/cr0 <- byte count (can be big if unaligned uncached)
- beq bcpydone // done if no leftover bytes
- mtctr r5
- b c64byte1
-
- .align 5 // align inner loop
-c64byte1:
- lbz r0,0(r6)
- addi r6,r6,1
- stb r0,0(r4)
- addi r4,r4,1
- bdnz c64byte1
-
- b bcpydone
-
-
-// Uncached copies. We must avoid unaligned accesses, since they always take alignment
-// exceptions on uncached memory on 64-bit processors. This may mean we copy long operands
-// a byte at a time, but that is still much faster than alignment exceptions.
-// r4-r6 = dest, length, source (as above)
-// r2 = mask of 1s for leading 0s in length, plus 1 extra 1
-// r7 = #bytes to copy to 128-byte align dest (limited by operand length)
-// cr1 = blt if reverse move required
-
-c64uncached:
- xor r0,r6,r4 // get relative alignment
- rlwinm r10,r0,0,29,31 // relatively doubleword aligned?
- rlwinm r11,r0,0,30,31 // relatively word aligned?
- not r8,r2 // get mask to limit initial length of copy for G4word
- blt cr1,c64reverseUncached
-
- cmpwi cr0,r10,0 // set cr0 beq if doubleword aligned
- cmpwi cr1,r11,0 // set cr1 beq if word aligned
- beq cr0,c64double // doubleword aligned
- beq cr1,G4word // word aligned, use G3/G4 code
- cmpwi r5,0 // set cr0 on byte count
- b c64byte // unaligned operands
-
-c64reverseUncached:
- cmpwi cr0,r10,0 // set cr0 beq if doubleword aligned
- cmpwi cr1,r11,0 // set cr1 beq if word aligned
- beq cr0,c64rdouble // doubleword aligned so can use LD/STD
- beq cr1,G4reverseWord // word aligned, use G3/G4 code
- add r6,r6,r5 // point to (end+1) of source and dest
- add r4,r4,r5
- cmpwi r5,0 // set cr0 on length
- b c64rbyte // copy a byte at a time
-
-
-
-// Reverse doubleword copies. This is used for all cached copies, and doubleword
-// aligned uncached copies.
-// r4 = destination (32 or 64-bit ptr)
-// r5 = length (always 32 bits)
-// r6 = source (32 or 64-bit ptr)
-// cr5 = noncache, fixxlate, and restorex flags set
-
-c64rdouble:
- add r6,r6,r5 // point to (end+1) of source and dest
- add r4,r4,r5
- rlwinm. r7,r4,0,29,31 // r7 <- #bytes to doubleword align dest
- cmplw cr1,r7,r5 // operand long enough to doubleword align?
- blt cr1,c64rd0 // yes
- mr r7,r5 // no
-c64rd0:
- sub r5,r5,r7 // adjust length
- srwi r8,r5,6 // r8 <- 64-byte chunks to xfer
- cmpwi cr1,r8,0 // any chunks?
- beq c64rd2 // source already doubleword aligned
- mtctr r7
-
-c64rd1: // copy bytes until source doublword aligned
- lbzu r0,-1(r6)
- stbu r0,-1(r4)
- bdnz c64rd1
-
-c64rd2: // r8/cr1 <- count of 64-byte chunks
- rlwinm r0,r5,29,29,31 // r0 <- count of leftover doublewords
- andi. r5,r5,7 // r5/cr0 <- count of leftover bytes
- cmpwi cr7,r0,0 // leftover doublewords?
- beq cr1,c64rd4 // no chunks to xfer
- li r9,-128 // start at next cache line
- mtctr r8
- bt noncache,c64rd3 // (HACK) Do not start a stream if noncachable...
- dcbt128 r9,r6,3 // start reverse stream
- b c64rd3
-
- .align 5 // align inner loop
-c64rd3: // loop copying 64-byte chunks
- ld r7,-8(r6)
- ld r8,-16(r6)
- ld r9,-24(r6)
- ld r10,-32(r6)
- ld r11,-40(r6)
- ld r12,-48(r6)
- std r7,-8(r4)
- std r8,-16(r4)
- ld r7,-56(r6)
- ldu r8,-64(r6)
- std r9,-24(r4)
- std r10,-32(r4)
- std r11,-40(r4)
- std r12,-48(r4)
- std r7,-56(r4)
- stdu r8,-64(r4)
- bdnz c64rd3
-
-c64rd4: // r0/cr7 = leftover doublewords r5/cr0 = leftover bytes
- beq cr7,c64rbyte // no leftover doublewords
- mtctr r0
-
-c64rd5: // loop copying leftover doublewords
- ldu r0,-8(r6)
- stdu r0,-8(r4)
- bdnz c64rd5
-
-
-// Reverse byte loop.
-
-c64rbyte: // r5/cr0 <- byte count (can be big if unaligned uncached)
- beq bcpydone // done if no leftover bytes
- mtctr r5
-
-c64rbyte1:
- lbzu r0,-1(r6)
- stbu r0,-1(r4)
- bdnz c64rbyte1
-
- b bcpydone
-