/*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
;
#include <ppc/asm.h>
#include <ppc/proc_reg.h>
+#include <assym.s>
; Use CR5_lt to indicate non-cached
#define noncache 20
+
; Use CR5_gt to indicate that we need to turn data translation back on
#define fixxlate 21
-; Use CR5_eq to indicate that we need to invalidate bats
-#define killbats 22
+
+; Use CR5_eq to indicate that we need to invalidate bats (if 32-bit) or turn off
+; 64-bit mode (if 64-bit) before returning to our caller. We overload the
+; bit to reduce the number of conditional branches at bcopy exit.
+#define restorex 22
+
+; Use CR5_so to indicate that we need to restore real-mode cachability
+; Only needed on 64-bit machines
+#define flipcache 23
;
; bcopy_nc(from, to, nbytes)
;
; void bcopy_physvir(from, to, nbytes)
; Attempt to copy physically addressed memory with translation on if conditions are met.
-; Otherwise do a normal bcopy_phys.
+; Otherwise do a normal bcopy_phys. This routine is used because some 32-bit processors
+; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
+; for the passed phys addrs and do the copy with translation on.
;
; Rules are: neither source nor destination can cross a page.
-; No accesses above the 2GB line (I/O or ROM).
;
-; Interrupts must be disabled throughout the copy when this is called
-
+; Interrupts must be disabled throughout the copy when this is called.
; To do this, we build a
; 128KB DBAT for both the source and sink. If both are the same, only one is
; loaded. We do not touch the IBATs, so there is no issue if either physical page
; address is the same as the virtual address of the instructions we are executing.
;
-; At the end, we invalidate the used DBATs and reenable interrupts.
+; At the end, we invalidate the used DBATs.
+;
+; Note that the address parameters are long longs. We will transform these to 64-bit
+; values. Note that on 32-bit architectures the high half of the passed-in value is
+; ignored. This should be OK, since addresses there cannot be bigger than 32 bits
+; anyhow.
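+;
+; Illustration only (not part of the routine): the rotate-and-insert pairs below
+; rebuild each 64-bit physical address from its two 32-bit halves, roughly
+;     paddr = ((uint64_t)hi << 32) | lo;
+; with the high half arriving in the lower-numbered register of each pair.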
;
; Note, this one will not work in user state
;
LEXT(bcopy_physvir)
- addic. r0,r5,-1 ; Get length - 1
+ crclr flipcache ; (HACK) No cache flip needed
+ mfsprg r8,2 ; get processor feature flags
+ rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
+ addic. r0,r7,-1 ; Get length - 1
+ rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
add r11,r3,r0 ; Point to last byte of sink
- cmplw cr1,r3,r4 ; Does source == sink?
+ rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
+ mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
+ rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
+ mr r5,r7 ; Get the length into the right register
+ cmplw cr1,r3,r4 ; Does source == sink?
+ bt++ pf64Bitb,bcopy_phys1 ; if 64-bit processor, use standard routine (no BATs)
add r12,r4,r0 ; Point to last byte of source
bltlr- ; Bail if length is 0 or way too big
xor r7,r11,r3 ; See if we went to next page
xor r8,r12,r4 ; See if we went to next page
or r0,r7,r8 ; Combine wrap
- li r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2) ; Set default attributes
+// li r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2) ; Set default attributes
+ li r9,((2<<3)|2) ; Set default attributes
rlwinm. r0,r0,0,0,19 ; Did we overflow a page?
li r7,2 ; Set validity flags
li r8,2 ; Set validity flags
- bne- EXT(bcopy_phys) ; Overflowed page, do normal physical copy...
+ bne- bcopy_phys1 ; Overflowed page, do normal physical copy...
- crset killbats ; Remember to trash BATs on the way out
+ crset restorex ; Remember to trash BATs on the way out
rlwimi r11,r9,0,15,31 ; Set sink lower DBAT value
rlwimi r12,r9,0,15,31 ; Set source lower DBAT value
rlwimi r7,r11,0,0,14 ; Set sink upper DBAT value
bcpvsame: mr r6,r3 ; Set source
crclr noncache ; Set cached
+ crclr fixxlate ; Set translation already ok
- b copyit ; Go copy it...
-
+ b copyit32 ; Go copy it...
;
; void bcopy_phys(from, to, nbytes)
; Turns off data translation before the copy. Note, this one will
-; not work in user state
+; not work in user state. This routine is used on 32 and 64-bit
+; machines.
+;
+; Note that the address parameters are long longs. We will transform these to 64-bit
+; values. Note that on 32-bit architectures the high half of the passed-in value is
+; ignored. This should be OK, since addresses there cannot be bigger than 32 bits
+; anyhow.
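+;
+; For reference, the parameter registers on entry (per the 32-bit calling convention
+; this code assumes) are: r3/r4 = high/low halves of "from", r5/r6 = high/low halves
+; of "to", and r7 = nbytes. They are folded into r3 (from), r4 (to), and r5 (length)
+; below before the copy starts.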
+;
+; Also note that you probably will not be happy if either the sink or the source spans
+; the boundary between RAM and I/O space. There is a good chance of hanging the machine,
+; and this code does not check, so be careful.
;
.align 5
.globl EXT(bcopy_phys)
LEXT(bcopy_phys)
-
+ crclr flipcache ; (HACK) No cache flip needed
+ rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
+ mfsprg r8,2 ; get processor feature flags
+ rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
+ rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
+ mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
+ rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
+ mr r5,r7 ; Get the length into the right register
+
+bcopy_phys1: ; enter from bcopy_physvir with pf64Bit already in cr6
mfmsr r9 ; Get the MSR
-
crclr noncache ; Set cached
- rlwinm. r8,r9,0,MSR_DR_BIT,MSR_DR_BIT ; Is data translation on?
-
- cmplw cr1,r4,r3 ; Compare "to" and "from"
+ bt++ pf64Bitb,bcopy_phys64 ; skip if 64-bit (only they take hint)
+
+; 32-bit CPUs
+
+ sub. r0,r3,r4 ; to==from?
+ rlwinm r8,r9,0,MSR_DR_BIT,MSR_DR_BIT ; was translation on?
+ cmpwi cr1,r8,0 ; set cr1 beq if translation was off
+ oris r8,r8,hi16(MASK(MSR_VEC)) ; Get vector enable
cmplwi cr7,r5,0 ; Check if we have a 0 length
+ beqlr- ; bail if to==from
+ ori r8,r8,lo16(MASK(MSR_FP)) ; Get FP
mr r6,r3 ; Set source
- beqlr- cr1 ; Bail if "to" and "from" are the same
- xor r9,r9,r8 ; Turn off translation if it is on (should be)
+ andc r9,r9,r8 ; Turn off translation if it is on (should be) and FP, VEC
beqlr- cr7 ; Bail if length is 0
- rlwinm r9,r9,0,MSR_FP_BIT+1,MSR_FP_BIT-1 ; Force floating point off
- crclr killbats ; Make sure we do not trash BATs on the way out
- rlwinm r9,r9,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1 ; Force vectors off
+ crclr restorex ; Make sure we do not trash BATs on the way out
mtmsr r9 ; Set DR translation off
isync ; Wait for it
- crnot fixxlate,cr0_eq ; Remember to turn on translation if it was
- b copyit ; Go copy it...
+ crnot fixxlate,cr1_eq ; Remember to turn on translation if it was
+ b copyit32 ; Go copy it...
+
+; 64-bit: turn DR off and SF on, remember if we need to restore on way out.
+
+bcopy_phys64: ; r9 = MSR
+
+ srdi r2,r3,31 ; (HACK) Get a 1 if source is in I/O memory
+ srdi. r0,r9,63-MSR_SF_BIT ; set cr0 beq on if SF was off when we were called
+ rlwinm r8,r9,MSR_DR_BIT+1,31,31 ; r8 <- DR bit right justified
+ cmpld cr1,r3,r4 ; to==from?
+ li r0,1 ; Note - we use this in a couple places below
+ lis r6,hi16(MASK(MSR_VEC)) ; Get vector enable
+ cmpwi cr7,r5,0 ; length==0 ?
+ ori r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR)) ; Add in FP and DR
+ beqlr-- cr1 ; bail if to==from
+ srdi r10,r4,31 ; (HACK) Get a 1 if sink is in I/O memory
+ rldimi r9,r0,63,MSR_SF_BIT ; set SF on
+ beqlr-- cr7 ; bail if length==0
+ andc r9,r9,r6 ; turn DR, VEC, FP off
+ cmpwi cr1,r8,0 ; was DR on?
+ crmove restorex,cr0_eq ; if SF was off, remember to turn back off before we return
+ mtmsrd r9 ; turn 64-bit addressing on, data translation off
+ cmpldi cr0,r2,1 ; (HACK) Is source in I/O memory?
+ isync ; wait for it to happen
+ mr r6,r3 ; Set source
+ cmpldi cr7,r10,1 ; (HACK) Is sink in I/O memory?
+ crnot fixxlate,cr1_eq ; if DR was on, remember to turn back on before we return
+
+ cror flipcache,cr0_eq,cr7_eq ; (HACK) See if either source or sink is in I/O area
+
+ rlwinm r10,r9,MSR_EE_BIT+1,31,31 ; (HACK GLORIOUS HACK) Isolate the EE bit
+ sldi r11,r0,31-MSR_EE_BIT ; (HACK GLORIOUS HACK) Get a mask for the EE bit
+ sldi r0,r0,32+8 ; (HACK) Get the right bit to turn off caching
+ bf++ flipcache,copyit64 ; (HACK) No need to mess with caching...
+
+;
+; HACK GLORIOUS HACK - when we force off caching, we also need to force off
+; interruptions. We are out of CR bits, so we need to stash the entry EE
+; somewhere. It is in the XER. We NEED to change this!
+;
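+; (The entry EE value is recovered from the XER at bcpydone, below, when the HID4
+; cache-inhibit change is undone.)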
+
+ mtxer r10 ; (HACK GLORIOUS HACK) Remember EE
+ andc r9,r9,r11 ; (HACK GLORIOUS HACK) Turn off EE bit
+ mfspr r2,hid4 ; (HACK) Get HID4
+ crset noncache ; (HACK) Set non-cached
+ mtmsrd r9 ; (HACK GLORIOUS HACK) Force off EE
+ or r2,r2,r0 ; (HACK) Set bit to make real accesses cache-inhibited
+ sync ; (HACK) Sync up
+ li r0,1
+ mtspr hid4,r2 ; (HACK) Make real accesses cache-inhibited
+ isync ; (HACK) Toss prefetches
+
+ lis r12,0xE000 ; (HACK) Get the unlikeliest ESID possible
+ srdi r12,r12,1 ; (HACK) Make 0x7FFFFFFFF0000000
+ slbie r12 ; (HACK) Make sure the ERAT is cleared
+
+ sync ; (HACK)
+ isync ; (HACK)
+
+ b copyit64
+
;
; void bcopy(from, to, nbytes)
crclr noncache ; Set cached
-bcpswap: cmplw cr1,r4,r3 ; Compare "to" and "from"
- mr. r5,r5 ; Check if we have a 0 length
+bcpswap:
+ crclr flipcache ; (HACK) No cache flip needed
+ mfsprg r8,2 ; get processor feature flags
+ sub. r0,r4,r3 ; test for to==from in mode-independent way
+ mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
+ cmpwi cr1,r5,0 ; Check if we have a 0 length
+ crclr restorex ; Make sure we do not trash BATs on the way out
mr r6,r3 ; Set source
- crclr killbats ; Make sure we do not trash BATs on the way out
- beqlr- cr1 ; Bail if "to" and "from" are the same
- beqlr- ; Bail if length is 0
crclr fixxlate ; Set translation already ok
- b copyit ; Go copy it...
+ beqlr- ; Bail if "to" and "from" are the same
+ beqlr- cr1 ; Bail if length is 0
+ bt++ pf64Bitb,copyit64 ; handle 64-bit processor
+ b copyit32 ; Go copy it...
;
; When we move the memory, forward overlays must be handled. We
; need to preserve R3 because it needs to be returned for memcpy.
; We can be interrupted and lose control here.
;
-; There is no stack, so in order to used floating point, we would
-; need to take the FP exception. Any potential gains by using FP
+; There is no stack, so in order to use vectors, we would
+; need to take the vector exception. Any potential gains by using vectors
; would be more than eaten up by this.
;
-; Later, we should used Altivec for large moves.
+; NOTE: this code is called in three "modes":
+; - on 32-bit processors (32-byte cache line)
+; - on 64-bit processors running in 32-bit mode (128-byte cache line)
+; - on 64-bit processors running in 64-bit mode (128-byte cache line)
+;
+; ALSO NOTE: bcopy is called from copyin and copyout etc
+; with the "thread_recover" ptr set. This means bcopy must not set up a
+; stack frame or touch non-volatile registers, and also means that it
+; cannot rely on turning off interrupts, because we expect to get DSIs
+; and have execution aborted by a "longjmp" to the thread_recover
+; routine.
;
.align 5
.globl EXT(memcpy)
-
+ ; NB: memcpy is only called in 32-bit mode, albeit on both 32- and 64-bit
+ ; processors...
LEXT(memcpy)
-
+ crclr flipcache ; (HACK) No cache flip needed
+ mfsprg r8,2 ; get processor feature flags
cmplw cr1,r3,r4 ; "to" and "from" the same?
+ mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
mr r6,r4 ; Set the "from"
mr. r5,r5 ; Length zero?
crclr noncache ; Set cached
crclr fixxlate ; Set translation already ok
beqlr- cr1 ; "to" and "from" are the same
beqlr- ; Length is 0
- crclr killbats ; Make sure we do not trash BATs on the way out
+ crclr restorex ; Make sure we do not trash BATs on the way out
+ bt++ pf64Bitb,copyit64 ; handle 64-bit processors
-copyit: sub r12,r4,r6 ; Get potential overlap (negative if backward move)
+copyit32: sub r12,r4,r6 ; Get potential overlap (negative if backward move)
lis r8,0x7FFF ; Start up a mask
srawi r11,r12,31 ; Propagate the sign bit
dcbt br0,r6 ; Touch in the first source line
cmplwi cr7,r9,32 ; See if at least a line between source and sink
dcbtst br0,r4 ; Touch in the first sink line
cmplwi cr1,r5,32 ; Are we moving more than a line?
- cror noncache,noncache,28 ; Set to not DCBZ output line if not enough space
+ cror noncache,noncache,cr7_lt ; Set to not DCBZ output line if not enough space
blt- fwdovrlap ; This is a forward overlapping area, handle it...
;
; We can not do this if noncache is set because we will take an
; alignment exception.
+G4word: ; enter from 64-bit case with word aligned uncached operands
neg r0,r4 ; Get the number of bytes to move to align to a line boundary
rlwinm. r0,r0,0,27,31 ; Clean it up and test it
and r0,r0,r8 ; limit to the maximum front end move
lbz r7,0(r6) ; Get the byte
stb r7,0(r4) ; Save the single
-bcpydone: bt- killbats,bcclrbat ; Jump if we need to clear bats...
- bflr fixxlate ; Leave now if we do not need to fix translation...
+bcpydone:
mfmsr r9 ; Get the MSR
+ bf++ flipcache,bcpydone0 ; (HACK) No need to mess with caching...
+
+ li r0,1 ; (HACK) Get a 1
+ mfxer r10 ; (HACK GLORIOUS HACK) Get the entry EE
+ sldi r0,r0,32+8 ; (HACK) Get the right bit to turn off caching
+ mfspr r2,hid4 ; (HACK) Get HID4
+ rlwinm r10,r10,31-MSR_EE_BIT,MSR_EE_BIT,MSR_EE_BIT ; (HACK GLORIOUS HACK) Set the EE bit
+ andc r2,r2,r0 ; (HACK) Clear bit to make real accesses cache-inhibited
+ or r9,r9,r10 ; (HACK GLORIOUS HACK) Set the EE in MSR
+ sync ; (HACK) Sync up
+ mtspr hid4,r2 ; (HACK) Make real accesses not cache-inhibited
+ isync ; (HACK) Toss prefetches
+
+ lis r12,0xE000 ; (HACK) Get the unlikeliest ESID possible
+ srdi r12,r12,1 ; (HACK) Make 0x7FFFFFFFF0000000
+ slbie r12 ; (HACK) Make sure the ERAT is cleared
+
+ mtmsr r9 ; (HACK GLORIOUS HACK) Set EE properly
+
+bcpydone0:
+ lis r0,hi16(MASK(MSR_VEC)) ; Get the vector bit
+ ori r0,r0,lo16(MASK(MSR_FP)) ; Get the float bit
+ bf++ fixxlate,bcpydone1 ; skip if we do not need to fix translation...
ori r9,r9,lo16(MASK(MSR_DR)) ; Turn data translation on
- rlwinm r9,r9,0,MSR_FP_BIT+1,MSR_FP_BIT-1 ; Force floating point off
- rlwinm r9,r9,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1 ; Force vectors off
+ andc r9,r9,r0 ; Make sure that FP and VEC are off
mtmsr r9 ; Just do it
isync ; Hang in there
- blr ; Leave cuz we are all done...
-
-bcclrbat: li r0,0 ; Get set to invalidate upper half
+
+bcpydone1:
+ bflr++ restorex ; done if we do not have to fix up addressing
+ mfsprg r8,2 ; get the feature flags again
+ mtcrf 0x02,r8 ; put pf64Bit where we can test it
+ bt++ pf64Bitb,bcpydone2 ; skip if 64-bit processor
+
+ ; 32-bit processor, so clear out the BATs we set up for bcopy_physvir
+
+ li r0,0 ; Get set to invalidate upper half
sync ; Make sure all is well
mtdbatu 0,r0 ; Clear sink upper DBAT
mtdbatu 1,r0 ; Clear source upper DBAT
isync
blr
+ ; 64-bit processor, so turn off the 64-bit mode we turned on to do bcopy_phys
+
+bcpydone2:
+ mfmsr r9 ; get MSR again
+ andc r9,r9,r0 ; Make sure that FP and VEC are off
+ rldicl r9,r9,0,MSR_SF_BIT+1 ; clear SF
+ mtmsrd r9
+ isync
+ blr
+
;
; 0123456789ABCDEF0123456789ABCDEF
; and on in order. That means that when we are at the second to last DW we
; have to wait until the whole line is in cache before we can proceed.
;
-
+
+G4reverseWord: ; here from 64-bit code with word aligned uncached operands
fwdovrlap: add r4,r5,r4 ; Point past the last sink byte
add r6,r5,r6 ; Point past the last source byte
and r0,r4,r8 ; Apply movement limit
; Move backend byte
-bnohalf: bflr 31 ; Leave cuz we are all done...
+bnohalf: bf 31,bcpydone ; Leave cuz we are all done...
lbz r7,-1(r6) ; Get the byte
stb r7,-1(r4) ; Save the single
b bcpydone ; Go exit cuz we are all done...
+
+
+// Here on 64-bit processors, which have a 128-byte cache line. This can be
+// called either in 32 or 64-bit mode, which makes the test for reverse moves
+// a little tricky. We've already filtered out the (sou==dest) and (len==0)
+// special cases.
+//
+// When entered:
+// r4 = destination (32 or 64-bit ptr)
+// r5 = length (always 32 bits)
+// r6 = source (32 or 64-bit ptr)
+// cr5 = noncache, fixxlate, flipcache, and restorex flags set
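+//
+// (Sketch of the mode-independent reverse test used below, for reference: after the
+// pointers are truncated to 32 bits when running in 32-bit mode, a forward copy would
+// overwrite unread source bytes exactly when (dest - source) < length as an unsigned
+// compare, so that single cmpld selects the reverse-move path.)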
+
+ .align 5
+copyit64:
+ lis r2,0x4000 // r2 = 0x00000000 40000000
+ neg r12,r4 // start to compute #bytes to align dest
+ bt-- noncache,noncache1 // (HACK) Do not even try anything cached...
+ dcbt 0,r6 // touch in 1st block of source
+noncache1:
+ add. r2,r2,r2 // if 0x00000000 80000000 < 0, we are in 32-bit mode
+ cntlzw r9,r5 // get highest power-of-2 in length
+ rlwinm r7,r12,0,25,31 // r7 <- bytes to 128-byte align dest
+ bt-- noncache,noncache2 // (HACK) Do not even try anything cached...
+ dcbtst 0,r4 // touch in 1st destination cache block
+noncache2:
+ sraw r2,r2,r9 // get mask with 1s for leading 0s in length, plus 1 more 1-bit
+ bge copyit64a // skip if we are running in 64-bit mode
+ rlwinm r4,r4,0,0,31 // running in 32-bit mode, so truncate ptrs and lengths to 32 bits
+ rlwinm r5,r5,0,0,31
+ rlwinm r6,r6,0,0,31
+copyit64a: // now we can use 64-bit compares even if running in 32-bit mode
+ sub r8,r4,r6 // get (dest-source)
+ andc r7,r7,r2 // limit bytes to align by operand length
+ cmpld cr1,r8,r5 // if (dest-source)<length, must move reverse
+ bt-- noncache,c64uncached // skip if uncached
+ blt-- cr1,c64rdouble // handle cached reverse moves
+
+
+// Forward, cached or doubleword aligned uncached. This is the common case.
+// r4-r6 = dest, length, source (as above)
+// r7 = #bytes 128-byte align dest (limited by copy length)
+// cr5 = flags, as above
+
+c64double:
+ andi. r8,r7,7 // r8 <- #bytes to doubleword align
+ srwi r9,r7,3 // r9 <- #doublewords to 128-byte align
+ sub r5,r5,r7 // adjust length remaining
+ cmpwi cr1,r9,0 // any doublewords to move to cache align?
+ srwi r10,r5,7 // r10 <- 128-byte chunks to xfer after aligning dest
+ cmpwi cr7,r10,0 // set cr7 on chunk count
+ beq c64double2 // dest already doubleword aligned
+ mtctr r8
+ b c64double1
+
+ .align 5 // align inner loops
+c64double1: // copy bytes until dest is doubleword aligned
+ lbz r0,0(r6)
+ addi r6,r6,1
+ stb r0,0(r4)
+ addi r4,r4,1
+ bdnz c64double1
+
+c64double2: // r9/cr1=doublewords to cache align, r10/cr7=128-byte chunk count
+ beq cr1,c64double4 // no doublewords to xfer in order to cache align
+ mtctr r9
+ b c64double3
+
+ .align 5 // align inner loops
+c64double3: // copy doublewords until dest is 128-byte aligned
+ ld r7,0(r6)
+ addi r6,r6,8
+ std r7,0(r4)
+ addi r4,r4,8
+ bdnz c64double3
+
+// Here to xfer 128-byte chunks, if any. Because the IBM 970 cannot issue two stores/cycle,
+// we pipeline the inner loop so we can pair loads and stores. Since we only have 8 GPRs for
+// data (64 bytes), we load/store each twice per 128-byte chunk.
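+// The resulting schedule, roughly: the loop body that stores the 1st half-line of
+// chunk n also loads its 2nd half-line, and the body that stores the 2nd half-line
+// loads the 1st half-line of chunk n+1, so loads and stores can issue in pairs.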
+
+c64double4: // r10/cr7=128-byte chunks
+ rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords, after moving chunks
+ cmpwi cr1,r0,0 // set cr1 on leftover doublewords
+ beq cr7,c64double7 // no 128-byte chunks
+ sub r8,r6,r4 // r8 <- (source - dest)
+ li r9,128 // start at next cache line (we've already touched in 1st line)
+ cmpldi cr7,r8,128 // if (source-dest)<128, cannot use dcbz128 because of overlap
+ cror noncache,cr7_lt,noncache // turn on "noncache" flag if (source-dest)<128
+ bt-- noncache,noncache3 // (HACK) Skip cache touch if noncachable
+ dcbt128 r9,r6,1 // start forward stream
+noncache3:
+ mtctr r10
+
+ ld r0,0(r6) // start pipe: load 1st half-line
+ ld r2,8(r6)
+ ld r7,16(r6)
+ ld r8,24(r6)
+ ld r9,32(r6)
+ ld r10,40(r6)
+ ld r11,48(r6)
+ ld r12,56(r6)
+ b c64InnerLoopEntryPt
+
+ .align 5 // align inner loop
+c64InnerLoop: // loop copying 128-byte cache lines to 128-aligned destination
+ std r0,64(r4) // store 2nd half of chunk n
+ ld r0,0(r6) // load 1st half of chunk n+1
+ std r2,72(r4)
+ ld r2,8(r6)
+ std r7,80(r4)
+ ld r7,16(r6)
+ std r8,88(r4)
+ ld r8,24(r6)
+ std r9,96(r4)
+ ld r9,32(r6)
+ std r10,104(r4)
+ ld r10,40(r6)
+ std r11,112(r4)
+ ld r11,48(r6)
+ std r12,120(r4)
+ ld r12,56(r6)
+ addi r4,r4,128 // advance to next dest chunk
+c64InnerLoopEntryPt: // initial entry into loop, with 1st halfline loaded
+ bt noncache,c64InnerLoop1 // skip if uncached or overlap
+ dcbz128 0,r4 // avoid prefetch of next cache line
+c64InnerLoop1:
+ std r0,0(r4) // store 1st half of chunk n
+ ld r0,64(r6) // load 2nd half of chunk n
+ std r2,8(r4)
+ ld r2,72(r6)
+ std r7,16(r4)
+ ld r7,80(r6)
+ std r8,24(r4)
+ ld r8,88(r6)
+ std r9,32(r4)
+ ld r9,96(r6)
+ std r10,40(r4)
+ ld r10,104(r6)
+ std r11,48(r4)
+ ld r11,112(r6)
+ std r12,56(r4)
+ ld r12,120(r6)
+ addi r6,r6,128 // advance to next source chunk if any
+ bdnz c64InnerLoop // loop if more chunks
+
+ std r0,64(r4) // store 2nd half of last chunk
+ std r2,72(r4)
+ std r7,80(r4)
+ std r8,88(r4)
+ std r9,96(r4)
+ std r10,104(r4)
+ std r11,112(r4)
+ std r12,120(r4)
+ addi r4,r4,128 // advance to next dest chunk
+
+c64double7: // r5 <- leftover bytes, cr1 set on doubleword count
+ rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords (0-15)
+ andi. r5,r5,7 // r5/cr0 <- count of leftover bytes (0-7)
+ beq cr1,c64byte // no leftover doublewords
+ mtctr r0
+ b c64double8
+
+ .align 5 // align inner loop
+c64double8: // loop copying leftover doublewords
+ ld r0,0(r6)
+ addi r6,r6,8
+ std r0,0(r4)
+ addi r4,r4,8
+ bdnz c64double8
+
+
+// Forward byte loop.
+
+c64byte: // r5/cr0 <- byte count (can be big if unaligned uncached)
+ beq bcpydone // done if no leftover bytes
+ mtctr r5
+ b c64byte1
+
+ .align 5 // align inner loop
+c64byte1:
+ lbz r0,0(r6)
+ addi r6,r6,1
+ stb r0,0(r4)
+ addi r4,r4,1
+ bdnz c64byte1
+
+ b bcpydone
+
+
+// Uncached copies. We must avoid unaligned accesses, since they always take alignment
+// exceptions on uncached memory on 64-bit processors. This may mean we copy long operands
+// a byte at a time, but that is still much faster than alignment exceptions.
+// r4-r6 = dest, length, source (as above)
+// r2 = mask of 1s for leading 0s in length, plus 1 extra 1
+// r7 = #bytes to copy to 128-byte align dest (limited by operand length)
+// cr1 = blt if reverse move required
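+//
+// The dispatch below keys off the relative alignment (source XOR dest): if its low
+// 3 bits are clear we can use doubleword loads/stores, else if its low 2 bits are
+// clear we fall into the 32-bit word code (G4word/G4reverseWord), and otherwise we
+// copy a byte at a time.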
+
+c64uncached:
+ xor r0,r6,r4 // get relative alignment
+ rlwinm r10,r0,0,29,31 // relatively doubleword aligned?
+ rlwinm r11,r0,0,30,31 // relatively word aligned?
+ not r8,r2 // get mask to limit initial length of copy for G4word
+ blt cr1,c64reverseUncached
+
+ cmpwi cr0,r10,0 // set cr0 beq if doubleword aligned
+ cmpwi cr1,r11,0 // set cr1 beq if word aligned
+ beq cr0,c64double // doubleword aligned
+ beq cr1,G4word // word aligned, use G3/G4 code
+ cmpwi r5,0 // set cr0 on byte count
+ b c64byte // unaligned operands
+
+c64reverseUncached:
+ cmpwi cr0,r10,0 // set cr0 beq if doubleword aligned
+ cmpwi cr1,r11,0 // set cr1 beq if word aligned
+ beq cr0,c64rdouble // doubleword aligned so can use LD/STD
+ beq cr1,G4reverseWord // word aligned, use G3/G4 code
+ add r6,r6,r5 // point to (end+1) of source and dest
+ add r4,r4,r5
+ cmpwi r5,0 // set cr0 on length
+ b c64rbyte // copy a byte at a time
+
+
+
+// Reverse doubleword copies. This is used for all cached copies, and doubleword
+// aligned uncached copies.
+// r4 = destination (32 or 64-bit ptr)
+// r5 = length (always 32 bits)
+// r6 = source (32 or 64-bit ptr)
+// cr5 = noncache, fixxlate, and restorex flags set
+
+c64rdouble:
+ add r6,r6,r5 // point to (end+1) of source and dest
+ add r4,r4,r5
+ rlwinm. r7,r4,0,29,31 // r7 <- #bytes to doubleword align dest
+ cmplw cr1,r7,r5 // operand long enough to doubleword align?
+ blt cr1,c64rd0 // yes
+ mr r7,r5 // no
+c64rd0:
+ sub r5,r5,r7 // adjust length
+ srwi r8,r5,6 // r8 <- 64-byte chunks to xfer
+ cmpwi cr1,r8,0 // any chunks?
+ beq c64rd2 // source already doubleword aligned
+ mtctr r7
+
+c64rd1: // copy bytes until source doubleword aligned
+ lbzu r0,-1(r6)
+ stbu r0,-1(r4)
+ bdnz c64rd1
+
+c64rd2: // r8/cr1 <- count of 64-byte chunks
+ rlwinm r0,r5,29,29,31 // r0 <- count of leftover doublewords
+ andi. r5,r5,7 // r5/cr0 <- count of leftover bytes
+ cmpwi cr7,r0,0 // leftover doublewords?
+ beq cr1,c64rd4 // no chunks to xfer
+ li r9,-128 // start at next cache line
+ mtctr r8
+ bt noncache,c64rd3 // (HACK) Do not start a stream if noncachable...
+ dcbt128 r9,r6,3 // start reverse stream
+ b c64rd3
+
+ .align 5 // align inner loop
+c64rd3: // loop copying 64-byte chunks
+ ld r7,-8(r6)
+ ld r8,-16(r6)
+ ld r9,-24(r6)
+ ld r10,-32(r6)
+ ld r11,-40(r6)
+ ld r12,-48(r6)
+ std r7,-8(r4)
+ std r8,-16(r4)
+ ld r7,-56(r6)
+ ldu r8,-64(r6)
+ std r9,-24(r4)
+ std r10,-32(r4)
+ std r11,-40(r4)
+ std r12,-48(r4)
+ std r7,-56(r4)
+ stdu r8,-64(r4)
+ bdnz c64rd3
+
+c64rd4: // r0/cr7 = leftover doublewords, r5/cr0 = leftover bytes
+ beq cr7,c64rbyte // no leftover doublewords
+ mtctr r0
+
+c64rd5: // loop copying leftover doublewords
+ ldu r0,-8(r6)
+ stdu r0,-8(r4)
+ bdnz c64rd5
+
+
+// Reverse byte loop.
+
+c64rbyte: // r5/cr0 <- byte count (can be big if unaligned uncached)
+ beq bcpydone // done if no leftover bytes
+ mtctr r5
+
+c64rbyte1:
+ lbzu r0,-1(r6)
+ stbu r0,-1(r4)
+ bdnz c64rbyte1
+
+ b bcpydone
+