+ blr
+
+
+// Here on 64-bit processors, which have a 128-byte cache line. This can be
+// called in either 32- or 64-bit mode, which makes the test for reverse moves
+// a little tricky. We've already filtered out the (source==dest) and (len==0)
+// special cases.
+//
+// When entered:
+// r4 = destination (32 or 64-bit ptr)
+// r5 = length (always 32 bits)
+// r6 = source (32 or 64-bit ptr)
+// r12 = (dest - source), reverse move required if (dest-source)<length
+// cr5 = noncache flag
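+//
+// In C terms the forward/reverse decision is roughly (illustrative sketch; the
+// helper names are just for exposition):
+//
+//      if ((uint64_t)(dest - source) < (uint64_t)length)   // unsigned compare
+//          copy_reverse();     // dest lies inside the source operand
+//      else
+//          copy_forward();
+//
+// The subc/addze. sequence below performs this compare mode-independently.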
+
+ .align 5
+copyit64:
+ rlwinm r7,r5,0,0,31 // truncate length to 32 bits, in case we're running in 64-bit mode
+ cntlzw r11,r5 // get magnitude of length
+ dcbt 0,r6 // touch in 1st block of source
+ dcbtst 0,r4 // touch in 1st destination cache block
+ subc r7,r12,r7 // set Carry if (dest-source)>=length, in mode-independent way
+ li r0,0 // get a 0
+ lis r10,hi16(0x80000000)// get 0x80000000
+ addze. r0,r0 // set cr0 on carry bit (beq if reverse move required)
+ neg r9,r4 // start to get alignment for destination
+ sraw r8,r10,r11 // get mask based on operand length, to limit alignment
+ bt-- noncache,c64uncached// skip if uncached
+ beq-- c64rdouble // handle cached reverse moves
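+
+// Note on the mask built above (illustrative): cntlzw/sraw set r8 = ~(2**k - 1),
+// where 2**k is the largest power of two not exceeding the length. For example,
+// a length of 8..15 gives r8 = ~7, so the later "andc" instructions limit the
+// alignment byte count to at most 2**k - 1, which is always less than the length.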
+
+
+// Forward, cached or doubleword aligned uncached. This is the common case.
+// NOTE: we never do an unaligned access if the source and destination are "relatively"
+// doubleword aligned. We depend on this in the uncached case.
+// r4 = destination
+// r5 = length (>0)
+// r6 = source
+// r8 = inverse of largest mask smaller than operand length
+// r9 = neg(dest), used to compute alignment
+// cr5 = noncache flag
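+//
+// Roughly, the set-up below computes (illustrative C; names are for exposition only):
+//
+//      align     = (-(uint64_t)dest) & 127;  // bytes up to the next 128-byte boundary
+//      align    &= ~mask_r8;                 // andc with r8: never the whole operand
+//      byte_cnt  = align & 7;                // bytes to doubleword-align dest
+//      dword_cnt = align >> 3;               // then doublewords up to the 128-byte boundary
+//      len      -= align;                    // length remaining after alignment
+//      chunk_cnt = len >> 7;                 // full 128-byte chunks to move afterwards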
+
+c64double:
+ rlwinm r7,r9,0,0x7F // get #bytes to 128-byte align destination
+ andc r7,r7,r8 // limit by operand length
+ andi. r8,r7,7 // r8 <- #bytes to doubleword align
+ srwi r9,r7,3 // r9 <- #doublewords to 128-byte align
+ sub r5,r5,r7 // adjust length remaining
+ cmpwi cr1,r9,0 // any doublewords to move to cache align?
+ srwi r10,r5,7 // r10 <- 128-byte chunks to xfer after aligning dest
+ cmpwi cr7,r10,0 // set cr7 on chunk count
+ beq c64double2 // dest already doubleword aligned
+ mtctr r8
+ b c64double1
+
+ .align 5 // align inner loops
+c64double1: // copy bytes until dest is doubleword aligned
+ lbz r0,0(r6)
+ addi r6,r6,1
+ stb r0,0(r4)
+ addi r4,r4,1
+ bdnz c64double1
+
+c64double2: // r9/cr1=doublewords, r10/cr7=128-byte chunks
+ beq cr1,c64double4 // no doublewords to xfer in order to cache align
+ mtctr r9
+ b c64double3
+
+ .align 5 // align inner loops
+c64double3: // copy doublewords until dest is 128-byte aligned
+ ld r7,0(r6)
+ addi r6,r6,8
+ std r7,0(r4)
+ addi r4,r4,8
+ bdnz c64double3
+
+// Here to xfer 128-byte chunks, if any. Since we only have 8 GPRs free for
+// data (64 bytes), we make two 64-byte load/store passes per 128-byte chunk.
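+//
+// Per chunk, the inner loop below roughly does (illustrative):
+//      load doublewords 0..56 into 8 GPRs
+//      dcbz128 the destination line (cacheable, non-overlapping operands only)
+//      store doublewords 0..56
+//      load doublewords 64..120 and advance the source by 128
+//      store doublewords 64..120 and advance the destination by 128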
+
+c64double4: // r10/cr7=128-byte chunks
+ rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords, after moving chunks
+ cmpwi cr1,r0,0 // set cr1 on leftover doublewords
+ beq cr7,c64double7 // no 128-byte chunks
+
+ // We must check for (source-dest)<128 in a mode-independent way. If within 128 bytes,
+ // turn on "noncache" because we cannot use dcbz128 even if the operands are cacheable.
+
+ sub r8,r6,r4 // r8 <- (source - dest)
+ rldicr. r0,r8,0,63-7 // zero low 7 bits and check for 0, mode independent
+ cror noncache,cr0_eq,noncache // turn on "noncache" flag if (source-dest)<128
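+
+ // In C terms (illustrative only):
+ //     noncache |= (((uint64_t)(source - dest) & ~(uint64_t)127) == 0);  // 0 <= source-dest < 128
+ // because dcbz128 on the destination line could zero source bytes not yet loaded.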
+ mtctr r10
+ b c64InnerLoop
+
+ .align 5 // align inner loop
+c64InnerLoop: // loop copying 128-byte cache lines to 128-aligned destination
+ ld r0,0(r6) // start pipe: load 1st half-line
+ ld r2,8(r6)
+ ld r7,16(r6)
+ ld r8,24(r6)
+ ld r9,32(r6)
+ ld r10,40(r6)
+ ld r11,48(r6)
+ ld r12,56(r6)
+ bt noncache,c64InnerLoop1 // skip if uncached or overlap
+ dcbz128 0,r4 // zero dest line: it is fully overwritten, so don't fetch it from memory
+c64InnerLoop1:
+
+ std r0,0(r4)
+ std r2,8(r4)
+ std r7,16(r4)
+ std r8,24(r4)
+ std r9,32(r4)
+ std r10,40(r4)
+ std r11,48(r4)
+ std r12,56(r4)
+
+ ld r0,64(r6) // load 2nd half of chunk
+ ld r2,72(r6)
+ ld r7,80(r6)
+ ld r8,88(r6)
+ ld r9,96(r6)
+ ld r10,104(r6)
+ ld r11,112(r6)
+ ld r12,120(r6)
+ addi r6,r6,128
+
+ std r0,64(r4)
+ std r2,72(r4)
+ std r7,80(r4)
+ std r8,88(r4)
+ std r9,96(r4)
+ std r10,104(r4)
+ std r11,112(r4)
+ std r12,120(r4)
+ addi r4,r4,128 // advance to next dest chunk
+
+ bdnz c64InnerLoop // loop if more chunks
+
+
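+// Cleanup counts, roughly (illustrative): dwords_left = (len >> 3) & 15; bytes_left = len & 7.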
+c64double7: // r5 <- leftover bytes, cr1 set on doubleword count
+ rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords (0-15)
+ andi. r5,r5,7 // r5/cr0 <- count of leftover bytes (0-7)
+ beq cr1,c64byte // no leftover doublewords
+ mtctr r0
+ b c64double8
+
+ .align 5 // align inner loop
+c64double8: // loop copying leftover doublewords
+ ld r0,0(r6)
+ addi r6,r6,8
+ std r0,0(r4)
+ addi r4,r4,8
+ bdnz c64double8
+
+
+// Forward byte loop.
+
+c64byte: // r5/cr0 <- byte count (can be big if unaligned uncached)
+ beqlr // done if no leftover bytes
+ mtctr r5
+ b c64byte1
+
+ .align 5 // align inner loop
+c64byte1:
+ lbz r0,0(r6)
+ addi r6,r6,1
+ stb r0,0(r4)
+ addi r4,r4,1
+ bdnz c64byte1
+
+ blr
+
+
+// Uncached copies. We must avoid unaligned accesses, since they always take alignment
+// exceptions on uncached memory on 64-bit processors. This may mean we copy long operands
+// a byte at a time, but that is still much faster than alignment exceptions.
+// r4 = destination
+// r5 = length (>0)
+// r6 = source
+// r8 = inverse of largest mask smaller than operand length
+// r9 = neg(dest), used to compute alignment
+// r12 = (dest-source), used to test relative alignment
+// cr0 = beq if reverse move required
+// cr5 = noncache flag
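+//
+// The dispatch below keys off the low bits of (dest - source) (illustrative C):
+//
+//      if      (((dest - source) & 7) == 0)  // relatively doubleword aligned
+//          take the ld/std paths (c64double or c64rdouble);
+//      else if (((dest - source) & 3) == 0)  // relatively word aligned
+//          take the 32-bit G3/G4 paths (forward32bit or reverse32bit);
+//      else
+//          copy one byte at a time.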
+
+c64uncached:
+ rlwinm r10,r12,0,29,31 // relatively doubleword aligned?
+ rlwinm r11,r12,0,30,31 // relatively word aligned?
+ cmpwi cr7,r10,0 // set cr7 beq if doubleword aligned
+ cmpwi cr1,r11,0 // set cr1 beq if word aligned
+ beq-- c64reverseUncached
+
+ beq cr7,c64double // doubleword aligned
+ beq cr1,forward32bit // word aligned, use G3/G4 code
+ cmpwi r5,0 // set cr0 on byte count
+ b c64byte // unaligned operands
+
+c64reverseUncached:
+ beq cr7,c64rdouble // doubleword aligned so can use LD/STD
+ beq cr1,reverse32bit // word aligned, use G3/G4 code
+ add r6,r6,r5 // point to (end+1) of source and dest
+ add r4,r4,r5
+ cmpwi r5,0 // set cr0 on length
+ b c64rbyte // copy a byte at a time
+
+
+
+// Reverse doubleword copies. This is used for all cached reverse moves, and for
+// doubleword-aligned uncached reverse moves.
+// r4 = destination
+// r5 = length (>0)
+// r6 = source
+// r8 = inverse of largest mask of low-order 1s smaller than operand length
+// cr5 = noncache flag
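+//
+// The copy runs backwards: both pointers are first advanced to one past the end of
+// their operands, and pre-decrementing loads/stores (lbzu/stbu, ldu/stdu) then walk
+// down through memory, so overlapping source bytes are read before they are overwritten.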
+
+c64rdouble:
+ add r6,r6,r5 // point to (end+1) of source and dest
+ add r4,r4,r5
+ rlwinm r7,r4,0,29,31 // r7 <- #bytes to doubleword align dest
+ andc. r7,r7,r8 // limit by operand length
+ sub r5,r5,r7 // adjust length
+ srwi r8,r5,6 // r8 <- 64-byte chunks to xfer
+ cmpwi cr1,r8,0 // any chunks?
+ beq c64rd2 // dest already doubleword aligned
+ mtctr r7
+
+c64rd1: // copy bytes until dest is doubleword aligned
+ lbzu r0,-1(r6)
+ stbu r0,-1(r4)
+ bdnz c64rd1
+
+c64rd2: // r8/cr1 <- count of 64-byte chunks
+ rlwinm r0,r5,29,29,31 // r0 <- count of leftover doublewords
+ andi. r5,r5,7 // r5/cr0 <- count of leftover bytes
+ cmpwi cr7,r0,0 // leftover doublewords?
+ beq cr1,c64rd4 // no chunks to xfer
+ mtctr r8
+ b c64rd3
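+
+ // The loop below moves 64 bytes per pass, interleaving loads with stores so the
+ // stores can overlap the remaining load latency; the ldu/stdu pair also steps
+ // r6 and r4 down by 64 for the next iteration.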
+
+ .align 5 // align inner loop
+c64rd3: // loop copying 64-byte chunks
+ ld r7,-8(r6)
+ ld r8,-16(r6)
+ ld r9,-24(r6)
+ ld r10,-32(r6)
+ ld r11,-40(r6)
+ ld r12,-48(r6)
+ std r7,-8(r4)
+ std r8,-16(r4)
+ ld r7,-56(r6)
+ ldu r8,-64(r6)
+ std r9,-24(r4)
+ std r10,-32(r4)
+ std r11,-40(r4)
+ std r12,-48(r4)
+ std r7,-56(r4)
+ stdu r8,-64(r4)
+ bdnz c64rd3
+
+c64rd4: // r0/cr7 = leftover doublewords, r5/cr0 = leftover bytes
+ beq cr7,c64rbyte // no leftover doublewords
+ mtctr r0
+
+c64rd5: // loop copying leftover doublewords
+ ldu r0,-8(r6)
+ stdu r0,-8(r4)
+ bdnz c64rd5
+
+
+// Reverse byte loop.
+
+c64rbyte: // r5/cr0 <- byte count (can be big if unaligned uncached)
+ beqlr // done if no leftover bytes
+ mtctr r5
+
+c64rbyte1:
+ lbzu r0,-1(r6)
+ stbu r0,-1(r4)
+ bdnz c64rbyte1
+
+ blr
+