+/*
+ * Copyright (c) 1992-2001 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License"). You may not use this file except in compliance with the
+ * License. Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ *
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+#include <architecture/ppc/asm_help.h>
+
+// =================================================================================================
+// *** The easiest way to assemble things on Mac OS X is via "cc", so this uses #defines and such.
+// =================================================================================================
+
+// Keep track of whether we have Altivec
+// This gets set in pthread_init()
+
+.data
+.align 2
+.globl __cpu_has_altivec
+__cpu_has_altivec:
+.long 0
+
+.text
+.align 2
+.globl _bcopy
+.globl _memcpy
+.globl _memmove
+
+_bcopy:
+ mr r2,r4 // Since bcopy uses (src,dest,count), swap r3,r4
+ mr r4,r3
+ mr r3,r2
+_memcpy:
+_memmove:
+ mr r2,r3 // Store dest ptr in r2 to preserve r3 on return
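+
+// For reference, the C prototypes implemented by these entry points
+// (memcpy and memmove must return dest, hence r3 is kept intact):
+//
+// void bcopy(const void *src, void *dst, size_t len);
+// void *memcpy(void *dst, const void *src, size_t len);
+// void *memmove(void *dst, const void *src, size_t len);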
+
+// ------------------
+// Standard registers
+
+#define rs r4
+#define rd r2
+#define rc r5
+
+// Should we bother using Altivec?
+
+ cmpwi r5, 128
+ blt+ LScalar
+
+// Determine whether we have Altivec enabled
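+// ("bcl 20,31,1f" is the always-taken branch-and-link form that, by
+// architectural convention, does not disturb the hardware link-register
+// prediction stack; mflr then yields the address of label 1, and the
+// flag is addressed PC-relative via ha16/lo16 arithmetic.)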
+
+ mflr r0
+ bcl 20,31,1f
+1:
+ mflr r6
+ mtlr r0
+ addis r6, r6, ha16(__cpu_has_altivec - 1b)
+ lwz r6, lo16(__cpu_has_altivec - 1b)(r6)
+ cmpwi r6, 0
+ bne+ LAltivec
+
+// =================================================================================================
+
+// *****************************************
+// * S c a l a r B l o c k M o o f D a t a *
+// *****************************************
+//
+// This is the scalar (non-AltiVec) version of BlockMoofData.
+//
+// void ScalarBlockMoofData (ptr src, ptr dest, long len)
+//
+// Entry conditions (after the entry-point shuffle above):
+// rs (r4) = source pointer
+// rd (r2) = destination pointer
+// rc (r5) = length in bytes
+//
+// Uses: all volatile registers.
+
+LScalar:
+ cmplwi cr7,rc,32 // length <= 32 bytes?
+ cmplw cr6,rd,rs // up or down?
+ mr. r0,rc // copy to r0 for MoveShort, and test for negative
+ bgt cr7,Lbm1 // skip if count > 32
+
+// Handle short moves (<=32 bytes.)
+
+ beq cr7,LMove32 // special case 32-byte blocks
+ blt cr6,LMoveDownShort // move down in memory and return
+ add rs,rs,rc // moving up (right-to-left), so adjust pointers
+ add rd,rd,rc
+ b LMoveUpShort // move up in memory and return
+
+// Handle long moves (>32 bytes.)
+
+Lbm1:
+ beqlr cr6 // rs==rd, so nothing to move
+ bltlr cr0 // length<0, so ignore call and return
+ mflr r12 // save return address
+ bge cr6,Lbm2 // rd>=rs, so move up
+
+// Long moves down (left-to-right.)
+
+ neg r6,rd // start to 32-byte-align destination
+ andi. r0,r6,0x1F // r0 <- bytes to move to align destination
+ bnel LMoveDownShort // align destination if necessary
+ bl LMoveDownLong // move 32-byte chunks down
+ andi. r0,rc,0x1F // done?
+ mtlr r12 // restore caller's return address
+ bne LMoveDownShort // move trailing leftover bytes and done
+ blr // no leftovers, so done
+
+// Long moves up (right-to-left.)
+
+Lbm2:
+ add rs,rs,rc // moving up (right-to-left), so adjust pointers
+ add rd,rd,rc
+ andi. r0,rd,0x1F // r0 <- bytes to move to align destination
+ bnel LMoveUpShort // align destination if necessary
+ bl LMoveUpLong // move 32-byte chunks up
+ andi. r0,rc,0x1F // done?
+ mtlr r12 // restore caller's return address
+ bne LMoveUpShort // move trailing leftover bytes and done
+ blr // no leftovers, so done
+
+// ***************
+// * M O V E 3 2 *
+// ***************
+//
+// Special case subroutine to move a 32-byte block. MoveDownShort and
+// MoveUpShort only handle 0..31 bytes, and we believe 32 bytes is too
+// common a case to send it through the general purpose long-block code.
+// Since it moves both up and down, we must load all 32 bytes before
+// storing any.
+//
+// Calling Sequence: rs = source ptr
+// rd = destination ptr
+//
+// Uses: r0,r5-r11.
+//
+
+LMove32:
+ lwz r0,0(rs)
+ lwz r5,4(rs)
+ lwz r6,8(rs)
+ lwz r7,12(rs)
+ lwz r8,16(rs)
+ lwz r9,20(rs)
+ lwz r10,24(rs)
+ lwz r11,28(rs)
+ stw r0,0(rd)
+ stw r5,4(rd)
+ stw r6,8(rd)
+ stw r7,12(rd)
+ stw r8,16(rd)
+ stw r9,20(rd)
+ stw r10,24(rd)
+ stw r11,28(rd)
+ blr
+
+
+// *************************
+// * M o v e U p S h o r t *
+// *************************
+//
+// Subroutine called to move <32 bytes up in memory (ie, right-to-left).
+//
+// Entry conditions: rs = ptr one byte past the source bytes to move
+// rd = ptr one byte past the destination bytes
+// r0 = #bytes to move (0..31)
+//
+// Exit conditions: rs = updated source ptr
+// rd = updated destination ptr
+// rc = decremented by #bytes moved
+//
+// Uses: r0,r6,r7,r8,cr7.
+//
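+// The count is decoded bitwise: andi. tests the 0x10 bit, and mtcrf
+// copies the low nibble into cr7 so that bf 28/29/30/31 can branch on
+// the 0x8/0x4/0x2/0x1 bits without further compares.
+//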
+
+LMoveUpShort:
+ andi. r6,r0,0x10 // test 0x10 bit in length
+ mtcrf 0x1,r0 // move count to cr7 so we can test bits
+ sub rc,rc,r0 // decrement count of bytes remaining to be moved
+ beq Lmus1 // skip if 0x10 bit in length is 0
+ lwzu r0,-16(rs) // set, so copy up 16 bytes
+ lwz r6,4(rs)
+ lwz r7,8(rs)
+ lwz r8,12(rs)
+ stwu r0,-16(rd)
+ stw r6,4(rd)
+ stw r7,8(rd)
+ stw r8,12(rd)
+
+Lmus1:
+ bf 28,Lmus2 // test 0x08 bit
+ lwzu r0,-8(rs)
+ lwz r6,4(rs)
+ stwu r0,-8(rd)
+ stw r6,4(rd)
+
+Lmus2:
+ bf 29,Lmus3 // test 0x4 bit
+ lwzu r0,-4(rs)
+ stwu r0,-4(rd)
+
+Lmus3:
+ bf 30,Lmus4 // test 0x2 bit
+ lhzu r0,-2(rs)
+ sthu r0,-2(rd)
+
+Lmus4:
+ bflr 31 // test 0x1 bit, return if 0
+ lbzu r0,-1(rs)
+ stbu r0,-1(rd)
+ blr
+
+
+// *****************************
+// * M o v e D o w n S h o r t *
+// *****************************
+//
+// Subroutine called to move <32 bytes down in memory (ie, left-to-right).
+//
+// Entry conditions: rs = source pointer
+// rd = destination pointer
+// r0 = #bytes to move (0..31)
+//
+// Exit conditions: rs = ptr to 1st byte not moved
+// rd = ptr to 1st byte not moved
+// rc = decremented by #bytes moved
+//
+// Uses: r0,r6,r7,r8,cr7.
+//
+
+LMoveDownShort:
+ andi. r6,r0,0x10 // test 0x10 bit in length
+ mtcrf 0x1,r0 // move count to cr7 so we can test bits
+ sub rc,rc,r0 // decrement count of bytes remaining to be moved
+ beq Lmds1 // skip if 0x10 bit in length is 0
+ lwz r0,0(rs) // set, so copy down 16 bytes
+ lwz r6,4(rs)
+ lwz r7,8(rs)
+ lwz r8,12(rs)
+ addi rs,rs,16
+ stw r0,0(rd)
+ stw r6,4(rd)
+ stw r7,8(rd)
+ stw r8,12(rd)
+ addi rd,rd,16
+
+Lmds1:
+ bf 28,Lmds2 // test 0x08 bit
+ lwz r0,0(rs)
+ lwz r6,4(rs)
+ addi rs,rs,8
+ stw r0,0(rd)
+ stw r6,4(rd)
+ addi rd,rd,8
+
+Lmds2:
+ bf 29,Lmds3 // test 0x4 bit
+ lwz r0,0(rs)
+ addi rs,rs,4
+ stw r0,0(rd)
+ addi rd,rd,4
+
+Lmds3:
+ bf 30,Lmds4 // test 0x2 bit
+ lhz r0,0(rs)
+ addi rs,rs,2
+ sth r0,0(rd)
+ addi rd,rd,2
+
+Lmds4:
+ bflr 31 // test 0x1 bit, return if 0
+ lbz r0,0(rs)
+ addi rs,rs,1
+ stb r0,0(rd)
+ addi rd,rd,1
+ blr
+
+
+// ***********************
+// * M o v e U p L o n g *
+// ***********************
+//
+// Subroutine to move 32-byte chunks of memory up (ie, right-to-left.)
+// The destination is known to be 32-byte aligned, but the source is
+// *not* necessarily aligned.
+//
+// Entry conditions: rs = ptr one byte past the source bytes to move
+// rd = ptr one byte past the destination (32-byte aligned)
+// rc = count of bytes to move
+//
+// Exit conditions: rs = updated source ptr
+// rd = updated destination ptr
+// rc = low order 8 bits of count of bytes to move
+//
+// Uses: r0,r5-r11,fr0-fr3,ctr,cr0,cr6,cr7.
+//
+
+LMoveUpLong:
+ srwi. r11,rc,5 // r11 <- #32 byte chunks to move
+ mtctr r11 // prepare loop count
+ beqlr // return if no chunks to move
+ andi. r0,rs,7 // is source at least doubleword aligned?
+ beq Lmup3 // yes, can optimize this case
+ mtcrf 0x1,rc // save low bits of count
+ mtcrf 0x2,rc // (one cr at a time, as 604 prefers)
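+ // (The two mtcrf's park the low byte of rc in cr6/cr7 across the
+ // loop; mfcr below recovers it. Only the low 5 bits matter, since
+ // the caller masks the result with 0x1F.)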
+
+Lmup1: // loop over each 32-byte-chunk
+ lwzu r0,-32(rs)
+ subi rd,rd,32 // pre-decrement destination ptr for the stores below
+ lwz r5,4(rs)
+ lwz r6,8(rs)
+ lwz r7,12(rs)
+ lwz r8,16(rs)
+ lwz r9,20(rs)
+ lwz r10,24(rs)
+ lwz r11,28(rs)
+ stw r0,0(rd)
+ stw r5,4(rd)
+ stw r6,8(rd)
+ stw r7,12(rd)
+ stw r8,16(rd)
+ stw r9,20(rd)
+ stw r10,24(rd)
+ stw r11,28(rd)
+ bdnz Lmup1
+ mfcr rc // restore low bits of count
+ blr // return to caller
+
+// Aligned operands, so use d.p. floating point registers to move data.
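+// (On 32-bit PowerPC the FPRs are the only 64-bit registers, so lfd/stfd
+// halve the number of memory operations. This is safe for arbitrary bit
+// patterns: the values are only loaded and stored, never used in
+// floating-point arithmetic.)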
+
+Lmup3:
+ lfdu f0,-32(rs)
+ subi rd,rd,32 // pre-decrement destination ptr for the stores below
+ lfd f1,8(rs)
+ lfd f2,16(rs)
+ lfd f3,24(rs)
+ stfd f0,0(rd)
+ stfd f1,8(rd)
+ stfd f2,16(rd)
+ stfd f3,24(rd)
+ bdnz Lmup3
+ blr // return to caller
+
+
+// ***************************
+// * M o v e D o w n L o n g *
+// ***************************
+//
+// Subroutine to move 32-byte chunks of memory down (ie, left-to-right.)
+// The destination is known to be 32-byte aligned, but the source is
+// *not* necessarily aligned.
+//
+// Entry conditions: rs = source ptr (next byte to move)
+// rd = dest ptr (next byte to move into)
+// rc = count of bytes to move
+//
+// Exit conditions: rs = updated source ptr
+// rd = updated destination ptr
+// rc = low order 8 bits of count of bytes to move
+//
+// Uses: r0,r5-r11,fr0-fr3,ctr,cr0,cr6,cr7.
+//
+
+LMoveDownLong:
+ srwi. r11,rc,5 // r11 <- #32 byte chunks to move
+ mtctr r11 // prepare loop count
+ beqlr // return if no chunks to move
+ andi. r0,rs,7 // is source at least doubleword aligned?
+ beq Lmdown3 // yes, can optimize this case
+ mtcrf 0x1,rc // save low 8 bits of count
+ mtcrf 0x2,rc // (one cr at a time, as 604 prefers)
+
+Lmdown1: // loop over each 32-byte-chunk
+ lwz r0,0(rs)
+ lwz r5,4(rs)
+ lwz r6,8(rs)
+ lwz r7,12(rs)
+ lwz r8,16(rs)
+ lwz r9,20(rs)
+ lwz r10,24(rs)
+ lwz r11,28(rs)
+ stw r0,0(rd)
+ stw r5,4(rd)
+ stw r6,8(rd)
+ stw r7,12(rd)
+ stw r8,16(rd)
+ stw r9,20(rd)
+ addi rs,rs,32
+ stw r10,24(rd)
+ stw r11,28(rd)
+ addi rd,rd,32
+ bdnz Lmdown1
+ mfcr rc // restore low bits of count
+ blr // return to caller
+
+// Aligned operands, so use d.p. floating point registers to move data.
+
+Lmdown3:
+ lfd f0,0(rs)
+ lfd f1,8(rs)
+ lfd f2,16(rs)
+ lfd f3,24(rs)
+ addi rs,rs,32
+ stfd f0,0(rd)
+ stfd f1,8(rd)
+ stfd f2,16(rd)
+ stfd f3,24(rd)
+ addi rd,rd,32
+ bdnz Lmdown3
+ blr // return to caller
+
+//
+// Register use conventions are as follows:
+//
+// r0 - temp
+// r6 - copy of VMX SPR at entry
+// r7 - temp
+// r8 - constant -1 (also temp and a string op buffer)
+// r9 - constant 16 or -17 (also temp and a string op buffer)
+// r10- constant 32 or -33 (also temp and a string op buffer)
+// r11- constant 48 or -49 (also temp and a string op buffer)
+// r12- chunk count ("c") in long moves
+//
+// v0 - vp - permute vector
+// v1 - va - 1st quadword of source
+// v2 - vb - 2nd quadword of source
+// v3 - vc - 3rd quadword of source
+// v4 - vd - 4th quadword of source
+// v5 - vx - temp
+// v6 - vy - temp
+// v7 - vz - temp
+
+#define vp v0
+#define va v1
+#define vb v2
+#define vc v3
+#define vd v4
+#define vx v5
+#define vy v6
+#define vz v7
+
+#define VRSave 256
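+
+// (VRSave, SPR 256, is a bitmask of vector registers currently in use;
+// the kernel consults it to decide which vector registers to save and
+// restore on a context switch, so bits for v0-v7 must be set below.)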
+
+// kShort is the crossover point at which the long algorithm becomes
+// faster than the short one. WARNING: kShort must be >= 64.
+//
+// (The scalar-vs-vector test above already guarantees rc >= 128 here,
+// so with kShort == 128 only rc == 128 takes the short vector path.)
+
+#define kShort 128
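+
+// (Parenthesized numbers in the comments below appear to be estimated
+// machine cycles, left over from scheduling the code.)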
+LAltivec:
+ cmpwi cr1,rc,kShort //(1) too short to bother using vector regs?
+ sub. r0,rd,rs //(1) must move reverse if (rd-rs)<rc
+ dcbt 0,rs //(2) prefetch first source block
+ cmplw cr6,r0,rc //(2) set cr6 blt iff we must move reverse
+ beqlr- //(2) done if src==dest
+ srawi. r9,rc,4 //(3) r9 <- quadwords to move, test for zero or negative
+ or r8,rs,rd //(3) start to check for word alignment
+ dcbtst 0,rd //(4) prefetch first destination block
+ rlwinm r8,r8,0,30,31 //(4) r8 is zero if word aligned
+ bgt- cr1,LMoveLong //(4) handle long operands
+ cmpwi cr1,r8,0 //(5) word aligned?
+ rlwinm r7,rc,0,28,31 //(5) r7 <- leftover bytes to move after quadwords
+ bltlr- //(5) done if negative count
+ blt- cr6,LShortReverse //(5) handle reverse moves
+ cmpwi cr7,r7,0 //(6) leftover bytes?
+ beq- Leftovers //(6) r9==0, so no quadwords to move
+ mtctr r9 //(7) set up for quadword loop
+ bne- cr1,LUnalignedLoop //(7) not word aligned (less common than word aligned)
+
+
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+// <><> S H O R T O P E R A N D S <><>
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+
+LAlignedLoop: // word aligned operands (the common case)
+ lfd f0,0(rs) //(1)
+ lfd f1,8(rs) //(2)
+ addi rs,rs,16 //(2)
+ stfd f0,0(rd) //(3)
+ stfd f1,8(rd) //(4)
+ addi rd,rd,16 //(4)
+ bdnz LAlignedLoop //(4)
+
+Leftovers:
+ beqlr- cr7 //(8) done if r7==0, ie no leftover bytes
+ mtxer r7 //(9) count of bytes to move (1-15)
+ lswx r8,0,rs
+ stswx r8,0,rd
+ blr //(17)
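+
+// (In Leftovers above, mtxer loads the transfer byte count into XER;
+// lswx/stswx then move that many bytes between memory and consecutive
+// GPRs starting at r8, so counts of 1-15 touch at most r8-r11.)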
+
+LUnalignedLoop: // not word aligned, cannot use lfd/stfd
+ lwz r8,0(rs) //(1)
+ lwz r9,4(rs) //(2)
+ lwz r10,8(rs) //(3)
+ lwz r11,12(rs) //(4)
+ addi rs,rs,16 //(4)
+ stw r8,0(rd) //(5)
+ stw r9,4(rd) //(6)
+ stw r10,8(rd) //(7)
+ stw r11,12(rd) //(8)
+ addi rd,rd,16 //(8)
+ bdnz LUnalignedLoop //(8)
+
+ b Leftovers
+
+
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+// <><> S H O R T R E V E R S E M O V E S <><>
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+
+ // cr0 & r9 <- #quadwords to move (>=0)
+ // cr1 <- beq if word aligned
+ // r7 <- #leftover bytes to move (0-15)
+
+LShortReverse:
+ cmpwi cr7,r7,0 // leftover bytes?
+ add rs,rs,rc // point 1 past end of string for reverse moves
+ add rd,rd,rc
+ beq- LeftoversReverse // r9==0, ie no quadwords to move
+ mtctr r9 // set up for quadword loop
+ bne- cr1,LUnalignedLoopReverse
+
+LAlignedLoopReverse: // word aligned, so use lfd/stfd
+ lfd f0,-8(rs)
+ lfdu f1,-16(rs)
+ stfd f0,-8(rd)
+ stfdu f1,-16(rd)
+ bdnz LAlignedLoopReverse
+
+LeftoversReverse:
+ beqlr- cr7 // done if r7==0, ie no leftover bytes
+ mtxer r7 // count of bytes to move (1-15)
+ neg r7,r7 // index back by #bytes
+ lswx r8,r7,rs
+ stswx r8,r7,rd
+ blr
+
+LUnalignedLoopReverse: // not word aligned, cannot use lfd/stfd
+ lwz r8,-4(rs)
+ lwz r9,-8(rs)
+ lwz r10,-12(rs)
+ lwzu r11,-16(rs)
+ stw r8,-4(rd)
+ stw r9,-8(rd)
+ stw r10,-12(rd)
+ stwu r11,-16(rd)
+ bdnz LUnalignedLoopReverse
+
+ b LeftoversReverse
+
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+// <><> L O N G O P E R A N D S <><>
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+
+ // cr6 set (blt) if must move reverse
+ // r0 <- (rd - rs)
+
+LMoveLong:
+ mfspr r6,VRSave //(5) save caller's VMX mask register
+ stw r6,-4(r1) // stash in the red zone below SP so we can use r6 later
+ neg r8,rd //(5) start to compute #bytes to fill in 1st dest quadword
+ rlwinm r0,r0,0,28,31 //(6) start to determine relative alignment
+ andi. r7,r8,0xF //(6) r7 <- #bytes to fill in 1st dest quadword
+ cmpwi cr7,r0,0 //(7) relatively aligned? (ie, 16 bytes apart?)
+ oris r9,r6,0xFF00 //(7) light bits for regs we use (v0-v7)
+ mtspr VRSave,r9 //(8) update live register bitmask
+ blt- cr6,LongReverse //(8) must move reverse direction
+ sub rc,rc,r7 //(9) adjust length while we wait
+ beq- LDest16Aligned //(9) r7==0, ie destination already quadword aligned
+
+ // Align destination on a quadword.
+
+ mtxer r7 //(10) set up byte count (1-15)
+ lswx r8,0,rs // load into r8-r11
+ stswx r8,0,rd // store r8-r11 (measured latency on arthur is 7.2 cycles)
+ add rd,rd,r7 //(18) adjust ptrs
+ add rs,rs,r7 //(18)
+
+ // Begin preparation for the inner loop. (Prefetching here is done
+ // with the dcbt lines marked //SKP rather than a "dst" stream.)
+
+LDest16Aligned:
+ andi. r0,rd,0x10 //(19) is destination cache-block aligned?
+ li r9,16 //(19) r9 <- constant used to access 2nd quadword
+ li r10,32 //(20) r10<- constant used to access 3rd quadword
+ beq- cr7,LAligned //(20) handle relatively aligned operands
+ lvx va,0,rs //(20) prefetch 1st source quadword
+ li r11,48 //(21) r11<- constant used to access 4th quadword
+ lvsl vp,0,rs //(21) get permute vector to left shift
+ beq LDest32Aligned //(22) destination already cache-block aligned
+
+ // Copy 16 bytes to align destination on 32-byte (cache block) boundary
+ // to maximize store gathering.
+
+ lvx vb,r9,rs //(23) get 2nd source qw
+ subi rc,rc,16 //(23) adjust count
+ addi rs,rs,16 //(24) adjust source ptr
+ vperm vx,va,vb,vp //(25) vx <- 1st destination qw
+ vor va,vb,vb //(25) va <- vb
+ stvx vx,0,rd //(26) assuming store Q deep enough to avoid latency
+ addi rd,rd,16 //(26) adjust dest ptr
+
+ // Destination 32-byte aligned, source alignment unknown.
+
+LDest32Aligned:
+ srwi. r12,rc,6 //(27) r12<- count of 64-byte chunks to move
+ rlwinm r7,rc,28,30,31 //(27) r7 <- count of 16-byte chunks to move
+ cmpwi cr1,r7,0 //(28) remember if any 16-byte chunks
+ rlwinm r8,r12,0,26,31 //(29) mask chunk count down to 0-63
+ subi r0,r8,1 //(30) r8==0?
+ beq- LNoChunks //(30) r12==0, ie no chunks to move
+ rlwimi r8,r0,0,25,25 //(31) if r8==0, then r8 <- 64
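+ // (Branch-free "0 becomes 64": r0 = r8-1 is all ones iff r8 was
+ // zero; rlwimi copies only bit 25, with value 64, from r0 into r8,
+ // so r8==0 turns into 64 while values 1-63 pass through unchanged.)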
+ li r0,64 //(31) r0 <- used to get 1st quadword of next chunk
+ sub. r12,r12,r8 //(32) adjust chunk count, set cr0
+ mtctr r8 //(32) set up loop count
+ li r8,96 //SKP
+ li r6,128 //SKP
+ // Inner loop for unaligned sources. We copy 64 bytes per iteration.
+ // We loop at most 64 times, then restart the loop for the next 4KB.
+ // (Prefetch here is via the dcbt lines marked //SKP.) This loop is
+ // tuned to keep the CPU flat out, which means we need to execute a
+ // lvx or stvx every cycle.
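+ // (Unaligned-source technique: lvx ignores the low four address bits,
+ // so consecutive lvx's fetch the aligned quadwords straddling the
+ // source. vperm with vp = lvsl(0,rs) then shifts each adjacent pair
+ // left so the stores receive the bytes starting at the true, unaligned
+ // source address.)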
+
+LoopBy64:
+ dcbt rs,r8 //SKP
+ dcbt rs,r6 //SKP
+ lvx vb,r9,rs //(1) 2nd source quadword (1st already in va)
+ lvx vc,r10,rs //(2) 3rd
+ lvx vd,r11,rs //(3) 4th
+ vperm vx,va,vb,vp //(3) vx <- 1st destination quadword
+ lvx va,rs,r0 //(4) get 1st qw of next 64-byte chunk (r0 must be RB!)
+ vperm vy,vb,vc,vp //(4) vy <- 2nd dest qw
+ stvx vx,0,rd //(5)
+ vperm vz,vc,vd,vp //(5) vz <- 3rd dest qw
+ stvx vy,r9,rd //(6)
+ vperm vx,vd,va,vp //(6) vx <- 4th
+ stvx vz,r10,rd //(7)
+ addi rs,rs,64 //(7)
+ stvx vx,r11,rd //(8)
+ addi rd,rd,64 //(8)
+ bdnz LoopBy64 //(8)
+
+ // End of inner loop. Should we restart the loop for the next 4KB?
+ // This block is only executed when we're moving more than 4KB.
+ // It is usually folded out because cr0 is set in the loop prologue.
+
+ beq+ LNoChunks // r12==0, ie no more chunks to move
+ sub. r12,r12,r0 // set cr0 if more than 4KB remain to xfer
+ mtctr r0 // initialize loop count to 64
+ b LoopBy64 // restart inner loop, xfer another 4KB
+
+ // Fewer than 64 bytes remain to be moved.
+
+LNoChunks: // r7 and cr1 are set with the number of QWs
+ andi. rc,rc,0xF //(33) rc <- leftover bytes
+ beq- cr1,LCleanup //(33) r7==0, ie fewer than 16 bytes remaining
+ mtctr r7 //(34) we will loop over 1-3 QWs
+
+LoopBy16:
+ lvx vb,r9,rs //(1) vb <- 2nd source quadword
+ addi rs,rs,16 //(1)
+ vperm vx,va,vb,vp //(3) vx <- next destination quadword
+ vor va,vb,vb //(3) va <- vb
+ stvx vx,0,rd //(4) assuming store Q is deep enough to mask latency
+ addi rd,rd,16 //(4)
+ bdnz LoopBy16 //(4)
+
+ // Move remaining bytes in last quadword. rc and cr0 have the count.
+
+LCleanup:
+ lwz r6,-4(r1) // reload caller's VRSave from the red zone
+ mtspr VRSave,r6 //(35) restore caller's live-register bitmask
+ beqlr //(36) rc==0, ie no leftovers, so done
+ mtxer rc //(37) load byte count (1-15)
+ lswx r8,0,rs
+ stswx r8,0,rd
+ blr //(45)
+
+
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+// <><> L O N G A L I G N E D M O V E S <><>
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+
+ // rs, rd <- both quadword aligned
+ // cr0 <- beq if dest is cache block (32-byte) aligned
+ // r9 <- 16
+ // r10 <- 32
+
+LAligned:
+ lvx va,0,rs // prefetch 1st source quadword
+ li r11,48 // r11<- constant used to access 4th quadword
+ beq LAligned32 // destination already cache-block aligned
+
+ // Copy 16 bytes to align destination on 32-byte (cache block) boundary
+ // to maximize store gathering.
+
+ subi rc,rc,16 // adjust count
+ addi rs,rs,16 // adjust source ptr
+ stvx va,0,rd // assuming store Q deep enough to avoid latency
+ addi rd,rd,16 // adjust dest ptr
+
+ // Destination 32-byte aligned, source 16-byte aligned. Set up for inner loop.
+
+LAligned32:
+ srwi. r12,rc,6 // r12<- count of 64-byte chunks to move
+ rlwinm r7,rc,28,30,31 // r7 <- count of 16-byte chunks to move
+ cmpwi cr1,r7,0 // remember if any 16-byte chunks
+ rlwinm r8,r12,0,26,31 // mask chunk count down to 0-63
+ subi r0,r8,1 // r8==0?
+ beq- LAlignedNoChunks // r12==0, ie no chunks to move
+ rlwimi r8,r0,0,25,25 // if r8==0, then r8 <- 64
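+ // (same branch-free "0 becomes 64" trick as in the unaligned path)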
+ li r0,64 // r0 <- used at end of loop
+ sub. r12,r12,r8 // adjust chunk count, set cr0
+ mtctr r8 // set up loop count
+ li r8,96 //SKP
+ li r6,128 //SKP
+
+ // Inner loop for aligned sources. We copy 64 bytes per iteration.
+
+LAlignedLoopBy64:
+ dcbt rs,r8 //SKP
+ dcbt rs,r6 //SKP
+ lvx va,0,rs //(1)
+ lvx vb,r9,rs //(2)
+ lvx vc,r10,rs //(3)
+ lvx vd,r11,rs //(4)
+ addi rs,rs,64 //(4)
+ stvx va,0,rd //(5)
+ stvx vb,r9,rd //(6)
+ stvx vc,r10,rd //(7)
+ stvx vd,r11,rd //(8)
+ addi rd,rd,64 //(8)
+ bdnz LAlignedLoopBy64 //(8)
+
+ // End of inner loop. Loop again for next 4KB iff any.
+
+ beq+ LAlignedNoChunks // r12==0, ie no more chunks to move
+ sub. r12,r12,r0 // set cr0 if more than 4KB remain to xfer
+ mtctr r0 // reinitialize loop count to 64
+ b LAlignedLoopBy64 // restart inner loop, xfer another 4KB
+
+ // Fewer than 64 bytes remain to be moved.
+
+LAlignedNoChunks: // r7 and cr1 are set with the number of QWs
+ andi. rc,rc,0xF // rc <- leftover bytes
+ beq- cr1,LCleanup // r7==0, ie fewer than 16 bytes remaining
+ mtctr r7 // we will loop over 1-3 QWs
+
+LAlignedLoopBy16:
+ lvx va,0,rs // get next quadword
+ addi rs,rs,16
+ stvx va,0,rd
+ addi rd,rd,16
+ bdnz LAlignedLoopBy16
+
+ b LCleanup // handle last 0-15 bytes, if any
+
+
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+// <><> L O N G R E V E R S E M O V E S <><>
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+
+ // Reverse moves. These involve overlapping operands, with the source
+ // lower in memory (lower addresses) than the destination. They must be
+ // done right-to-left, ie from high addresses down to low addresses.
+ // Throughout this code, we maintain rs and rd as pointers one byte past
+ // the end of the untransferred operands.
+ //
+ // The byte count is >=kShort and the following registers are already loaded:
+ //
+ // r6 - VMX mask at entry
+ // cr7 - beq if relatively aligned
+ //
+
+LongReverse:
+ add rd,rd,rc // update source/dest ptrs to be 1 byte past end
+ add rs,rs,rc
+ andi. r7,rd,0xF // r7 <- #bytes needed to move to align destination
+ sub rc,rc,r7 // adjust length while we wait
+ sub rs,rs,r7 // adjust ptrs by #bytes to xfer, also while we wait
+ sub rd,rd,r7
+ beq- LDest16AlignedReverse
+
+ // Align destination on a quadword. Note that we do NOT align on a cache
+ // block boundary for store gathering etc. Since all these operands
+ // overlap, many dest cache blocks will already be in the L1, so it's
+ // not clear that this would be a win.
+
+ mtxer r7 // load byte count
+ lswx r8,0,rs
+ stswx r8,0,rd
+
+ // Prepare for inner loop and start "dstst" stream. Frankly, it's not
+ // clear whether "dst" or "dstst" would be better. Somebody should
+ // measure. We use "dstst" because, being overlapped, at least some
+ // source cache blocks will also be stored into.
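+ //
+ // (The dst/dstst control word packs block size in quadwords into bits
+ // 3-7, block count into bits 8-15, and a signed byte stride into bits
+ // 16-31. As built below, it appears to prefetch one quadword per
+ // 32-byte step, walking backward through the source.)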
+
+LDest16AlignedReverse:
+ srwi. r12,rc,6 // r12 <- count of 64-byte chunks to move
+ rlwinm r0,rc,11,9,15 // position 32-byte-block count for dst
+ rlwinm r11,r12,0,26,31 // mask chunk count down to 0-63
+ li r9,-17 // r9 <- constant used to access 2nd quadword
+ oris r0,r0,0x0100 // set dst block size to 1 qw
+ li r10,-33 // r10<- constant used to access 3rd quadword
+ ori r0,r0,0xFFE0 // set dst stride to -32 bytes
+ li r8,-1 // r8<- constant used to access 1st quadword
+ dstst rs,r0,3 // start stream 3
+ subi r0,r11,1 // r11==0 ?
+ lvx va,r8,rs // prefetch 1st source quadword
+ rlwinm r7,rc,28,30,31 // r7 <- count of 16-byte chunks to move
+ lvsl vp,0,rs // get permute vector to right shift
+ cmpwi cr1,r7,0 // remember if any 16-byte chunks
+ beq- LNoChunksReverse // r12==0, so skip inner loop
+ rlwimi r11,r0,0,25,25 // if r11==0, then r11 <- 64
+ sub. r12,r12,r11 // adjust chunk count, set cr0
+ mtctr r11 // set up loop count
+ li r11,-49 // r11<- constant used to access 4th quadword
+ li r0,-64 // r0 <- used for several purposes
+ beq- cr7,LAlignedLoopBy64Reverse
+
+ // Inner loop for unaligned sources. We copy 64 bytes per iteration.
+
+LoopBy64Reverse:
+ lvx vb,r9,rs //(1) 2nd source quadword (1st already in va)
+ lvx vc,r10,rs //(2) 3rd quadword
+ lvx vd,r11,rs //(3) 4th
+ vperm vx,vb,va,vp //(3) vx <- 1st destination quadword
+ lvx va,rs,r0 //(4) get 1st qw of next 64-byte chunk (note r0 must be RB)
+ vperm vy,vc,vb,vp //(4) vy <- 2nd dest qw
+ stvx vx,r8,rd //(5)
+ vperm vz,vd,vc,vp //(5) vz <- 3rd destination quadword
+ stvx vy,r9,rd //(6)
+ vperm vx,va,vd,vp //(6) vx <- 4th qw
+ stvx vz,r10,rd //(7)
+ subi rs,rs,64 //(7)
+ stvx vx,r11,rd //(8)
+ subi rd,rd,64 //(8)
+ bdnz LoopBy64Reverse //(8)
+
+ // End of inner loop. Should we reprime dst stream and restart loop?
+ // This block is only executed when we're moving more than 4KB.
+ // It is usually folded out because cr0 is set in the loop prologue.
+
+ beq+ LNoChunksReverse // r12==0, ie no more chunks to move
+ lis r8,0x0440 // dst control: 64 4-qw blocks
+ add. r12,r12,r0 // set cr0 if more than 4KB remain to xfer
+ ori r8,r8,0xFFC0 // stride is -64 bytes
+ dstst rs,r8,3 // restart the prefetch stream
+ li r8,64 // inner loop count
+ mtctr r8 // initialize loop count to 64
+ li r8,-1 // restore qw1 offset for inner loop
+ b LoopBy64Reverse // restart inner loop, xfer another 4KB
+
+ // Fewer than 64 bytes remain to be moved.
+
+LNoChunksReverse: // r7 and cr1 are set with the number of QWs
+ andi. rc,rc,0xF // rc <- leftover bytes
+ beq- cr1,LCleanupReverse // r7==0, ie fewer than 16 bytes left
+ mtctr r7
+ beq- cr7,LAlignedLoopBy16Reverse
+
+LoopBy16Reverse:
+ lvx vb,r9,rs // vb <- 2nd source quadword
+ subi rs,rs,16
+ vperm vx,vb,va,vp // vx <- next destination quadword
+ vor va,vb,vb // va <- vb
+ stvx vx,r8,rd
+ subi rd,rd,16
+ bdnz LoopBy16Reverse
+
+ // Fewer than 16 bytes remain to be moved.
+
+LCleanupReverse: // rc and cr0 set with remaining byte count
+ lwz r6,-4(r1) // reload caller's VRSave from the red zone
+ mtspr VRSave,r6 // restore caller's live-register bitmask
+ beqlr // rc==0, ie no leftovers so done
+ neg r7,rc // get -(#bytes)
+ mtxer rc // byte count
+ lswx r8,r7,rs
+ stswx r8,r7,rd
+ blr
+
+
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+// <><> A L I G N E D L O N G R E V E R S E M O V E S <><>
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+
+ // Inner loop. We copy 64 bytes per iteration.
+
+LAlignedLoopBy64Reverse:
+ lvx va,r8,rs //(1)
+ lvx vb,r9,rs //(2)
+ lvx vc,r10,rs //(3)
+ lvx vd,r11,rs //(4)
+ subi rs,rs,64 //(4)
+ stvx va,r8,rd //(5)
+ stvx vb,r9,rd //(6)
+ stvx vc,r10,rd //(7)
+ stvx vd,r11,rd //(8)
+ subi rd,rd,64 //(8)
+ bdnz LAlignedLoopBy64Reverse //(8)
+
+ // End of inner loop. Loop for next 4KB iff any.
+
+ beq+ LNoChunksReverse // r12==0, ie no more chunks to move
+ lis r8,0x0440 // dst control: 64 4-qw blocks
+ add. r12,r12,r0 // r12 <- r12 - 64, set cr0
+ ori r8,r8,0xFFC0 // stride is -64 bytes
+ dstst rs,r8,3 // restart the prefetch stream
+ li r8,64 // inner loop count
+ mtctr r8 // initialize loop count to 64
+ li r8,-1 // restore qw1 offset for inner loop
+ b LAlignedLoopBy64Reverse
+
+ // Loop to copy leftover quadwords (1-3).
+
+LAlignedLoopBy16Reverse:
+ lvx va,r8,rs // get next qw
+ subi rs,rs,16
+ stvx va,r8,rd
+ subi rd,rd,16
+ bdnz LAlignedLoopBy16Reverse
+
+ b LCleanupReverse // handle up to 15 bytes in last qw