/*
 * Copyright (c) 1992-2001 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
#include <architecture/ppc/asm_help.h>

// =================================================================================================
// *** The easiest way to assemble things on Mac OS X is via "cc", so this uses #defines and such.
// =================================================================================================

// Keep track of whether we have Altivec
// This gets set in pthread_init()

        .globl  __cpu_has_altivec

        mr      r2,r4               // Since bcopy uses (src,dest,count), swap r3,r4

        mr      r2,r3               // Store dest ptr in r2 to preserve r3 on return
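
// The entry points above just reconcile two calling conventions. As a rough
// C-level sketch (not code in this file; these are the standard prototypes):
//
//      void  bcopy(const void *src, void *dst, size_t len);   /* (src,dest,count) */
//      void *memcpy(void *dst, const void *src, size_t len);  /* (dest,src,count) */
//
// so bcopy(s,d,n) behaves like memcpy(d,s,n), and the memcpy/memmove entries
// must also preserve the original destination pointer for the return value
// (which is why r3 is saved in r2 above).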
// Should we bother using Altivec?
// Determine whether we have Altivec enabled

        addis   r6, r6, ha16(__cpu_has_altivec - 1b)
        lwz     r6, lo16(__cpu_has_altivec - 1b)(r6)
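
// The ha16/lo16 arithmetic above computes the flag's address relative to a
// local label and loads it. A rough C sketch of the resulting dispatch (only
// __cpu_has_altivec is real; the helper names are made up, and the code
// below checks rc > 128 before reaching the vector path):
//
//      extern int __cpu_has_altivec;            /* set once in pthread_init() */
//
//      void scalar_block_moof(const char *src, char *dst, unsigned long len);  /* stand-ins */
//      void vector_block_moof(const char *src, char *dst, unsigned long len);
//
//      static void dispatch_copy(const char *src, char *dst, unsigned long len)
//      {
//          if (__cpu_has_altivec && len > 128)  /* vector path only pays off for big moves */
//              vector_block_moof(src, dst, len);
//          else
//              scalar_block_moof(src, dst, len);
//      }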
// =================================================================================================

// *****************************************
// * S c a l a r B l o c k M o o f D a t a *
// *****************************************

// This is the scalar (non-AltiVec) version of BlockMoofData.

// void ScalarBlockMoofData (ptr sou, ptr dest, long len)
// void ScalarBlockMoofDataUncached (ptr sou, ptr dest, long len)

// Calling Sequence: r3 = source pointer
//                   r4 = destination pointer
//                   r5 = length in bytes

// Uses: all volatile registers.

        cmplwi  cr7,rc,32           // length <= 32 bytes?
        cmplw   cr6,rd,rs           // up or down?
        mr.     r0,rc               // copy to r0 for MoveShort, and test for negative
        bgt     cr7,Lbm1            // skip if count > 32

// Handle short moves (<=32 bytes.)

        beq     cr7,LMove32         // special case 32-byte blocks
        blt     cr6,LMoveDownShort  // move down in memory and return
        add     rs,rs,rc            // moving up (right-to-left), so adjust pointers
        b       LMoveUpShort        // move up in memory and return

// Handle long moves (>32 bytes.)

        beqlr   cr6                 // rs==rd, so nothing to move
        bltlr   cr0                 // length<0, so ignore call and return
        mflr    r12                 // save return address
        bge     cr6,Lbm2            // rd>=rs, so move up

// Long moves down (left-to-right.)

        neg     r6,rd               // start to 32-byte-align destination
        andi.   r0,r6,0x1F          // r0 <- bytes to move to align destination
        bnel    LMoveDownShort      // align destination if necessary
        bl      LMoveDownLong       // move 32-byte chunks down
        andi.   r0,rc,0x1F          // done?
        mtlr    r12                 // restore caller's return address
        bne     LMoveDownShort      // move trailing leftover bytes and done
        blr                         // no leftovers, so done

// Long moves up (right-to-left.)

        add     rs,rs,rc            // moving up (right-to-left), so adjust pointers
        andi.   r0,rd,0x1F          // r0 <- bytes to move to align destination
        bnel    LMoveUpShort        // align destination if necessary
        bl      LMoveUpLong         // move 32-byte chunks up
        andi.   r0,rc,0x1F          // done?
        mtlr    r12                 // restore caller's return address
        bne     LMoveUpShort        // move trailing leftover bytes and done
        blr                         // no leftovers, so done
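
// Taken together, the scalar dispatch above works like this rough C sketch
// (helper names are made up; the real code keeps everything in registers and
// lets the MoveShort/MoveLong subroutines update rs, rd, and rc themselves):
//
//      void move_short(const char *src, char *dst, long n);    /* stand-ins for the  */
//      void move_chunks(const char *src, char *dst, long n);   /* subroutines below  */
//
//      void scalar_block_moof(const char *src, char *dst, long len)
//      {
//          if (len <= 0 || dst == src)
//              return;
//          if (len <= 32) {                           /* short and 32-byte cases */
//              move_short(src, dst, len);
//              return;
//          }
//          /* long case, moving down (dst < src); moving up is the mirror image */
//          long head = (0 - (unsigned long)dst) & 31; /* bytes to 32-byte-align dst */
//          move_short(src, dst, head);   src += head;  dst += head;  len -= head;
//          long body = len & ~31L;                    /* whole 32-byte chunks */
//          move_chunks(src, dst, body);  src += body;  dst += body;
//          move_short(src, dst, len & 31);            /* 0-31 trailing bytes */
//      }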
// Special case subroutine to move a 32-byte block. MoveDownShort and
// MoveUpShort only handle 0..31 bytes, and we believe 32 bytes is too
// common a case to send it through the general purpose long-block code.
// Since it moves both up and down, we must load all 32 bytes before
// storing any of them.

// Calling Sequence: rs = source ptr
//                   rd = destination ptr
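
// Because the operands may overlap in either direction, all 32 bytes must be
// loaded before any are stored. In rough C terms (sketch only):
//
//      void move32(const unsigned char *src, unsigned char *dst)
//      {
//          unsigned char tmp[32];
//          for (int i = 0; i < 32; i++)  tmp[i] = src[i];   /* load everything first */
//          for (int i = 0; i < 32; i++)  dst[i] = tmp[i];   /* then store everything */
//      }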
// *************************
// * M o v e U p S h o r t *
// *************************

// Subroutine called to move <32 bytes up in memory (ie, right-to-left).

// Entry conditions: rs = last byte moved from source (right-to-left)
//                   rd = last byte moved into destination
//                   r0 = #bytes to move (0..31)

// Exit conditions:  rs = updated source ptr
//                   rd = updated destination ptr
//                   rc = decremented by #bytes moved

// Uses: r0,r6,r7,r8,cr7.

        andi.   r6,r0,0x10          // test 0x10 bit in length
        mtcrf   0x1,r0              // move count to cr7 so we can test bits
        sub     rc,rc,r0            // decrement count of bytes remaining to be moved
        beq     Lmus1               // skip if 0x10 bit in length is 0
        lwzu    r0,-16(rs)          // set, so copy up 16 bytes

        bf      28,Lmus2            // test 0x08 bit

        bf      29,Lmus3            // test 0x4 bit

        bf      30,Lmus4            // test 0x2 bit

        bflr    31                  // test 0x1 bit, return if 0

// *****************************
// * M o v e D o w n S h o r t *
// *****************************

// Subroutine called to move <32 bytes down in memory (ie, left-to-right).

// Entry conditions: rs = source pointer
//                   rd = destination pointer
//                   r0 = #bytes to move (0..31)

// Exit conditions:  rs = ptr to 1st byte not moved
//                   rd = ptr to 1st byte not moved
//                   rc = decremented by #bytes moved

// Uses: r0,r6,r7,r8,cr7.

        andi.   r6,r0,0x10          // test 0x10 bit in length
        mtcrf   0x1,r0              // move count to cr7 so we can test bits
        sub     rc,rc,r0            // decrement count of bytes remaining to be moved
        beq     Lmds1               // skip if 0x10 bit in length is 0
        lwz     r0,0(rs)            // set, so copy 16 bytes

        bf      28,Lmds2            // test 0x08 bit

        bf      29,Lmds3            // test 0x4 bit

        bf      30,Lmds4            // test 0x2 bit

        bflr    31                  // test 0x1 bit, return if 0
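
// Both short-move routines decompose the 0-31 byte count into its binary
// components: an optional 16-byte move, then 8, 4, 2, and 1, selected by the
// bits of the count (which is why the count is copied into cr7 above, so each
// bit can be branch-tested). A rough C sketch of the move-down flavor; the
// move-up flavor is the mirror image, working back from the high addresses:
//
//      void copy_bytes(char *dst, const char *src, int n);   /* stand-in for the inline
//                                                               word/halfword/byte moves */
//
//      void move_short_down(const char *src, char *dst, unsigned long n)  /* n < 32 */
//      {
//          if (n & 16) { copy_bytes(dst, src, 16);  src += 16;  dst += 16; }
//          if (n & 8)  { copy_bytes(dst, src, 8);   src += 8;   dst += 8;  }
//          if (n & 4)  { copy_bytes(dst, src, 4);   src += 4;   dst += 4;  }
//          if (n & 2)  { copy_bytes(dst, src, 2);   src += 2;   dst += 2;  }
//          if (n & 1)  { *dst = *src; }
//      }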
// ***********************
// * M o v e U p L o n g *
// ***********************

// Subroutine to move 32-byte chunks of memory up (ie, right-to-left.)
// The destination is known to be 32-byte aligned, but the source is
// *not* necessarily aligned.

// Entry conditions: rs = last byte moved from source (right-to-left)
//                   rd = last byte moved into destination
//                   rc = count of bytes to move
//                   cr = crCached set iff destination is cacheable

// Exit conditions:  rs = updated source ptr
//                   rd = updated destination ptr
//                   rc = low order 8 bits of count of bytes to move

// Uses: r0,r5-r11,fr0-fr3,ctr,cr0,cr6,cr7.

        srwi.   r11,rc,5            // r11 <- #32 byte chunks to move
        mtctr   r11                 // prepare loop count
        beqlr                       // return if no chunks to move
        andi.   r0,rs,7             // is source at least doubleword aligned?
        beq     Lmup3               // yes, can optimize this case
        mtcrf   0x1,rc              // save low bits of count
        mtcrf   0x2,rc              // (one cr at a time, as 604 prefers)

Lmup1:                              // loop over each 32-byte-chunk
        subi    rd,rd,32            // prepare destination address for 'dcbz'

        mfcr    rc                  // restore low bits of count
        blr                         // return to caller

// Aligned operands, so use d.p. floating point registers to move data.

        subi    rd,rd,32            // prepare destination address for 'dcbz'

        blr                         // return to caller

// ***************************
// * M o v e D o w n L o n g *
// ***************************

// Subroutine to move 32-byte chunks of memory down (ie, left-to-right.)
// The destination is known to be 32-byte aligned, but the source is
// *not* necessarily aligned.

// Entry conditions: rs = source ptr (next byte to move)
//                   rd = dest ptr (next byte to move into)
//                   rc = count of bytes to move
//                   cr = crCached set iff destination is cacheable

// Exit conditions:  rs = updated source ptr
//                   rd = updated destination ptr
//                   rc = low order 8 bits of count of bytes to move

// Uses: r0,r5-r11,fr0-fr3,ctr,cr0,cr6,cr7.

        srwi.   r11,rc,5            // r11 <- #32 byte chunks to move
        mtctr   r11                 // prepare loop count
        beqlr                       // return if no chunks to move
        andi.   r0,rs,7             // is source at least doubleword aligned?
        beq     Lmdown3             // yes, can optimize this case
        mtcrf   0x1,rc              // save low 8 bits of count
        mtcrf   0x2,rc              // (one cr at a time, as 604 prefers)

Lmdown1:                            // loop over each 32-byte-chunk

        mfcr    rc                  // restore low bits of count
        blr                         // return to caller

// Aligned operands, so use d.p. floating point registers to move data.

        blr                         // return to caller
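
// Both chunk movers transfer 32 bytes per iteration into a destination that
// is already 32-byte aligned; when the source happens to be doubleword
// aligned they switch to floating-point doubles (4 loads/stores per chunk
// instead of 8). A rough C sketch of the move-down flavor (the real code also
// uses 'dcbz' to establish each destination cache block, and relies on the
// PPC tolerating unaligned word loads):
//
//      void move_chunks_down(const char *src, char *dst, unsigned long len)
//      {
//          unsigned long chunks = len >> 5;              /* 32-byte chunks   */
//          int dw_aligned = (((unsigned long)src & 7) == 0);
//          while (chunks--) {
//              if (dw_aligned)                           /* lfd/stfd path    */
//                  for (int i = 0; i < 4; i++)
//                      ((double *)dst)[i] = ((const double *)src)[i];
//              else                                      /* lwz/stw path     */
//                  for (int i = 0; i < 8; i++)
//                      ((unsigned *)dst)[i] = ((const unsigned *)src)[i];
//              src += 32;  dst += 32;
//          }
//      }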
// Register use conventions are as follows:

// r6 - copy of VMX SPR at entry
// r8 - constant -1 (also temp and a string op buffer)
// r9 - constant 16 or -17 (also temp and a string op buffer)
// r10- constant 32 or -33 (also temp and a string op buffer)
// r11- constant 48 or -49 (also temp and a string op buffer)
// r12- chunk count ("c") in long moves

// v0 - vp - permute vector
// v1 - va - 1st quadword of source
// v2 - vb - 2nd quadword of source
// v3 - vc - 3rd quadword of source
// v4 - vd - 4th quadword of source

// kShort should be the crossover point where the long algorithm is faster than the short.
// WARNING: kShort must be >= 64

// Yes, I know, we just checked rc > 128 to get here...

        cmpwi   cr1,rc,kShort       //(1) too short to bother using vector regs?
        sub.    r0,rd,rs            //(1) must move reverse if (rd-rs)<rc
        dcbt    0,rs                //(2) prefetch first source block
        cmplw   cr6,r0,rc           //(2) set cr6 blt iff we must move reverse
        beqlr-                      //(2) done if src==dest
        srawi.  r9,rc,4             //(3) r9 <- quadwords to move, test for zero
        or      r8,rs,rd            //(3) start to check for word alignment
        dcbtst  0,rd                //(4) prefetch first destination block
        rlwinm  r8,r8,0,30,31       //(4) r8 is zero if word aligned
        bgt-    cr1,LMoveLong       //(4) handle long operands
        cmpwi   cr1,r8,0            //(5) word aligned?
        rlwinm  r7,rc,0,28,31       //(5) r7 <- leftover bytes to move after quadwords
        bltlr-                      //(5) done if negative count
        blt-    cr6,LShortReverse   //(5) handle reverse moves
        cmpwi   cr7,r7,0            //(6) leftover bytes?
        beq-    Leftovers           //(6) r9==0, so no quadwords to move
        mtctr   r9                  //(7) set up for quadword loop
        bne-    cr1,LUnalignedLoop  //(7) not word aligned (less common than word aligned)
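
// The prologue above boils down to a handful of tests. As a rough C sketch
// (kShort is the real crossover constant; the helper names are made up):
//
//      void long_move(const char *rs, char *rd, long rc, int reverse);              /* stand-ins */
//      void short_move(const char *rs, char *rd, long rc, int reverse, int word_aligned);
//
//      void vec_block_moof(const char *rs, char *rd, long rc)
//      {
//          if (rc < 0 || rd == rs)
//              return;
//          /* must copy in reverse iff the destination overlaps just above the source */
//          int reverse = ((unsigned long)rd - (unsigned long)rs) < (unsigned long)rc;
//          int word_aligned = ((((unsigned long)rs | (unsigned long)rd) & 3) == 0);
//          if (rc > kShort)
//              long_move(rs, rd, rc, reverse);                  /* vector registers    */
//          else
//              short_move(rs, rd, rc, reverse, word_aligned);   /* lfd/stfd or lwz/stw */
//      }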
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
// <><>                S H O R T   O P E R A N D S                                 <><>
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

LAlignedLoop:                       // word aligned operands (the common case)

        bdnz    LAlignedLoop        //(4)

        beqlr-  cr7                 //(8) done if r7==0, ie no leftover bytes
        mtxer   r7                  //(9) count of bytes to move (1-15)

LUnalignedLoop:                     // not word aligned, cannot use lfd/stfd

        bdnz    LUnalignedLoop      //(8)

// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
// <><>                S H O R T   R E V E R S E   M O V E S                       <><>
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

// cr0 & r9 <- #doublewords to move (>=0)
// cr1      <- beq if word aligned
// r7       <- #leftover bytes to move (0-15)

        cmpwi   cr7,r7,0            // leftover bytes?
        add     rs,rs,rc            // point 1 past end of string for reverse moves
        beq-    LeftoversReverse    // r9==0, ie no words to move
        mtctr   r9                  // set up for quadword loop
        bne-    cr1,LUnalignedLoopReverse

LAlignedLoopReverse:                // word aligned, so use lfd/stfd

        bdnz    LAlignedLoopReverse

        beqlr-  cr7                 // done if r7==0, ie no leftover bytes
        mtxer   r7                  // count of bytes to move (1-15)
        neg     r7,r7               // index back by #bytes

LUnalignedLoopReverse:              // not word aligned, cannot use lfd/stfd

        bdnz    LUnalignedLoopReverse
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
// <><>                L O N G   O P E R A N D S                                   <><>
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

// cr6 set (blt) if must move reverse

        mfspr   r6,VRSave           //(5) save caller's VMX mask register
        stw     r6,-4(r1)           //    use CR save area so we can use r6 later
        neg     r8,rd               //(5) start to compute #bytes to fill in 1st dest quadword
        rlwinm  r0,r0,0,28,31       //(6) start to determine relative alignment
        andi.   r7,r8,0xF           //(6) r7 <- #bytes to fill in 1st dest quadword
        cmpwi   cr7,r0,0            //(7) relatively aligned? (ie, 16 bytes apart?)
        oris    r9,r6,0xFF00        //(7) light bits for regs we use (v0-v7)
        mtspr   VRSave,r9           //(8) update live register bitmask
        blt-    cr6,LongReverse     //(8) must move reverse direction
        sub     rc,rc,r7            //(9) adjust length while we wait
        beq-    LDest16Aligned      //(9) r7==0, ie destination already quadword aligned

// Align destination on a quadword.

        mtxer   r7                  //(10) set up byte count (1-15)
        lswx    r8,0,rs             //     load into r8-r11
        stswx   r8,0,rd             //     store r8-r11 (measured latency on arthur is 7.2 cycles)
        add     rd,rd,r7            //(18) adjust ptrs

// Begin preparation for inner loop and "dst" stream.

        andi.   r0,rd,0x10          //(19) is destination cache-block aligned?
        li      r9,16               //(19) r9 <- constant used to access 2nd quadword
        li      r10,32              //(20) r10<- constant used to access 3rd quadword
        beq-    cr7,LAligned        //(20) handle relatively aligned operands
        lvx     va,0,rs             //(20) prefetch 1st source quadword
        li      r11,48              //(21) r11<- constant used to access 4th quadword
        lvsl    vp,0,rs             //(21) get permute vector to left shift
        beq     LDest32Aligned      //(22) destination already cache-block aligned

// Copy 16 bytes to align destination on 32-byte (cache block) boundary
// to maximize store gathering.

        lvx     vb,r9,rs            //(23) get 2nd source qw
        subi    rc,rc,16            //(23) adjust count
        addi    rs,rs,16            //(24) adjust source ptr
        vperm   vx,va,vb,vp         //(25) vx <- 1st destination qw
        vor     va,vb,vb            //(25) va <- vb
        stvx    vx,0,rd             //(26) assuming store Q deep enough to avoid latency
        addi    rd,rd,16            //(26) adjust dest ptr

// Destination 32-byte aligned, source alignment unknown.

        srwi.   r12,rc,6            //(27) r12<- count of 64-byte chunks to move
        rlwinm  r7,rc,28,30,31      //(27) r7 <- count of 16-byte chunks to move
        cmpwi   cr1,r7,0            //(28) remember if any 16-byte chunks
        rlwinm  r8,r12,0,26,31      //(29) mask chunk count down to 0-63
        subi    r0,r8,1             //(30) r8==0?
        beq-    LNoChunks           //(30) r12==0, ie no chunks to move
        rlwimi  r8,r0,0,25,25       //(31) if r8==0, then r8 <- 64
        li      r0,64               //(31) r0 <- used to get 1st quadword of next chunk
        sub.    r12,r12,r8          //(32) adjust chunk count, set cr0
        mtctr   r8                  //(32) set up loop count
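
// The chunk bookkeeping above, restated as a rough C sketch: 64-byte chunks
// are processed in batches of at most 64 (4KB) so the "dst" prefetch stream
// can be restarted between batches. The rlwinm/subi/rlwimi trick is just
// "treat a zero low-6-bit count as a full batch of 64". The copy_* names are
// stand-ins for the loops below, not routines in this file:
//
//      unsigned long chunks = rc >> 6;            /* 64-byte chunks (r12)           */
//      unsigned long qws    = (rc >> 4) & 3;      /* trailing 16-byte chunks (r7)   */
//      while (chunks) {
//          unsigned long batch = chunks & 63;     /* low 6 bits of the count ...    */
//          if (batch == 0)
//              batch = 64;                        /* ... zero means a full batch    */
//          chunks -= batch;
//          copy_64byte_chunks(batch);             /* the LoopBy64 inner loop        */
//          /* reprime the dst stream here before starting on the next 4KB */
//      }
//      copy_16byte_chunks(qws);                   /* then 0-3 quadwords             */
//      copy_leftover_bytes(rc & 15);              /* then 0-15 bytes via lswx/stswx */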
// Inner loop for unaligned sources. We copy 64 bytes per iteration.
// We loop at most 64 times, then reprime the "dst" and loop again for
// the next 4KB. This loop is tuned to keep the CPU flat out, which
// means we need to execute a lvx or stvx every cycle.

        lvx     vb,r9,rs            //(1) 2nd source quadword (1st already in va)
        lvx     vc,r10,rs           //(2) 3rd
        lvx     vd,r11,rs           //(3) 4th
        vperm   vx,va,vb,vp         //(3) vx <- 1st destination quadword
        lvx     va,rs,r0            //(4) get 1st qw of next 64-byte chunk (r0 must be RB!)
        vperm   vy,vb,vc,vp         //(4) vy <- 2nd dest qw
        vperm   vz,vc,vd,vp         //(5) vz <- 3rd dest qw
        vperm   vx,vd,va,vp         //(6) vx <- 4th

// End of inner loop. Should we reprime dst stream and restart loop?
// This block is only executed when we're moving more than 4KB.
// It is usually folded out because cr0 is set in the loop prologue.

        beq+    LNoChunks           // r12==0, ie no more chunks to move
        sub.    r12,r12,r0          // set cr0 if more than 4KB remain to xfer
        mtctr   r0                  // initialize loop count to 64
        b       LoopBy64            // restart inner loop, xfer another 4KB
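
// The loop above is the classic lvsl/vperm technique for misaligned sources:
// lvx ignores the low four address bits, so each pair of aligned quadwords is
// merged into one correctly shifted destination quadword with vperm. A rough
// sketch using the C AltiVec intrinsics (assumes <altivec.h> and an AltiVec-
// enabled compiler; unlike the real code it may read up to 15 bytes past the
// end of the source and does no prefetching or tail handling):
//
//      #include <altivec.h>
//
//      /* n is a multiple of 16, dst is 16-byte aligned, src is arbitrary */
//      static void vperm_copy(unsigned char *dst, const unsigned char *src, unsigned long n)
//      {
//          vector unsigned char vp = vec_lvsl(0, src);          /* permute control */
//          vector unsigned char va = vec_ld(0, src);            /* 1st aligned qw  */
//          for (unsigned long i = 0; i < n; i += 16) {
//              vector unsigned char vb = vec_ld(i + 16, src);   /* next aligned qw */
//              vec_st(vec_perm(va, vb, vp), i, dst);            /* shifted qw out  */
//              va = vb;
//          }
//      }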
// Fewer than 64 bytes remain to be moved.

LNoChunks:                          // r7 and cr1 are set with the number of QWs
        andi.   rc,rc,0xF           //(33) rc <- leftover bytes
        beq-    cr1,LCleanup        //(33) r7==0, ie fewer than 16 bytes remaining
        mtctr   r7                  //(34) we will loop over 1-3 QWs

        lvx     vb,r9,rs            //(1) vb <- 2nd source quadword

        vperm   vx,va,vb,vp         //(3) vx <- next destination quadword
        vor     va,vb,vb            //(3) va <- vb
        stvx    vx,0,rd             //(4) assuming store Q is deep enough to mask latency

// Move remaining bytes in last quadword. rc and cr0 have the count.

        lwz     r6,-4(r1)           //     load VRSave from CR save area
        mtspr   VRSave,r6           //(35) restore caller's live-register bitmask
        beqlr                       //(36) rc==0, ie no leftovers, so done
        mtxer   rc                  //(37) load byte count (1-15)

// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
// <><>                L O N G   A L I G N E D   M O V E S                         <><>
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

// rs, rd <- both quadword aligned
// cr0    <- beq if dest is cache block (32-byte) aligned

        lvx     va,0,rs             // prefetch 1st source quadword
        li      r11,48              // r11<- constant used to access 4th quadword
        beq     LAligned32          // destination already cache-block aligned

// Copy 16 bytes to align destination on 32-byte (cache block) boundary
// to maximize store gathering.

        subi    rc,rc,16            // adjust count
        addi    rs,rs,16            // adjust source ptr
        stvx    va,0,rd             // assuming store Q deep enough to avoid latency
        addi    rd,rd,16            // adjust dest ptr

// Destination 32-byte aligned, source 16-byte aligned. Set up for inner loop.

        srwi.   r12,rc,6            // r12<- count of 64-byte chunks to move
        rlwinm  r7,rc,28,30,31      // r7 <- count of 16-byte chunks to move
        cmpwi   cr1,r7,0            // remember if any 16-byte chunks
        rlwinm  r8,r12,0,26,31      // mask chunk count down to 0-63
        subi    r0,r8,1             // r8==0?
        beq-    LAlignedNoChunks    // r12==0, ie no chunks to move
        rlwimi  r8,r0,0,25,25       // if r8==0, then r8 <- 64
        li      r0,64               // r0 <- used at end of loop
        sub.    r12,r12,r8          // adjust chunk count, set cr0
        mtctr   r8                  // set up loop count

// Inner loop for aligned sources. We copy 64 bytes per iteration.

        bdnz    LAlignedLoopBy64    //(8)

// End of inner loop. Loop again for next 4KB iff any.

        beq+    LAlignedNoChunks    // r12==0, ie no more chunks to move
        sub.    r12,r12,r0          // set cr0 if more than 4KB remain to xfer
        mtctr   r0                  // reinitialize loop count to 64
        b       LAlignedLoopBy64    // restart inner loop, xfer another 4KB

// Fewer than 64 bytes remain to be moved.

LAlignedNoChunks:                   // r7 and cr1 are set with the number of QWs
        andi.   rc,rc,0xF           // rc <- leftover bytes
        beq-    cr1,LCleanup        // r7==0, ie fewer than 16 bytes remaining
        mtctr   r7                  // we will loop over 1-3 QWs

        lvx     va,0,rs             // get next quadword

        bdnz    LAlignedLoopBy16

        b       LCleanup            // handle last 0-15 bytes, if any

// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
// <><>                L O N G   R E V E R S E   M O V E S                         <><>
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

// Reverse moves. These involve overlapping operands, with the source
// lower in memory (lower addresses) than the destination. They must be
// done right-to-left, ie from high addresses down to low addresses.
// Throughout this code, we maintain rs and rd as pointers one byte past
// the end of the untransferred operands.

// The byte count is >=kShort and the following registers are already loaded:

// r6  - VMX mask at entry
// cr7 - beq if relatively aligned

        add     rd,rd,rc            // update source/dest ptrs to be 1 byte past end

        andi.   r7,rd,0xF           // r7 <- #bytes needed to move to align destination
        sub     rc,rc,r7            // adjust length while we wait
        sub     rs,rs,r7            // adjust ptrs by #bytes to xfer, also while we wait

        beq-    LDest16AlignedReverse
// Align destination on a quadword. Note that we do NOT align on a cache
// block boundary for store gathering etc; since all these operands overlap,
// many dest cache blocks will already be in the L1, so it's not clear that
// this would be a win.

        mtxer   r7                  // load byte count

// Prepare for inner loop and start "dstst" stream. Frankly, it's not
// clear whether "dst" or "dstst" would be better; somebody should
// measure. We use "dstst" because, being overlapped, at least some
// source cache blocks will also be stored into.
LDest16AlignedReverse:
        srwi.   r12,rc,6            // r12 <- count of 64-byte chunks to move
        rlwinm  r0,rc,11,9,15       // position quadword count for dst
        rlwinm  r11,r12,0,26,31     // mask chunk count down to 0-63
        li      r9,-17              // r9 <- constant used to access 2nd quadword
        oris    r0,r0,0x0100        // set dst block size to 1 qw
        li      r10,-33             // r10<- constant used to access 3rd quadword
        ori     r0,r0,0xFFE0        // set dst stride to -16 bytes
        li      r8,-1               // r8 <- constant used to access 1st quadword
        dstst   rs,r0,3             // start the prefetch stream (tag 3)
        subi    r0,r11,1            // r11==0 ?
        lvx     va,r8,rs            // prefetch 1st source quadword
        rlwinm  r7,rc,28,30,31      // r7 <- count of 16-byte chunks to move
        lvsl    vp,0,rs             // get permute vector to right shift
        cmpwi   cr1,r7,0            // remember if any 16-byte chunks
        beq-    LNoChunksReverse    // r12==0, so skip inner loop
        rlwimi  r11,r0,0,25,25      // if r11==0, then r11 <- 64
        sub.    r12,r12,r11         // adjust chunk count, set cr0
        mtctr   r11                 // set up loop count
        li      r11,-49             // r11<- constant used to access 4th quadword
        li      r0,-64              // r0 <- used for several purposes
        beq-    cr7,LAlignedLoopBy64Reverse
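
// The control word assembled in r0 above packs three fields for dst/dstst:
// block size (in 16-byte units), block count, and a signed byte stride. A
// rough C sketch of that encoding for the small field values used here (the
// macro is hypothetical; vec_dstst is the intrinsic form of 'dstst'):
//
//      #include <altivec.h>
//
//      #define DST_CTRL(size, count, stride) \
//          (((size) << 24) | ((count) << 16) | ((stride) & 0xFFFF))
//
//      static void start_reverse_stream(const unsigned char *rs, int count)
//      {
//          /* 'count' one-quadword blocks, stepping back 16 bytes per block */
//          vec_dstst(rs, DST_CTRL(1, count, -16), 3);
//      }
//
//      /* the 4KB reprime further down uses 64 four-quadword blocks with a
//         -64-byte stride:  DST_CTRL(4, 64, -64) == 0x0440FFC0              */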
// Inner loop for unaligned sources. We copy 64 bytes per iteration.

        lvx     vb,r9,rs            //(1) 2nd source quadword (1st already in va)
        lvx     vc,r10,rs           //(2) 3rd quadword
        lvx     vd,r11,rs           //(3) 4th
        vperm   vx,vb,va,vp         //(3) vx <- 1st destination quadword
        lvx     va,rs,r0            //(4) get 1st qw of next 64-byte chunk (note r0 must be RB)
        vperm   vy,vc,vb,vp         //(4) vy <- 2nd dest qw
        vperm   vz,vd,vc,vp         //(5) vz <- 3rd destination quadword
        vperm   vx,va,vd,vp         //(6) vx <- 4th qw

        bdnz    LoopBy64Reverse     //(8)

// End of inner loop. Should we reprime dst stream and restart loop?
// This block is only executed when we're moving more than 4KB.
// It is usually folded out because cr0 is set in the loop prologue.

        beq+    LNoChunksReverse    // r12==0, ie no more chunks to move
        lis     r8,0x0440           // dst control: 64 4-qw blocks
        add.    r12,r12,r0          // set cr0 if more than 4KB remain to xfer
        ori     r8,r8,0xFFC0        // stride is -64 bytes
        dstst   rs,r8,3             // restart the prefetch stream
        li      r8,64               // inner loop count
        mtctr   r8                  // initialize loop count to 64
        li      r8,-1               // restore qw1 offset for inner loop
        b       LoopBy64Reverse     // restart inner loop, xfer another 4KB

// Fewer than 64 bytes remain to be moved.

LNoChunksReverse:                   // r7 and cr1 are set with the number of QWs
        andi.   rc,rc,0xF           // rc <- leftover bytes
        beq-    cr1,LCleanupReverse // r7==0, ie fewer than 16 bytes left

        beq-    cr7,LAlignedLoopBy16Reverse

        lvx     vb,r9,rs            // vb <- 2nd source quadword

        vperm   vx,vb,va,vp         // vx <- next destination quadword
        vor     va,vb,vb            // va <- vb
// Fewer than 16 bytes remain to be moved.
LCleanupReverse:                    // rc and cr0 set with remaining byte count
        lwz     r6,-4(r1)           // load VRSave from CR save area
        mtspr   VRSave,r6           // restore caller's live-register bitmask
        beqlr                       // rc==0, ie no leftovers so done
        neg     r7,rc               // get -(#bytes)
        mtxer   rc                  // byte count

// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
// <><>                A L I G N E D   L O N G   R E V E R S E   M O V E S         <><>
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

// Inner loop. We copy 64 bytes per iteration.

LAlignedLoopBy64Reverse:

        bdnz    LAlignedLoopBy64Reverse //(8)

// End of inner loop. Loop for next 4KB iff any.

        beq+    LNoChunksReverse    // r12==0, ie no more chunks to move
        lis     r8,0x0440           // dst control: 64 4-qw blocks
        add.    r12,r12,r0          // r12 <- r12 - 64, set cr0
        ori     r8,r8,0xFFC0        // stride is -64 bytes
        dstst   rs,r8,3             // restart the prefetch stream
        li      r8,64               // inner loop count
        mtctr   r8                  // initialize loop count to 64
        li      r8,-1               // restore qw1 offset for inner loop
        b       LAlignedLoopBy64Reverse

// Loop to copy leftover quadwords (1-3).

LAlignedLoopBy16Reverse:
        lvx     va,r8,rs            // get next qw

        bdnz    LAlignedLoopBy16Reverse

        b       LCleanupReverse     // handle up to 15 bytes in last qw