/*
 * Copyright (c) 1992-2001 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
#include <architecture/ppc/asm_help.h>

// =================================================================================================
// *** The easiest way to assemble things on Mac OS X is via "cc", so this uses #defines and such.
// =================================================================================================

// Keep track of whether we have Altivec
// This gets set in pthread_init()

        .data
        .align  2
        .globl  __cpu_has_altivec
__cpu_has_altivec:
        .long   0

        .text
        .align  2
        .globl  _bcopy
        .globl  _memcpy
        .globl  _memmove

_bcopy:
        mr      r2,r4                   // Since bcopy uses (src,dest,count), swap r3,r4
        mr      r4,r3
        mr      r3,r2
_memcpy:
_memmove:
        mr      r2,r3                   // Store dest ptr in r2 to preserve r3 on return

// ------------------
// Standard registers

#define rs r4
#define rd r2
#define rc r5

// Should we bother using Altivec?

        cmpwi   r5, 128
        blt+    LScalar

// Determine whether we have Altivec enabled

        mflr    r0
        bcl     20,31,1f
1:
        mflr    r6
        mtlr    r0
        addis   r6, r6, ha16(__cpu_has_altivec - 1b)
        lwz     r6, lo16(__cpu_has_altivec - 1b)(r6)
        cmpwi   r6, 0
        bne+    LAltivec

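// Note on the sequence above: "bcl 20,31,1f" is the usual PIC idiom for reading the
// current instruction address; it branches to the very next instruction while leaving
// that instruction's address in LR, which "mflr r6" captures so __cpu_has_altivec can
// be addressed PC-relative via ha16/lo16.  In rough C terms (an illustrative sketch,
// not part of this file), the dispatch performed so far is:
//
//      void *memmove(void *dst, const void *src, size_t len)
//      {
//          if (len < 128 || !__cpu_has_altivec)
//              scalar_copy(dst, src, len);     /* the LScalar path below      */
//          else
//              vector_copy(dst, src, len);     /* the LAltivec path below     */
//          return dst;                         /* r3 (dest) is preserved      */
//      }
//
// where scalar_copy/vector_copy are just names for the two code paths that follow,
// and bcopy simply enters with its (src,dest) arguments swapped into place first.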
// =================================================================================================

// *****************************************
// * S c a l a r   B l o c k M o o f D a t a *
// *****************************************
//
// This is the scalar (non-AltiVec) version of BlockMoofData.
//
// void ScalarBlockMoofData (ptr sou, ptr dest, long len)
// void ScalarBlockMoofDataUncached (ptr sou, ptr dest, long len)
//
//
// Calling Sequence: r3 = source pointer
//                   r4 = destination pointer
//                   r5 = length in bytes
//
// Uses: all volatile registers.

LScalar:
        cmplwi  cr7,rc,32               // length <= 32 bytes?
        cmplw   cr6,rd,rs               // up or down?
        mr.     r0,rc                   // copy to r0 for MoveShort, and test for negative
        bgt     cr7,Lbm1                // skip if count > 32

// Handle short moves (<=32 bytes.)

        beq     cr7,LMove32             // special case 32-byte blocks
        blt     cr6,LMoveDownShort      // move down in memory and return
        add     rs,rs,rc                // moving up (right-to-left), so adjust pointers
        add     rd,rd,rc
        b       LMoveUpShort            // move up in memory and return

// Handle long moves (>32 bytes.)

Lbm1:
        beqlr   cr6                     // rs==rd, so nothing to move
        bltlr   cr0                     // length<0, so ignore call and return
        mflr    r12                     // save return address
        bge     cr6,Lbm2                // rd>=rs, so move up

// Long moves down (left-to-right.)

        neg     r6,rd                   // start to 32-byte-align destination
        andi.   r0,r6,0x1F              // r0 <- bytes to move to align destination
        bnel    LMoveDownShort          // align destination if necessary
        bl      LMoveDownLong           // move 32-byte chunks down
        andi.   r0,rc,0x1F              // done?
        mtlr    r12                     // restore caller's return address
        bne     LMoveDownShort          // move trailing leftover bytes and done
        blr                             // no leftovers, so done

// Long moves up (right-to-left.)

Lbm2:
        add     rs,rs,rc                // moving up (right-to-left), so adjust pointers
        add     rd,rd,rc
        andi.   r0,rd,0x1F              // r0 <- bytes to move to align destination
        bnel    LMoveUpShort            // align destination if necessary
        bl      LMoveUpLong             // move 32-byte chunks up
        andi.   r0,rc,0x1F              // done?
        mtlr    r12                     // restore caller's return address
        bne     LMoveUpShort            // move trailing leftover bytes and done
        blr                             // no leftovers, so done

// ***************
// * M O V E 3 2 *
// ***************
//
// Special case subroutine to move a 32-byte block. MoveDownShort and
// MoveUpShort only handle 0..31 bytes, and we believe 32 bytes is too
// common a case to send it through the general purpose long-block code.
// Since it moves both up and down, we must load all 32 bytes before
// storing any.
//
// Calling Sequence: rs = source ptr
//                   rd = destination ptr
//
// Uses: r0,r5-r11.
//

LMove32:
        lwz     r0,0(rs)
        lwz     r5,4(rs)
        lwz     r6,8(rs)
        lwz     r7,12(rs)
        lwz     r8,16(rs)
        lwz     r9,20(rs)
        lwz     r10,24(rs)
        lwz     r11,28(rs)
        stw     r0,0(rd)
        stw     r5,4(rd)
        stw     r6,8(rd)
        stw     r7,12(rd)
        stw     r8,16(rd)
        stw     r9,20(rd)
        stw     r10,24(rd)
        stw     r11,28(rd)
        blr

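// The load-all-then-store-all pattern in LMove32 is what makes it safe for
// overlapping operands: all 32 source bytes are read into registers before any
// destination byte is written, so the direction of overlap does not matter.
// An illustrative C sketch of the same idea (not part of this file):
//
//      void move32(unsigned int *d, const unsigned int *s)
//      {
//          unsigned int t0 = s[0], t1 = s[1], t2 = s[2], t3 = s[3];
//          unsigned int t4 = s[4], t5 = s[5], t6 = s[6], t7 = s[7];
//          d[0] = t0; d[1] = t1; d[2] = t2; d[3] = t3;     /* stores only begin  */
//          d[4] = t4; d[5] = t5; d[6] = t6; d[7] = t7;     /* after all loads    */
//      }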

// *************************
// * M o v e U p S h o r t *
// *************************
//
// Subroutine called to move <32 bytes up in memory (ie, right-to-left).
//
// Entry conditions: rs = last byte moved from source (right-to-left)
//                   rd = last byte moved into destination
//                   r0 = #bytes to move (0..31)
//
// Exit conditions:  rs = updated source ptr
//                   rd = updated destination ptr
//                   rc = decremented by #bytes moved
//
// Uses: r0,r6,r7,r8,cr7.
//

LMoveUpShort:
        andi.   r6,r0,0x10              // test 0x10 bit in length
        mtcrf   0x1,r0                  // move count to cr7 so we can test bits
        sub     rc,rc,r0                // decrement count of bytes remaining to be moved
        beq     Lmus1                   // skip if 0x10 bit in length is 0
        lwzu    r0,-16(rs)              // set, so copy up 16 bytes
        lwz     r6,4(rs)
        lwz     r7,8(rs)
        lwz     r8,12(rs)
        stwu    r0,-16(rd)
        stw     r6,4(rd)
        stw     r7,8(rd)
        stw     r8,12(rd)

Lmus1:
        bf      28,Lmus2                // test 0x08 bit
        lwzu    r0,-8(rs)
        lwz     r6,4(rs)
        stwu    r0,-8(rd)
        stw     r6,4(rd)

Lmus2:
        bf      29,Lmus3                // test 0x4 bit
        lwzu    r0,-4(rs)
        stwu    r0,-4(rd)

Lmus3:
        bf      30,Lmus4                // test 0x2 bit
        lhzu    r0,-2(rs)
        sthu    r0,-2(rd)

Lmus4:
        bflr    31                      // test 0x1 bit, return if 0
        lbzu    r0,-1(rs)
        stbu    r0,-1(rd)
        blr

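// How the bit tests above work: "mtcrf 0x1,r0" copies the low 4 bits of the byte
// count into cr7, so CR bits 28..31 hold the 8, 4, 2 and 1 bits of the count and
// each "bf n,..." skips the corresponding partial copy (the 16 bit is tested
// separately with andi.).  A rough C sketch of the same binary decomposition,
// shown for the forward direction (illustrative only, not part of this file):
//
//      void copy_short(unsigned char *d, const unsigned char *s, unsigned n)  /* n < 32 */
//      {
//          if (n & 16) { memcpy(d, s, 16); d += 16; s += 16; }
//          if (n & 8)  { memcpy(d, s, 8);  d += 8;  s += 8;  }
//          if (n & 4)  { memcpy(d, s, 4);  d += 4;  s += 4;  }
//          if (n & 2)  { memcpy(d, s, 2);  d += 2;  s += 2;  }
//          if (n & 1)  { *d = *s; }
//      }
//
// LMoveUpShort does the same thing right-to-left, using the update forms
// (lwzu/stwu etc.) to walk the pointers backwards.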

// *****************************
// * M o v e D o w n S h o r t *
// *****************************
//
// Subroutine called to move <32 bytes down in memory (ie, left-to-right).
//
// Entry conditions: rs = source pointer
//                   rd = destination pointer
//                   r0 = #bytes to move (0..31)
//
// Exit conditions:  rs = ptr to 1st byte not moved
//                   rd = ptr to 1st byte not moved
//                   rc = decremented by #bytes moved
//
// Uses: r0,r6,r7,r8,cr7.
//

LMoveDownShort:
        andi.   r6,r0,0x10              // test 0x10 bit in length
        mtcrf   0x1,r0                  // move count to cr7 so we can test bits
        sub     rc,rc,r0                // decrement count of bytes remaining to be moved
        beq     Lmds1                   // skip if 0x10 bit in length is 0
        lwz     r0,0(rs)                // set, so copy up 16 bytes
        lwz     r6,4(rs)
        lwz     r7,8(rs)
        lwz     r8,12(rs)
        addi    rs,rs,16
        stw     r0,0(rd)
        stw     r6,4(rd)
        stw     r7,8(rd)
        stw     r8,12(rd)
        addi    rd,rd,16

Lmds1:
        bf      28,Lmds2                // test 0x08 bit
        lwz     r0,0(rs)
        lwz     r6,4(rs)
        addi    rs,rs,8
        stw     r0,0(rd)
        stw     r6,4(rd)
        addi    rd,rd,8

Lmds2:
        bf      29,Lmds3                // test 0x4 bit
        lwz     r0,0(rs)
        addi    rs,rs,4
        stw     r0,0(rd)
        addi    rd,rd,4

Lmds3:
        bf      30,Lmds4                // test 0x2 bit
        lhz     r0,0(rs)
        addi    rs,rs,2
        sth     r0,0(rd)
        addi    rd,rd,2

Lmds4:
        bflr    31                      // test 0x1 bit, return if 0
        lbz     r0,0(rs)
        addi    rs,rs,1
        stb     r0,0(rd)
        addi    rd,rd,1
        blr

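// LMoveDownShort is the mirror image of LMoveUpShort: the same 16/8/4/2/1 bit
// decomposition of the count (see the C sketch above), but walking the pointers
// forward with plain loads/stores plus explicit addi updates instead of the
// update-form instructions.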

// ***********************
// * M o v e U p L o n g *
// ***********************
//
// Subroutine to move 32-byte chunks of memory up (ie, right-to-left.)
// The destination is known to be 32-byte aligned, but the source is
// *not* necessarily aligned.
//
// Entry conditions: rs = last byte moved from source (right-to-left)
//                   rd = last byte moved into destination
//                   rc = count of bytes to move
//                   cr = crCached set iff destination is cacheable
//
// Exit conditions:  rs = updated source ptr
//                   rd = updated destination ptr
//                   rc = low order 8 bits of count of bytes to move
//
// Uses: r0,r5-r11,fr0-fr3,ctr,cr0,cr6,cr7.
//

LMoveUpLong:
        srwi.   r11,rc,5                // r11 <- #32 byte chunks to move
        mtctr   r11                     // prepare loop count
        beqlr                           // return if no chunks to move
        andi.   r0,rs,7                 // is source at least doubleword aligned?
        beq     Lmup3                   // yes, can optimize this case
        mtcrf   0x1,rc                  // save low bits of count
        mtcrf   0x2,rc                  // (one cr at a time, as 604 prefers)

Lmup1:                                  // loop over each 32-byte-chunk
        lwzu    r0,-32(rs)
        subi    rd,rd,32                // prepare destination address for 'dcbz'
        lwz     r5,4(rs)
        lwz     r6,8(rs)
        lwz     r7,12(rs)
        lwz     r8,16(rs)
        lwz     r9,20(rs)
        lwz     r10,24(rs)
        lwz     r11,28(rs)
        stw     r0,0(rd)
        stw     r5,4(rd)
        stw     r6,8(rd)
        stw     r7,12(rd)
        stw     r8,16(rd)
        stw     r9,20(rd)
        stw     r10,24(rd)
        stw     r11,28(rd)
        bdnz    Lmup1
        mfcr    rc                      // restore low bits of count
        blr                             // return to caller

// Aligned operands, so use d.p. floating point registers to move data.

Lmup3:
        lfdu    f0,-32(rs)
        subi    rd,rd,32                // prepare destination address for 'dcbz'
        lfd     f1,8(rs)
        lfd     f2,16(rs)
        lfd     f3,24(rs)
        stfd    f0,0(rd)
        stfd    f1,8(rd)
        stfd    f2,16(rd)
        stfd    f3,24(rd)
        bdnz    Lmup3
        blr                             // return to caller

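// Two details of LMoveUpLong worth spelling out:
//
//  * When the source is 8-byte aligned (the "andi. r0,rs,7" test), each chunk is
//    moved with four lfd/stfd pairs instead of eight lwz/stw pairs, halving the
//    number of memory operations; the FP registers are used purely as 64-bit
//    data buckets.
//  * In the unaligned path, the low 8 bits of rc are parked in cr6/cr7 with the
//    two mtcrf instructions and recovered afterwards with "mfcr rc", because the
//    loop body needs most of the GPRs; the caller only looks at rc mod 32 on return.
//
// A rough C sketch of the chunk loop (illustrative only; copy32 is just a
// placeholder name for the 32-byte body above):
//
//      /* move n/32 chunks of 32 bytes each, from high addresses down */
//      for (unsigned long i = n / 32; i > 0; i--) {
//          s -= 32;  d -= 32;
//          copy32(d, s);               /* eight words, or four doubles if aligned */
//      }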

// ***************************
// * M o v e D o w n L o n g *
// ***************************
//
// Subroutine to move 32-byte chunks of memory down (ie, left-to-right.)
// The destination is known to be 32-byte aligned, but the source is
// *not* necessarily aligned.
//
// Entry conditions: rs = source ptr (next byte to move)
//                   rd = dest ptr (next byte to move into)
//                   rc = count of bytes to move
//                   cr = crCached set iff destination is cacheable
//
// Exit conditions:  rs = updated source ptr
//                   rd = updated destination ptr
//                   rc = low order 8 bits of count of bytes to move
//
// Uses: r0,r5-r11,fr0-fr3,ctr,cr0,cr6,cr7.
//

LMoveDownLong:
        srwi.   r11,rc,5                // r11 <- #32 byte chunks to move
        mtctr   r11                     // prepare loop count
        beqlr                           // return if no chunks to move
        andi.   r0,rs,7                 // is source at least doubleword aligned?
        beq     Lmdown3                 // yes, can optimize this case
        mtcrf   0x1,rc                  // save low 8 bits of count
        mtcrf   0x2,rc                  // (one cr at a time, as 604 prefers)

Lmdown1:                                // loop over each 32-byte-chunk
        lwz     r0,0(rs)
        lwz     r5,4(rs)
        lwz     r6,8(rs)
        lwz     r7,12(rs)
        lwz     r8,16(rs)
        lwz     r9,20(rs)
        lwz     r10,24(rs)
        lwz     r11,28(rs)
        stw     r0,0(rd)
        stw     r5,4(rd)
        stw     r6,8(rd)
        stw     r7,12(rd)
        stw     r8,16(rd)
        stw     r9,20(rd)
        addi    rs,rs,32
        stw     r10,24(rd)
        stw     r11,28(rd)
        addi    rd,rd,32
        bdnz    Lmdown1
        mfcr    rc                      // restore low bits of count
        blr                             // return to caller

// Aligned operands, so use d.p. floating point registers to move data.

Lmdown3:
        lfd     f0,0(rs)
        lfd     f1,8(rs)
        lfd     f2,16(rs)
        lfd     f3,24(rs)
        addi    rs,rs,32
        stfd    f0,0(rd)
        stfd    f1,8(rd)
        stfd    f2,16(rd)
        stfd    f3,24(rd)
        addi    rd,rd,32
        bdnz    Lmdown3
        blr                             // return to caller

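// LMoveDownLong mirrors LMoveUpLong in the forward direction, with the same
// 8-byte source alignment check and the same lfd/stfd fast path.  Note that
// although a few comments above mention preparing the destination address for
// 'dcbz', no dcbz instruction is actually issued in this version of the code.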

//
// Register use conventions are as follows:
//
// r0  - temp
// r6  - copy of VMX SPR at entry
// r7  - temp
// r8  - constant -1 (also temp and a string op buffer)
// r9  - constant 16 or -17 (also temp and a string op buffer)
// r10 - constant 32 or -33 (also temp and a string op buffer)
// r11 - constant 48 or -49 (also temp and a string op buffer)
// r12 - chunk count ("c") in long moves
//
// v0 - vp - permute vector
// v1 - va - 1st quadword of source
// v2 - vb - 2nd quadword of source
// v3 - vc - 3rd quadword of source
// v4 - vd - 4th quadword of source
// v5 - vx - temp
// v6 - vy - temp
// v7 - vz - temp

#define vp v0
#define va v1
#define vb v2
#define vc v3
#define vd v4
#define vx v5
#define vy v6
#define vz v7

#define VRSave 256

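// VRSave is special purpose register 256; by convention it holds a bitmask of the
// vector registers that are live, with bit 0 (the most significant bit) standing
// for v0.  The AltiVec code below ORs 0xFF00 into the upper halfword to mark v0-v7
// as in use, and restores the caller's mask before returning.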
// kShort should be the crossover point where the long algorithm is faster than the short.
// WARNING: kShort must be >= 64

// Yes, I know, we just checked rc >= 128 to get here...

#define kShort 128
LAltivec:
        cmpwi   cr1,rc,kShort           //(1) too short to bother using vector regs?
        sub.    r0,rd,rs                //(1) must move reverse if (rd-rs)<rc
        dcbt    0,rs                    //(2) prefetch first source block
        cmplw   cr6,r0,rc               //(2) set cr6 blt iff we must move reverse
        beqlr-                          //(2) done if src==dest
        srawi.  r9,rc,4                 //(3) r9 <- quadwords to move, test for zero
        or      r8,rs,rd                //(3) start to check for word alignment
        dcbtst  0,rd                    //(4) prefetch first destination block
        rlwinm  r8,r8,0,30,31           //(4) r8 is zero if word aligned
        bgt-    cr1,LMoveLong           //(4) handle long operands
        cmpwi   cr1,r8,0                //(5) word aligned?
        rlwinm  r7,rc,0,28,31           //(5) r7 <- leftover bytes to move after quadwords
        bltlr-                          //(5) done if negative count
        blt-    cr6,LShortReverse       //(5) handle reverse moves
        cmpwi   cr7,r7,0                //(6) leftover bytes?
        beq-    Leftovers               //(6) r9==0, so no quadwords to move
        mtctr   r9                      //(7) set up for quadword loop
        bne-    cr1,LUnalignedLoop      //(7) not word aligned (less common than word aligned)

// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
// <><>                  S H O R T   O P E R A N D S                              <><>
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

LAlignedLoop:                           // word aligned operands (the common case)
        lfd     f0,0(rs)                //(1)
        lfd     f1,8(rs)                //(2)
        addi    rs,rs,16                //(2)
        stfd    f0,0(rd)                //(3)
        stfd    f1,8(rd)                //(4)
        addi    rd,rd,16                //(4)
        bdnz    LAlignedLoop            //(4)

Leftovers:
        beqlr-  cr7                     //(8) done if r7==0, ie no leftover bytes
        mtxer   r7                      //(9) count of bytes to move (1-15)
        lswx    r8,0,rs
        stswx   r8,0,rd
        blr                             //(17)
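// The lswx/stswx pair above is the PowerPC "load/store string word indexed"
// idiom: with the byte count (1-15 here) loaded into the low bits of XER, lswx
// reads that many bytes into consecutive registers starting at r8, and stswx
// writes them back out, regardless of alignment.  Roughly (illustrative C, not
// part of this file):
//
//      /* copy the trailing n bytes, 0 < n < 16 */
//      for (unsigned i = 0; i < n; i++)
//          rd[i] = rs[i];
//
// The string instructions are compact but can be slow (microcoded) on some
// implementations, which is why they are reserved for sub-quadword leftovers.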

LUnalignedLoop:                         // not word aligned, cannot use lfd/stfd
        lwz     r8,0(rs)                //(1)
        lwz     r9,4(rs)                //(2)
        lwz     r10,8(rs)               //(3)
        lwz     r11,12(rs)              //(4)
        addi    rs,rs,16                //(4)
        stw     r8,0(rd)                //(5)
        stw     r9,4(rd)                //(6)
        stw     r10,8(rd)               //(7)
        stw     r11,12(rd)              //(8)
        addi    rd,rd,16                //(8)
        bdnz    LUnalignedLoop          //(8)

        b       Leftovers

// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
// <><>           S H O R T   R E V E R S E   M O V E S                           <><>
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

// cr0 & r9 <- #quadwords to move (>=0)
// cr1 <- beq if word aligned
// r7 <- #leftover bytes to move (0-15)

LShortReverse:
        cmpwi   cr7,r7,0                // leftover bytes?
        add     rs,rs,rc                // point 1 past end of string for reverse moves
        add     rd,rd,rc
        beq-    LeftoversReverse        // r9==0, ie no words to move
        mtctr   r9                      // set up for quadword loop
        bne-    cr1,LUnalignedLoopReverse

LAlignedLoopReverse:                    // word aligned, so use lfd/stfd
        lfd     f0,-8(rs)
        lfdu    f1,-16(rs)
        stfd    f0,-8(rd)
        stfdu   f1,-16(rd)
        bdnz    LAlignedLoopReverse

LeftoversReverse:
        beqlr-  cr7                     // done if r7==0, ie no leftover bytes
        mtxer   r7                      // count of bytes to move (1-15)
        neg     r7,r7                   // index back by #bytes
        lswx    r8,r7,rs
        stswx   r8,r7,rd
        blr

LUnalignedLoopReverse:                  // not word aligned, cannot use lfd/stfd
        lwz     r8,-4(rs)
        lwz     r9,-8(rs)
        lwz     r10,-12(rs)
        lwzu    r11,-16(rs)
        stw     r8,-4(rd)
        stw     r9,-8(rd)
        stw     r10,-12(rd)
        stwu    r11,-16(rd)
        bdnz    LUnalignedLoopReverse

        b       LeftoversReverse

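// In the reverse short path, rs and rd are kept pointing one byte past the end of
// the untransferred data, so the quadword loops index backwards with negative
// offsets, and the leftover bytes are copied with lswx/stswx using a negative
// index register (r7 = -count), which lands exactly on the first byte of the
// remaining low-address fragment.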
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
// <><>                   L O N G   O P E R A N D S                               <><>
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

// cr6 set (blt) if must move reverse
// r0 <- (rd - rs)

LMoveLong:
        mfspr   r6,VRSave               //(5) save caller's VMX mask register
        stw     r6,-4(r1)               // use CR save area so we can use r6 later
        neg     r8,rd                   //(5) start to compute #bytes to fill in 1st dest quadword
        rlwinm  r0,r0,0,28,31           //(6) start to determine relative alignment
        andi.   r7,r8,0xF               //(6) r7 <- #bytes to fill in 1st dest quadword
        cmpwi   cr7,r0,0                //(7) relatively aligned? (ie, 16 bytes apart?)
        oris    r9,r6,0xFF00            //(7) light bits for regs we use (v0-v7)
        mtspr   VRSave,r9               //(8) update live register bitmask
        blt-    cr6,LongReverse         //(8) must move reverse direction
        sub     rc,rc,r7                //(9) adjust length while we wait
        beq-    LDest16Aligned          //(9) r7==0, ie destination already quadword aligned

// Align destination on a quadword.

        mtxer   r7                      //(10) set up byte count (1-15)
        lswx    r8,0,rs                 // load into r8-r11
        stswx   r8,0,rd                 // store r8-r11 (measured latency on arthur is 7.2 cycles)
        add     rd,rd,r7                //(18) adjust ptrs
        add     rs,rs,r7                //(18)

// Begin preparation for inner loop and "dst" stream.

LDest16Aligned:
        andi.   r0,rd,0x10              //(19) is destination cache-block aligned?
        li      r9,16                   //(19) r9 <- constant used to access 2nd quadword
        li      r10,32                  //(20) r10<- constant used to access 3rd quadword
        beq-    cr7,LAligned            //(20) handle relatively aligned operands
        lvx     va,0,rs                 //(20) prefetch 1st source quadword
        li      r11,48                  //(21) r11<- constant used to access 4th quadword
        lvsl    vp,0,rs                 //(21) get permute vector to left shift
        beq     LDest32Aligned          //(22) destination already cache-block aligned

// Copy 16 bytes to align destination on 32-byte (cache block) boundary
// to maximize store gathering.

        lvx     vb,r9,rs                //(23) get 2nd source qw
        subi    rc,rc,16                //(23) adjust count
        addi    rs,rs,16                //(24) adjust source ptr
        vperm   vx,va,vb,vp             //(25) vx <- 1st destination qw
        vor     va,vb,vb                //(25) va <- vb
        stvx    vx,0,rd                 //(26) assuming store Q deep enough to avoid latency
        addi    rd,rd,16                //(26) adjust dest ptr

// Destination 32-byte aligned, source alignment unknown.

LDest32Aligned:
        srwi.   r12,rc,6                //(27) r12<- count of 64-byte chunks to move
        rlwinm  r7,rc,28,30,31          //(27) r7 <- count of 16-byte chunks to move
        cmpwi   cr1,r7,0                //(28) remember if any 16-byte chunks
        rlwinm  r8,r12,0,26,31          //(29) mask chunk count down to 0-63
        subi    r0,r8,1                 //(30) r8==0?
        beq-    LNoChunks               //(30) r12==0, ie no chunks to move
        rlwimi  r8,r0,0,25,25           //(31) if r8==0, then r8 <- 64
        li      r0,64                   //(31) r0 <- used to get 1st quadword of next chunk
        sub.    r12,r12,r8              //(32) adjust chunk count, set cr0
        mtctr   r8                      //(32) set up loop count
        li      r8,96                   //SKP
        li      r6,128                  //SKP

// Inner loop for unaligned sources. We copy 64 bytes per iteration.
// We loop at most 64 times, then reprime the "dst" and loop again for
// the next 4KB. This loop is tuned to keep the CPU flat out, which
// means we need to execute a lvx or stvx every cycle.

LoopBy64:
        dcbt    rs,r8                   //SKP
        dcbt    rs,r6                   //SKP
        lvx     vb,r9,rs                //(1) 2nd source quadword (1st already in va)
        lvx     vc,r10,rs               //(2) 3rd
        lvx     vd,r11,rs               //(3) 4th
        vperm   vx,va,vb,vp             //(3) vx <- 1st destination quadword
        lvx     va,rs,r0                //(4) get 1st qw of next 64-byte chunk (r0 must be RB!)
        vperm   vy,vb,vc,vp             //(4) vy <- 2nd dest qw
        stvx    vx,0,rd                 //(5)
        vperm   vz,vc,vd,vp             //(5) vz <- 3rd dest qw
        stvx    vy,r9,rd                //(6)
        vperm   vx,vd,va,vp             //(6) vx <- 4th
        stvx    vz,r10,rd               //(7)
        addi    rs,rs,64                //(7)
        stvx    vx,r11,rd               //(8)
        addi    rd,rd,64                //(8)
        bdnz    LoopBy64                //(8)

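// This is the classic AltiVec software-alignment technique for a misaligned
// source: lvx always loads the 16-byte-aligned quadword containing the effective
// address, and vperm combines two adjacent source quadwords through the permute
// vector that lvsl built from the source's misalignment, yielding one properly
// aligned destination quadword.  In rough pseudo-C (illustrative only, not part
// of this file), with k = (uintptr_t)src & 15 and Q(i) the i-th aligned 16-byte
// block covering the source, viewed as a big-endian 128-bit value:
//
//      dest_qw[i] = (Q(i) << 8*k) | (Q(i+1) >> 8*(16-k));   /* == vperm(Q(i),Q(i+1),vp) */
//
// The two dcbt instructions (the //SKP lines) touch the cache blocks 96 and 128
// bytes ahead of the current source position, so the loads a few iterations out
// are already on their way.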
// End of inner loop. Should we reprime dst stream and restart loop?
// This block is only executed when we're moving more than 4KB.
// It is usually folded out because cr0 is set in the loop prologue.

        beq+    LNoChunks               // r12==0, ie no more chunks to move
        sub.    r12,r12,r0              // set cr0 if more than 4KB remain to xfer
        mtctr   r0                      // initialize loop count to 64
        b       LoopBy64                // restart inner loop, xfer another 4KB

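// Chunk-count bookkeeping for the loop above: r8 was set to (chunks mod 64), or to
// 64 if that remainder is zero, so the first pass through LoopBy64 disposes of the
// odd chunks; r12 then holds the number of full 64-chunk (4KB) batches still to go,
// and each time the ctr expires another 64 is subtracted from r12 and the loop is
// rerun until r12 reaches zero.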
// Fewer than 64 bytes remain to be moved.

LNoChunks:                              // r7 and cr1 are set with the number of QWs
        andi.   rc,rc,0xF               //(33) rc <- leftover bytes
        beq-    cr1,LCleanup            //(33) r7==0, ie fewer than 16 bytes remaining
        mtctr   r7                      //(34) we will loop over 1-3 QWs

LoopBy16:
        lvx     vb,r9,rs                //(1) vb <- 2nd source quadword
        addi    rs,rs,16                //(1)
        vperm   vx,va,vb,vp             //(3) vx <- next destination quadword
        vor     va,vb,vb                //(3) va <- vb
        stvx    vx,0,rd                 //(4) assuming store Q is deep enough to mask latency
        addi    rd,rd,16                //(4)
        bdnz    LoopBy16                //(4)

// Move remaining bytes in last quadword. rc and cr0 have the count.

LCleanup:
        lwz     r6,-4(r1)               // load VRSave from CR save area
        mtspr   VRSave,r6               //(35) restore caller's live-register bitmask
        beqlr                           //(36) rc==0, ie no leftovers, so done
        mtxer   rc                      //(37) load byte count (1-15)
        lswx    r8,0,rs
        stswx   r8,0,rd
        blr                             //(45)


// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
// <><>            L O N G   A L I G N E D   M O V E S                            <><>
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

// rs, rd <- both quadword aligned
// cr0 <- beq if dest is cache block (32-byte) aligned
// r9  <- 16
// r10 <- 32

LAligned:
        lvx     va,0,rs                 // prefetch 1st source quadword
        li      r11,48                  // r11<- constant used to access 4th quadword
        beq     LAligned32              // destination already cache-block aligned

// Copy 16 bytes to align destination on 32-byte (cache block) boundary
// to maximize store gathering.

        subi    rc,rc,16                // adjust count
        addi    rs,rs,16                // adjust source ptr
        stvx    va,0,rd                 // assuming store Q deep enough to avoid latency
        addi    rd,rd,16                // adjust dest ptr

// Destination 32-byte aligned, source 16-byte aligned. Set up for inner loop.

LAligned32:
        srwi.   r12,rc,6                // r12<- count of 64-byte chunks to move
        rlwinm  r7,rc,28,30,31          // r7 <- count of 16-byte chunks to move
        cmpwi   cr1,r7,0                // remember if any 16-byte chunks
        rlwinm  r8,r12,0,26,31          // mask chunk count down to 0-63
        subi    r0,r8,1                 // r8==0?
        beq-    LAlignedNoChunks        // r12==0, ie no chunks to move
        rlwimi  r8,r0,0,25,25           // if r8==0, then r8 <- 64
        li      r0,64                   // r0 <- used at end of loop
        sub.    r12,r12,r8              // adjust chunk count, set cr0
        mtctr   r8                      // set up loop count
        li      r8,96                   //SKP
        li      r6,128                  //SKP

// Inner loop for aligned sources. We copy 64 bytes per iteration.

LAlignedLoopBy64:
        dcbt    rs,r8                   //SKP
        dcbt    rs,r6                   //SKP
        lvx     va,0,rs                 //(1)
        lvx     vb,r9,rs                //(2)
        lvx     vc,r10,rs               //(3)
        lvx     vd,r11,rs               //(4)
        addi    rs,rs,64                //(4)
        stvx    va,0,rd                 //(5)
        stvx    vb,r9,rd                //(6)
        stvx    vc,r10,rd               //(7)
        stvx    vd,r11,rd               //(8)
        addi    rd,rd,64                //(8)
        bdnz    LAlignedLoopBy64        //(8)

// End of inner loop. Loop again for next 4KB iff any.

        beq+    LAlignedNoChunks        // r12==0, ie no more chunks to move
        sub.    r12,r12,r0              // set cr0 if more than 4KB remain to xfer
        mtctr   r0                      // reinitialize loop count to 64
        b       LAlignedLoopBy64        // restart inner loop, xfer another 4KB

// Fewer than 64 bytes remain to be moved.

LAlignedNoChunks:                       // r7 and cr1 are set with the number of QWs
        andi.   rc,rc,0xF               // rc <- leftover bytes
        beq-    cr1,LCleanup            // r7==0, ie fewer than 16 bytes remaining
        mtctr   r7                      // we will loop over 1-3 QWs

LAlignedLoopBy16:
        lvx     va,0,rs                 // get next quadword
        addi    rs,rs,16
        stvx    va,0,rd
        addi    rd,rd,16
        bdnz    LAlignedLoopBy16

        b       LCleanup                // handle last 0-15 bytes, if any

// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
// <><>            L O N G   R E V E R S E   M O V E S                            <><>
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

// Reverse moves. These involve overlapping operands, with the source
// lower in memory (lower addresses) than the destination. They must be
// done right-to-left, ie from high addresses down to low addresses.
// Throughout this code, we maintain rs and rd as pointers one byte past
// the end of the untransferred operands.
//
// The byte count is >=kShort and the following registers are already loaded:
//
// r6  - VMX mask at entry
// cr7 - beq if relatively aligned
//

LongReverse:
        add     rd,rd,rc                // update source/dest ptrs to be 1 byte past end
        add     rs,rs,rc
        andi.   r7,rd,0xF               // r7 <- #bytes needed to move to align destination
        sub     rc,rc,r7                // adjust length while we wait
        sub     rs,rs,r7                // adjust ptrs by #bytes to xfer, also while we wait
        sub     rd,rd,r7
        beq-    LDest16AlignedReverse

// Align destination on a quadword. Note that we do NOT align on a cache
// block boundary for store gathering etc.; since all these operands overlap,
// many dest cache blocks will already be in the L1, so it's not clear that
// this would be a win.

        mtxer   r7                      // load byte count
        lswx    r8,0,rs
        stswx   r8,0,rd

// Prepare for inner loop and start "dstst" stream. Frankly, it's not
// clear whether "dst" or "dstst" would be better; somebody should
// measure. We use "dstst" because, being overlapped, at least some
// source cache blocks will also be stored into.

LDest16AlignedReverse:
        srwi.   r12,rc,6                // r12 <- count of 64-byte chunks to move
        rlwinm  r0,rc,11,9,15           // position count of 32-byte blocks for dst
        rlwinm  r11,r12,0,26,31         // mask chunk count down to 0-63
        li      r9,-17                  // r9 <- constant used to access 2nd quadword
        oris    r0,r0,0x0100            // set dst block size to 1 qw
        li      r10,-33                 // r10<- constant used to access 3rd quadword
        ori     r0,r0,0xFFE0            // set dst stride to -32 bytes (one cache block)
        li      r8,-1                   // r8 <- constant used to access 1st quadword
        dstst   rs,r0,3                 // start stream 0
        subi    r0,r11,1                // r11==0 ?
        lvx     va,r8,rs                // prefetch 1st source quadword
        rlwinm  r7,rc,28,30,31          // r7 <- count of 16-byte chunks to move
        lvsl    vp,0,rs                 // get permute vector to right shift
        cmpwi   cr1,r7,0                // remember if any 16-byte chunks
        beq-    LNoChunksReverse        // r12==0, so skip inner loop
        rlwimi  r11,r0,0,25,25          // if r11==0, then r11 <- 64
        sub.    r12,r12,r11             // adjust chunk count, set cr0
        mtctr   r11                     // set up loop count
        li      r11,-49                 // r11<- constant used to access 4th quadword
        li      r0,-64                  // r0 <- used for several purposes
        beq-    cr7,LAlignedLoopBy64Reverse

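// The dstst control word assembled in r0 above uses the standard AltiVec data
// stream encoding: bits 3-7 are the block size in quadwords, bits 8-15 the block
// count, and bits 16-31 the signed stride in bytes.  Here that means "touch one
// quadword every -32 bytes, once per 32-byte cache block of the operand"; the
// reprime code further down uses 0x0440FFC0, i.e. 64 blocks of 4 quadwords with
// a -64 byte stride, to cover the next 4KB.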
// Inner loop for unaligned sources. We copy 64 bytes per iteration.

LoopBy64Reverse:
        lvx     vb,r9,rs                //(1) 2nd source quadword (1st already in va)
        lvx     vc,r10,rs               //(2) 3rd quadword
        lvx     vd,r11,rs               //(3) 4th
        vperm   vx,vb,va,vp             //(3) vx <- 1st destination quadword
        lvx     va,rs,r0                //(4) get 1st qw of next 64-byte chunk (note r0 must be RB)
        vperm   vy,vc,vb,vp             //(4) vy <- 2nd dest qw
        stvx    vx,r8,rd                //(5)
        vperm   vz,vd,vc,vp             //(5) vz <- 3rd destination quadword
        stvx    vy,r9,rd                //(6)
        vperm   vx,va,vd,vp             //(6) vx <- 4th qw
        stvx    vz,r10,rd               //(7)
        subi    rs,rs,64                //(7)
        stvx    vx,r11,rd               //(8)
        subi    rd,rd,64                //(8)
        bdnz    LoopBy64Reverse         //(8)

// End of inner loop. Should we reprime dst stream and restart loop?
// This block is only executed when we're moving more than 4KB.
// It is usually folded out because cr0 is set in the loop prologue.

        beq+    LNoChunksReverse        // r12==0, ie no more chunks to move
        lis     r8,0x0440               // dst control: 64 4-qw blocks
        add.    r12,r12,r0              // set cr0 if more than 4KB remain to xfer
        ori     r8,r8,0xFFC0            // stride is -64 bytes
        dstst   rs,r8,3                 // restart the prefetch stream
        li      r8,64                   // inner loop count
        mtctr   r8                      // initialize loop count to 64
        li      r8,-1                   // restore qw1 offset for inner loop
        b       LoopBy64Reverse         // restart inner loop, xfer another 4KB

// Fewer than 64 bytes remain to be moved.

LNoChunksReverse:                       // r7 and cr1 are set with the number of QWs
        andi.   rc,rc,0xF               // rc <- leftover bytes
        beq-    cr1,LCleanupReverse     // r7==0, ie fewer than 16 bytes left
        mtctr   r7
        beq-    cr7,LAlignedLoopBy16Reverse

LoopBy16Reverse:
        lvx     vb,r9,rs                // vb <- 2nd source quadword
        subi    rs,rs,16
        vperm   vx,vb,va,vp             // vx <- next destination quadword
        vor     va,vb,vb                // va <- vb
        stvx    vx,r8,rd
        subi    rd,rd,16
        bdnz    LoopBy16Reverse

// Fewer than 16 bytes remain to be moved.

LCleanupReverse:                        // rc and cr0 set with remaining byte count
        lwz     r6,-4(r1)               // load VRSave from CR save area
        mtspr   VRSave,r6               // restore caller's live-register bitmask
        beqlr                           // rc==0, ie no leftovers so done
        neg     r7,rc                   // get -(#bytes)
        mtxer   rc                      // byte count
        lswx    r8,r7,rs
        stswx   r8,r7,rd
        blr

// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
// <><>      A L I G N E D   L O N G   R E V E R S E   M O V E S                  <><>
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

// Inner loop. We copy 64 bytes per iteration.

LAlignedLoopBy64Reverse:
        lvx     va,r8,rs                //(1)
        lvx     vb,r9,rs                //(2)
        lvx     vc,r10,rs               //(3)
        lvx     vd,r11,rs               //(4)
        subi    rs,rs,64                //(4)
        stvx    va,r8,rd                //(5)
        stvx    vb,r9,rd                //(6)
        stvx    vc,r10,rd               //(7)
        stvx    vd,r11,rd               //(8)
        subi    rd,rd,64                //(8)
        bdnz    LAlignedLoopBy64Reverse //(8)

// End of inner loop. Loop for next 4KB iff any.

        beq+    LNoChunksReverse        // r12==0, ie no more chunks to move
        lis     r8,0x0440               // dst control: 64 4-qw blocks
        add.    r12,r12,r0              // r12 <- r12 - 64, set cr0
        ori     r8,r8,0xFFC0            // stride is -64 bytes
        dstst   rs,r8,3                 // restart the prefetch stream
        li      r8,64                   // inner loop count
        mtctr   r8                      // initialize loop count to 64
        li      r8,-1                   // restore qw1 offset for inner loop
        b       LAlignedLoopBy64Reverse

// Loop to copy leftover quadwords (1-3).

LAlignedLoopBy16Reverse:
        lvx     va,r8,rs                // get next qw
        subi    rs,rs,16
        stvx    va,r8,rd
        subi    rd,rd,16
        bdnz    LAlignedLoopBy16Reverse

        b       LCleanupReverse         // handle up to 15 bytes in last qw