/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, tuned for G4. The inner loops use DCBA to avoid
 * reading destination cache lines. Only the 7450 actually benefits from
 * this, and then only in the cold-cache case. On 7400s and 7455s, we
 * patch the DCBAs into NOPs.
 *
 * Register usage. Note we use R2, so this code will not run in a PEF/CFM
 * environment. Note also the rather delicate way we assign multiple uses
 * to the same register. Beware.
 *
 *   r0  = "w7" or temp (NB: cannot use r0 for any constant such as "c16")
 *   r2  = "w8" or vrsave ("rv")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r9  = "w4", or "cm1"
 *   r10 = "w5", "c96", or "cm97"
 *   r11 = "w6", "c128", or "cm129"
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 *   v1-v4 = qw's loaded from source
 *   v5-v7 = permuted qw's ("vw", "vx", "vy")
 */
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r2

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9
#define w5      r10
#define w6      r11
#define w7      r0
#define w8      r2

#define c16     r6
#define cm17    r6
#define c32     r7
#define cm33    r7
#define c48     r8
#define cm49    r8
#define cm1     r9
#define c96     r10
#define cm97    r10
#define c128    r11
#define cm129   r11

#define vp      v0
#define vw      v5
#define vx      v6
#define vy      v7

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text

#define kMedium         32      // too long for inline loopless code
#define kLong           96      // long enough to justify use of Altivec


// Main entry points.

        .align  5
bcopy_g4:                       // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kMedium      // short or long?
        sub     w1,r4,r3        // must move in reverse if (rd-rs)<rc
        cmplw   cr1,w1,rc       // set cr1 blt iff we must move reverse
        mr      rd,r4           // start to move registers to canonical spot
        mr      rs,r3
        blt+    LShort          // handle short operands
        dcbt    0,r3            // touch in source
        b       LMedium         // join medium/long operand code
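
// Note on the reverse-move test: w1 = (rd-rs) is compared against rc as an
// unsigned value. If the destination starts less than rc bytes above the
// source, a forward copy would overwrite source bytes before they are read,
// so we copy backwards. When rd is below rs, (rd-rs) wraps to a huge unsigned
// value and the forward path is taken.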

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

        .align  5
Lmemcpy_g4:                     // void* memcpy(void *dst, void *src, size_t len)
Lmemmove_g4:                    // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kMedium      // short or long?
        sub     w1,r3,r4        // must move in reverse if (rd-rs)<rc
        dcbt    0,r4            // touch in the first line of source
        cmplw   cr1,w1,rc       // set cr1 blt iff we must move reverse
        mr      rd,r3           // must leave r3 alone, it is return value for memcpy etc
        bge-    LMedium         // handle medium or long operands

// Handle short operands.

LShort:
        andi.   r0,rc,0x10      // test bit 27 separately (faster on G4)
        mtcrf   0x01,rc         // put length bits 28-31 in cr7
        blt-    cr1,LShortReverse
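
// Note: mtcrf 0x01,rc copies the low four bits of the length into cr7, so the
// bf tests on CR bits 28-31 below select moves of 8, 4, 2, and 1 bytes
// respectively. The 16-byte bit (bit 27, mask 0x10) is tested separately with
// andi., which the comment above notes is faster on the G4.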

// Forward short operands. This is the most frequent case, so it is inline.

        beq     LShort16        // quadword to move?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16
LShort16:                       // join here to xfer 0-15 bytes
        bf      28,2f           // doubleword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        addi    rs,rs,8
        stw     w1,0(rd)
        stw     w2,4(rd)
        addi    rd,rd,8
2:
        bf      29,3f           // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f           // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31              // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr


// Handle short reverse operands.
//      cr0 = bne if bit 27 of length is set
//      cr7 = bits 28-31 of length

LShortReverse:
        add     rs,rs,rc        // adjust ptrs for reverse move
        add     rd,rd,rc
        beq     LShortReverse16 // quadword to move?
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)
LShortReverse16:                // join here to xfer 0-15 bytes and return
        bf      28,2f           // doubleword?
        lwz     w1,-4(rs)
        lwzu    w2,-8(rs)
        stw     w1,-4(rd)
        stwu    w2,-8(rd)
2:
        bf      29,3f           // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f           // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31              // done if no odd byte
        lbz     w1,-1(rs)       // no update
        stb     w1,-1(rd)
        blr


// Medium and long operands. Use Altivec if long enough, else scalar loops.
//      w1 = (rd-rs), used to check for alignment
//      cr1 = blt iff we must move reverse

        .align  4
LMedium:
        dcbtst  0,rd            // touch in destination
        cmplwi  cr7,rc,kLong    // long enough for vectors?
        neg     w3,rd           // start to compute #bytes to align destination
        rlwinm  r0,w1,0,0x7     // check relative 8-byte alignment
        andi.   w6,w3,7         // w6 <- #bytes to 8-byte align destination
        blt     cr1,LMediumReverse // handle reverse moves
        rlwinm  w4,w3,0,0x1F    // w4 <- #bytes to 32-byte align destination
        cmpwi   cr6,r0,0        // set cr6 beq if relatively aligned
        bge     cr7,LFwdLong    // long enough for vectors

// Medium length: use scalar loops.
//      w6/cr0 = #bytes to 8-byte align destination
//      cr6 = beq if relatively doubleword aligned

        sub     rc,rc,w6        // decrement length remaining
        beq     1f              // skip if dest already doubleword aligned
        mtxer   w6              // set up count for move
        lswx    w1,0,rs         // move w6 bytes to align destination
        stswx   w1,0,rd
        add     rs,rs,w6        // bump ptrs past
        add     rd,rd,w6
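
// Note: the byte count for the lswx/stswx pair above comes from the
// string-count field of XER (set by mtxer), so it moves exactly w6 (1-7)
// bytes, loading into successive registers starting at w1.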
1:
        srwi    r0,rc,4         // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc         // save remaining byte count here for LShort16
        mtctr   r0              // set up 16-byte loop
        bne     cr6,3f          // source not doubleword aligned, so use word loads/stores
        b       2f

        .align  4
2:                              // loop over 16-byte aligned chunks
        lfd     f0,0(rs)
        lfd     f1,8(rs)
        addi    rs,rs,16
        stfd    f0,0(rd)
        stfd    f1,8(rd)
        addi    rd,rd,16
        bdnz    2b

        b       LShort16

        .align  4
3:                              // loop over 16-byte unaligned chunks
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16
        bdnz    3b

        b       LShort16


// Vector loops. First, we must 32-byte align the destination.
//      w1 = (rd-rs), used to check for reverse and alignment
//      w4 = #bytes to 32-byte align destination
//      rc = long enough for at least one vector loop

LFwdLong:
        cmpwi   w4,0            // dest already aligned?
        sub     rc,rc,w4        // adjust length
        mtcrf   0x01,w4         // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF     // relatively 16-byte aligned?
        mtcrf   0x02,w4         // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6         // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0        // set cr5 beq if relatively 16-byte aligned
        beq     LFwdAligned     // dest is already aligned

// 32-byte align destination.

        bf      31,1f           // byte to move?
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
1:
        bf      30,2f           // halfword?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
2:
        bf      29,3f           // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      28,4f           // doubleword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        addi    rs,rs,8
        stw     w1,0(rd)
        stw     w2,4(rd)
        addi    rd,rd,8
4:
        bf      27,LFwdAligned  // quadword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16


// Destination is 32-byte aligned.
//      r0 = count of 64-byte chunks to move (not 0)
//      rd = 32-byte aligned
//      rc = bytes remaining
//      cr5 = beq if source is 16-byte aligned
// We set up many registers:
//      ctr = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
//      c16 etc = loaded

LFwdAligned:
        mfspr   rv,vrsave       // get bitmap of live vector registers
        mtcrf   0x01,rc         // move leftover count to cr7 for LShort16
        rlwinm  w3,rc,0,28,31   // move last 0-15 byte count to w3
        mtctr   r0              // set up loop count
        cmpwi   cr6,w3,0        // set cr6 on leftover byte count
        oris    w1,rv,0xFF00    // we use v0-v7
        rlwinm. r0,rc,28,30,31  // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1       // update mask
        li      c16,16          // get constants used in lvx/stvx
        li      c32,32
        li      c48,48
        li      c96,96
        li      c128,128
        bne     cr5,LForwardVecUnal // handle unaligned operands
        b       1f
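
// Note: oris'ing 0xFF00 into vrsave above marks v0-v7 as live (bit i of
// vrsave, counting from the most significant bit, corresponds to vector
// register i), so the kernel preserves them if we are preempted inside the
// loop; the original mask is restored from rv before returning.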

        .align  4
1:                              // loop over 64-byte chunks
        dcbt    c96,rs
        dcbt    c128,rs
        lvx     v1,0,rs
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        addi    rs,rs,64
        dcba    0,rd            // patched to NOP on some machines
        stvx    v1,0,rd
        stvx    v2,c16,rd
        dcba    c32,rd          // patched to NOP on some machines
        stvx    v3,c32,rd
        stvx    v4,c48,rd
        addi    rd,rd,64
        bdnz    1b

        beq     4f              // no leftover quadwords
        mtctr   r0
3:                              // loop over remaining quadwords (1-3)
        lvx     v1,0,rs
        addi    rs,rs,16
        stvx    v1,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv       // restore bitmap of live vr's
        bne     cr6,LShort16    // handle last 0-15 bytes if any
        blr


// Long, forward, unaligned vector loop.

LForwardVecUnal:
        lvsl    vp,0,rs         // get permute vector to shift left
        lvx     v1,0,rs         // prefetch 1st source quadword
        b       1f
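
// Note on the unaligned technique: lvx ignores the low four bits of the
// effective address, so each load fetches the aligned quadword containing the
// bytes we want. lvsl generates a permute control vector from the source
// misalignment, and each vperm below merges two adjacent quadwords into one
// correctly shifted quadword. v1 always holds the look-ahead quadword for the
// next iteration.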

        .align  4               // align inner loops
1:                              // loop over 64-byte chunks
        lvx     v2,c16,rs
        dcbt    c96,rs
        lvx     v3,c32,rs
        dcbt    c128,rs
        lvx     v4,c48,rs
        addi    rs,rs,64
        vperm   vw,v1,v2,vp
        lvx     v1,0,rs
        vperm   vx,v2,v3,vp
        dcba    0,rd            // patched to NOP on some machines
        stvx    vw,0,rd
        vperm   vy,v3,v4,vp
        stvx    vx,c16,rd
        vperm   vw,v4,v1,vp
        dcba    c32,rd          // patched to NOP on some machines
        stvx    vy,c32,rd
        stvx    vw,c48,rd
        addi    rd,rd,64
        bdnz    1b

        beq-    4f              // no leftover quadwords
        mtctr   r0
3:                              // loop over remaining quadwords
        lvx     v2,c16,rs
        addi    rs,rs,16
        vperm   vx,v1,v2,vp
        vor     v1,v2,v2        // v1 <- v2
        stvx    vx,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv       // restore bitmap of live vr's
        bne     cr6,LShort16    // handle last 0-15 bytes if any
        blr


// Medium and long, reverse moves. We use Altivec if the operands are long enough,
// else an lwz/stw loop.
//      w1 = (rd-rs), used to check for reverse and alignment
//      cr7 = bge if long

LMediumReverse:
        add     rd,rd,rc        // point to end of operands
        add     rs,rs,rc
        andi.   w4,rd,0x1F      // w4 <- #bytes to 32-byte align destination
        rlwinm  w6,rd,0,0x3     // w6 <- #bytes to 4-byte align destination
        bge     cr7,LLongReverse // long enough for vectors
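
// Note: in the reverse paths rs and rd have been advanced to just past the end
// of each operand, so all loads and stores below use negative offsets, and the
// update forms (lwzu/stwu etc.) walk the pointers downward.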

// Scalar loop.
//      w6 = #bytes to 4-byte align destination

        sub     rc,rc,w6        // decrement length remaining
        mtxer   w6              // set up count for move
        sub     rs,rs,w6        // back up ptrs
        sub     rd,rd,w6
        srwi    r0,rc,4         // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc         // set remaining byte count here for LShortReverse16
        lswx    w1,0,rs         // move w6 bytes to align destination
        stswx   w1,0,rd
        mtctr   r0              // set up 16-byte loop
        b       1f

        .align  4
1:                              // loop over 16-byte aligned chunks
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)
        bdnz    1b

        b       LShortReverse16


// Reverse vector loops. First, we must 32-byte align the destination.
//      w1 = (rd-rs), used to check for reverse and alignment
//      w4/cr0 = #bytes to 32-byte align destination
//      rc = long enough for at least one vector loop

LLongReverse:
        sub     rc,rc,w4        // adjust length
        mtcrf   0x01,w4         // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF     // relatively 16-byte aligned?
        mtcrf   0x02,w4         // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6         // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0        // set cr5 beq if relatively 16-byte aligned
        beq     LReverseAligned // dest is already aligned

// 32-byte align destination.

        bf      31,1f           // byte to move?
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
1:
        bf      30,2f           // halfword?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
2:
        bf      29,3f           // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      28,4f           // doubleword?
        lwz     w1,-4(rs)
        lwzu    w2,-8(rs)
        stw     w1,-4(rd)
        stwu    w2,-8(rd)
4:
        bf      27,LReverseAligned // quadword?
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)

// Destination is 32-byte aligned.
//      r0 = count of 64-byte chunks to move (not 0)
//      rd = 32-byte aligned
//      rc = bytes remaining
//      cr5 = beq if source is 16-byte aligned
// We set up many registers:
//      ctr = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
//      cm1 etc = loaded

LReverseAligned:
        mfspr   rv,vrsave       // get bitmap of live vector registers
        mtcrf   0x01,rc         // move leftover count to cr7 for LShortReverse16
        rlwinm  w3,rc,0,28,31   // move last 0-15 byte count to w3
        mtctr   r0              // set up loop count
        cmpwi   cr6,w3,0        // set cr6 on leftover byte count
        oris    w1,rv,0xFF00    // we use v0-v7
        rlwinm. r0,rc,28,30,31  // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1       // update mask
        li      cm1,-1          // get constants used in lvx/stvx
        li      cm17,-17
        li      cm33,-33
        li      cm49,-49
        li      cm97,-97
        li      cm129,-129
        bne     cr5,LReverseVecUnal // handle unaligned operands
        b       1f
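
// Note: with rs and rd pointing one byte past the end of the operands, the
// cm1/cm17/... offsets address the last, second-to-last, etc. quadwords; lvx
// and stvx clear the low four bits of the effective address, so (rs-1) with
// the low bits masked is the start of the aligned quadword holding the final
// byte.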

        .align  4               // align inner loops
1:                              // loop over 64-byte chunks
        dcbt    cm97,rs
        dcbt    cm129,rs
        lvx     v1,cm1,rs
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        stvx    v1,cm1,rd
        stvx    v2,cm17,rd
        stvx    v3,cm33,rd
        stvx    v4,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f              // no leftover quadwords
        mtctr   r0
3:                              // loop over remaining quadwords (1-3)
        lvx     v1,cm1,rs
        subi    rs,rs,16
        stvx    v1,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv       // restore bitmap of live vr's
        bne     cr6,LShortReverse16 // handle last 0-15 bytes if any
        blr


// Long, reverse, unaligned vector loop.

LReverseVecUnal:
        lvsl    vp,0,rs         // get permute vector to shift left
        lvx     v1,cm1,rs       // v1 always looks ahead
        b       1f
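
// Note: v1 again carries the look-ahead quadword between iterations, but here
// it is the highest-addressed quadword of each group, so it appears as the
// second operand of the first vperm; each vperm still combines a
// lower-addressed quadword with the next higher-addressed one, in that order.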

        .align  4               // align the inner loops
1:                              // loop over 64-byte chunks
        lvx     v2,cm17,rs
        dcbt    cm97,rs
        lvx     v3,cm33,rs
        dcbt    cm129,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        vperm   vw,v2,v1,vp
        lvx     v1,cm1,rs
        vperm   vx,v3,v2,vp
        stvx    vw,cm1,rd
        vperm   vy,v4,v3,vp
        stvx    vx,cm17,rd
        vperm   vw,v1,v4,vp
        stvx    vy,cm33,rd
        stvx    vw,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     3f              // no leftover quadwords
        mtctr   r0
2:                              // loop over 1-3 quadwords
        lvx     v2,cm17,rs
        subi    rs,rs,16
        vperm   vx,v2,v1,vp
        vor     v1,v2,v2        // v1 <- v2
        stvx    vx,cm1,rd
        subi    rd,rd,16
        bdnz    2b
3:
        mtspr   vrsave,rv       // restore bitmap of live vr's
        bne     cr6,LShortReverse16 // handle last 0-15 bytes if any
        blr

        COMMPAGE_DESCRIPTOR(bcopy_g4,_COMM_PAGE_BCOPY,kHasAltivec,k64Bit,kCommPageDCBA+kCommPage32)
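
// The COMMPAGE_DESCRIPTOR above registers this routine for the commpage; per
// the argument names, it requires Altivec, is excluded on 64-bit processors,
// and carries the kCommPageDCBA and kCommPage32 flags. kCommPageDCBA ties to
// the DCBA-to-NOP patching described in the header comment.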