/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, tuned for G4. The inner loops use DCBA to avoid
 * reading destination cache lines. Only the 7450 actually benefits from
 * this, and then only in the cold-cache case. On 7400s and 7455s, we
 * patch the DCBAs into NOPs.
 *
 * Register usage. Note we use R2, so this code will not run in a PEF/CFM
 * environment. Note also the rather delicate way we assign multiple uses
 * to the same register. Beware.
 *
 *      r0  = "w7" or temp (NB: cannot use r0 for any constant such as "c16")
 *      r2  = "w8" or vrsave ("rv")
 *      r3  = not used, as memcpy and memmove return their 1st parameter
 *      r4  = source ptr ("rs")
 *      r5  = count of bytes to move ("rc")
 *      r6  = "w1", "c16", or "cm17"
 *      r7  = "w2", "c32", or "cm33"
 *      r8  = "w3", "c48", or "cm49"
 *      r9  = "w4", or "cm1"
 *      r10 = "w5", "c96", or "cm97"
 *      r11 = "w6", "c128", or "cm129"
 *      r12 = destination ptr ("rd")
 *      v0  = permute vector ("vp")
 *      v1-v4 = qw's loaded from source
 *      v5-v7 = permuted qw's ("vw", "vx", "vy")
 */
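
/*
 * For orientation, here is a rough, hypothetical C model of the direction
 * test and return-value convention implemented below (illustration only,
 * not part of the build; memmove_sketch and its byte loops merely stand in
 * for the tuned LShort/LMedium/LFwdLong paths and their reverse twins):
 *
 *      #include <stddef.h>
 *      #include <stdint.h>
 *
 *      void *memmove_sketch(void *dst, const void *src, size_t len) {
 *          unsigned char *d = dst;
 *          const unsigned char *s = src;
 *          // Copy backward only when the destination overlaps the end of
 *          // the source, ie when (dst - src) < len as an unsigned compare.
 *          if ((uintptr_t)d - (uintptr_t)s < len) {
 *              while (len--)               // reverse: high addresses first
 *                  d[len] = s[len];
 *          } else {
 *              // The real code further splits the forward (and reverse)
 *              // case by size: len < kMedium (32), < kLong (96), or longer.
 *              for (size_t i = 0; i < len; i++)
 *                  d[i] = s[i];
 *          }
 *          return dst;         // memcpy/memmove return dst; bcopy returns nothing
 *      }
 *
 * bcopy() takes (src, dst, len) instead, which is why its entry point below
 * copies r3/r4 into rs/rd in the opposite order before joining the common code.
 */
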
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r2

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9
#define w5      r10
#define w6      r11
#define w7      r0
#define w8      r2

#define c16     r6
#define cm17    r6
#define c32     r7
#define cm33    r7
#define c48     r8
#define cm49    r8
#define cm1     r9
#define c96     r10
#define cm97    r10
#define c128    r11
#define cm129   r11

#define vp      v0
#define vw      v5
#define vx      v6
#define vy      v7

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
        .globl  EXT(bcopy_g4)

#define kMedium 32                      // too long for inline loopless code
#define kLong   96                      // long enough to justify use of Altivec


// Main entry points.

        .align  5
bcopy_g4:                               // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kMedium              // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        mr      rd,r4                   // start to move registers to canonical spots
        mr      rs,r3
        blt+    LShort                  // handle short operands
        dcbt    0,r3                    // touch in the first line of source
        b       LMedium                 // join medium/long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

        .align  5
Lmemcpy_g4:                             // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_g4:                            // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kMedium              // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        dcbt    0,r4                    // touch in the first line of source
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge-    LMedium                 // handle medium or long operands

// Handle short operands.

LShort:
        andi.   r0,rc,0x10              // test bit 27 separately (faster on G4)
        mtcrf   0x01,rc                 // put length bits 28-31 in cr7
        blt-    cr1,LShortReverse

// Forward short operands. This is the most frequent case, so it is inline.

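// How the bit tests below decompose a short length:  the low five bits of rc
// say which power-of-two sized moves remain.  "andi. r0,rc,0x10" tests the
// 16-byte bit (big-endian bit 27) into cr0, and "mtcrf 0x01,rc" copies bits
// 28-31 (the 8-, 4-, 2-, and 1-byte bits) into cr7 so the bf 28/29/30/31
// tests can skip each piece.  For example, a hypothetical rc = 13 = 0b01101
// moves 8 + 4 + 1 bytes and skips the 16- and 2-byte transfers.
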
        beq     LShort16                // quadword to move?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        addi    rs,rs,8
        stw     w1,0(rd)
        stw     w2,4(rd)
        addi    rd,rd,8
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f                   // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31                      // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr


// Handle short reverse operands.
//      cr0 = bne if bit 27 of length is set
//      cr7 = bits 28-31 of length

LShortReverse:
        add     rs,rs,rc                // adjust ptrs for reverse move
        add     rd,rd,rc
        beq     LShortReverse16         // quadword to move?
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        lwz     w1,-4(rs)
        lwzu    w2,-8(rs)
        stw     w1,-4(rd)
        stwu    w2,-8(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f                   // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
        stb     w1,-1(rd)
        blr


// Medium and long operands. Use Altivec if long enough, else scalar loops.
//      w1 = (rd-rs), used to check for alignment
//      cr1 = blt iff we must move reverse

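// "Relatively aligned" below refers to the alignment of the difference
// (rd-rs), not of either pointer:  if the low bits of the difference are
// zero, source and destination share the same alignment, so once rd has
// been aligned rs is aligned too (both pointers advance by the same amount,
// leaving rd-rs unchanged).  For example, hypothetical operands rs = 0x1003
// and rd = 0x200B differ by 0x1008, so they are relatively 8-byte aligned
// even though neither pointer is.
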
        .align  4
LMedium:
        dcbtst  0,rd                    // touch in destination
        cmplwi  cr7,rc,kLong            // long enough for vectors?
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  r0,w1,0,0x7             // check relative 8-byte alignment
        andi.   w6,w3,7                 // w6 <- #bytes to 8-byte align destination
        blt     cr1,LMediumReverse      // handle reverse moves
        rlwinm  w4,w3,0,0x1F            // w4 <- #bytes to 32-byte align destination
        cmpwi   cr6,r0,0                // set cr6 beq if relatively aligned
        bge     cr7,LFwdLong            // long enough for vectors

// Medium length: use scalar loops.
//      w6/cr0 = #bytes to 8-byte align destination
//      cr6 = beq if relatively doubleword aligned

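// The alignment fix-up below uses the string instructions:  mtxer puts the
// byte count (0-7 here) in XER, and lswx/stswx then move exactly that many
// bytes through consecutive GPRs starting at w1, avoiding a byte-by-byte
// loop for the odd leading bytes.
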
        sub     rc,rc,w6                // decrement length remaining
        beq     1f                      // skip if dest already doubleword aligned
        mtxer   w6                      // set up count for move
        lswx    w1,0,rs                 // move w6 bytes to align destination
        stswx   w1,0,rd
        add     rs,rs,w6                // bump ptrs past
        add     rd,rd,w6
1:
        srwi    r0,rc,4                 // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc                 // save remaining byte count here for LShort16
        mtctr   r0                      // set up 16-byte loop
        bne     cr6,3f                  // source not relatively doubleword aligned
        b       2f

        .align  4
2:                                      // loop over 16-byte aligned chunks
        lfd     f0,0(rs)
        lfd     f1,8(rs)
        addi    rs,rs,16
        stfd    f0,0(rd)
        stfd    f1,8(rd)
        addi    rd,rd,16
        bdnz    2b

        b       LShort16

        .align  4
3:                                      // loop over 16-byte unaligned chunks
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16
        bdnz    3b

        b       LShort16


// Vector loops. First, we must 32-byte align the destination.
//      w1 = (rd-rs), used to check for reverse and alignment
//      w4 = #bytes to 32-byte align destination
//      rc = long enough for at least one vector loop

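// The destination is aligned to 32 bytes because that is the cache line
// size on these CPUs:  the dcba and dcbt in the loops below operate on
// whole 32-byte lines, and each 64-byte iteration then stores exactly two
// destination lines.  The bf 31/30/29/28/27 ladder below peels off 1, 2, 4,
// 8, and 16 bytes as selected by the low five bits of w4.
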
LFwdLong:
        cmpwi   w4,0                    // dest already aligned?
        sub     rc,rc,w4                // adjust length
        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF             // relatively 16-byte aligned?
        mtcrf   0x02,w4                 // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6                 // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        beq     LFwdAligned             // dest is already aligned

// 32-byte align destination.

        bf      31,1f                   // byte to move?
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
1:
        bf      30,2f                   // halfword?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      28,4f                   // doubleword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        addi    rs,rs,8
        stw     w1,0(rd)
        stw     w2,4(rd)
        addi    rd,rd,8
4:
        bf      27,LFwdAligned          // quadword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16


// Destination is 32-byte aligned.
//      r0 = count of 64-byte chunks to move (not 0)
//      rd = 32-byte aligned
//      rc = bytes remaining
//      cr5 = beq if source is 16-byte aligned
// We set up many registers:
//      ctr = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
//      c16 etc = loaded

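// Prefetch/store strategy for the 64-byte loops:  "dcbt c96,rs" and
// "dcbt c128,rs" touch the source 96 and 128 bytes ahead so the loads hit
// in the cache, while dcba establishes the two destination lines about to
// be written without reading them from memory.  As the header comment
// notes, the DCBAs are patched into NOPs on CPUs that do not benefit
// (7400/7455); see the kCommPageDCBA flag in the descriptor at the end of
// this file.
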
LFwdAligned:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w1,rv,0xFF00            // we use v0-v7
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1               // update mask
        li      c16,16                  // get constants used in lvx/stvx
        li      c32,32
        li      c48,48
        li      c96,96
        li      c128,128
        bne     cr5,LForwardVecUnal     // handle unaligned operands
        b       1f

        .align  4
1:                                      // loop over 64-byte chunks
        dcbt    c96,rs
        dcbt    c128,rs
        lvx     v1,0,rs
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        addi    rs,rs,64
        dcba    0,rd                    // patched to NOP on some machines
        stvx    v1,0,rd
        stvx    v2,c16,rd
        dcba    c32,rd                  // patched to NOP on some machines
        stvx    v3,c32,rd
        stvx    v4,c48,rd
        addi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords (1-3)
        lvx     v1,0,rs
        addi    rs,rs,16
        stvx    v1,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Long, forward, unaligned vector loop.

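// Misaligned-source technique:  lvx always loads the 16-byte-aligned
// quadword containing its effective address, so for an unaligned rs the
// loop keeps two adjacent aligned quadwords in registers and uses vperm to
// extract the 16 misaligned bytes spanning them.  lvsl builds the permute
// control vector from the low four bits of rs; for example, if rs is 5
// bytes past an alignment boundary, "vperm vw,v1,v2,vp" selects bytes 5-15
// of v1 followed by bytes 0-4 of v2.
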
LForwardVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
        b       1f

        .align  4                       // align inner loops
1:                                      // loop over 64-byte chunks
        lvx     v2,c16,rs
        dcbt    c96,rs
        lvx     v3,c32,rs
        dcbt    c128,rs
        lvx     v4,c48,rs
        addi    rs,rs,64
        vperm   vw,v1,v2,vp
        lvx     v1,0,rs
        vperm   vx,v2,v3,vp
        dcba    0,rd                    // patched to NOP on some machines
        stvx    vw,0,rd
        vperm   vy,v3,v4,vp
        stvx    vx,c16,rd
        vperm   vw,v4,v1,vp
        dcba    c32,rd                  // patched to NOP on some machines
        stvx    vy,c32,rd
        stvx    vw,c48,rd
        addi    rd,rd,64
        bdnz    1b

        beq-    4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords
        lvx     v2,c16,rs
        addi    rs,rs,16
        vperm   vx,v1,v2,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Medium and long, reverse moves. We use Altivec if the operands are long enough,
// else a lwz/stw loop.
//      w1 = (rd-rs), used to check for reverse and alignment
//      cr7 = bge if long

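// For reverse moves the pointers are first advanced to just past the end of
// each operand, and everything proceeds toward lower addresses using
// negative offsets and the update forms (lwzu/stwu, or the cm1/cm17/...
// constants with lvx/stvx), so overlapping source bytes are always read
// before they are overwritten.
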
LMediumReverse:
        add     rd,rd,rc                // point to end of operands
        add     rs,rs,rc
        andi.   w4,rd,0x1F              // w4 <- #bytes to 32-byte align destination
        rlwinm  w6,rd,0,0x3             // w6 <- #bytes to 4-byte align destination
        bge     cr7,LLongReverse        // long enough for vectors

// Scalar loop.
//      w6 = #bytes to 4-byte align destination

        sub     rc,rc,w6                // decrement length remaining
        mtxer   w6                      // set up count for move
        sub     rs,rs,w6                // back up ptrs
        sub     rd,rd,w6
        srwi    r0,rc,4                 // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc                 // set remaining byte count here for LShortReverse16
        lswx    w1,0,rs                 // move w6 bytes to align destination
        stswx   w1,0,rd
        mtctr   r0                      // set up 16-byte loop
        b       1f

        .align  4
1:                                      // loop over 16-byte aligned chunks
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)
        bdnz    1b

        b       LShortReverse16


// Reverse vector loops. First, we must 32-byte align the destination.
//      w1 = (rd-rs), used to check for reverse and alignment
//      w4/cr0 = #bytes to 32-byte align destination
//      rc = long enough for at least one vector loop

LLongReverse:
        sub     rc,rc,w4                // adjust length
        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF             // relatively 16-byte aligned?
        mtcrf   0x02,w4                 // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6                 // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        beq     LReverseAligned         // dest is already aligned

// 32-byte align destination.

        bf      31,1f                   // byte to move?
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
1:
        bf      30,2f                   // halfword?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      28,4f                   // doubleword?
        lwz     w1,-4(rs)
        lwzu    w2,-8(rs)
        stw     w1,-4(rd)
        stwu    w2,-8(rd)
4:
        bf      27,LReverseAligned      // quadword?
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)

// Destination is 32-byte aligned.
//      r0 = count of 64-byte chunks to move (not 0)
//      rd = 32-byte aligned
//      rc = bytes remaining
//      cr5 = beq if source is 16-byte aligned
// We set up many registers:
//      ctr = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
//      cm1 etc = loaded

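// The negative constants are chosen so that, with rs and rd pointing just
// past the data still to be moved, "lvx v1,cm1,rs" addresses rs-1 and
// (because lvx ignores the low four bits of the effective address) loads
// the aligned quadword containing the last byte still to be copied; cm17,
// cm33, and cm49 reach the three quadwords before it, and cm97/cm129 are
// the dcbt prefetch offsets for the reverse direction.
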
LReverseAligned:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w1,rv,0xFF00            // we use v0-v7
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1               // update mask
        li      cm1,-1                  // get constants used in lvx/stvx
        li      cm17,-17
        li      cm33,-33
        li      cm49,-49
        li      cm97,-97
        li      cm129,-129
        bne     cr5,LReverseVecUnal     // handle unaligned operands
        b       1f

        .align  4                       // align inner loops
1:                                      // loop over 64-byte chunks
        dcbt    cm97,rs
        dcbt    cm129,rs
        lvx     v1,cm1,rs
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        stvx    v1,cm1,rd
        stvx    v2,cm17,rd
        stvx    v3,cm33,rd
        stvx    v4,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords (1-3)
        lvx     v1,cm1,rs
        subi    rs,rs,16
        stvx    v1,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr


// Long, reverse, unaligned vector loop.

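// Note the vperm operand order is swapped relative to the forward unaligned
// loop ("vperm vw,v2,v1,vp" instead of "vw,v1,v2,vp"):  vperm wants the
// lower-addressed quadword of each adjacent pair as its first source, and
// in the reverse loop the look-ahead register v1 holds the higher-addressed
// quadword.
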
LReverseVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
        b       1f

        .align  4                       // align the inner loops
1:                                      // loop over 64-byte chunks
        lvx     v2,cm17,rs
        dcbt    cm97,rs
        lvx     v3,cm33,rs
        dcbt    cm129,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        vperm   vw,v2,v1,vp
        lvx     v1,cm1,rs
        vperm   vx,v3,v2,vp
        stvx    vw,cm1,rd
        vperm   vy,v4,v3,vp
        stvx    vx,cm17,rd
        vperm   vw,v1,v4,vp
        stvx    vy,cm33,rd
        stvx    vw,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     3f                      // no leftover quadwords
        mtctr   r0
2:                                      // loop over 1-3 quadwords
        lvx     v2,cm17,rs
        subi    rs,rs,16
        vperm   vx,v2,v1,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,cm1,rd
        subi    rd,rd,16
        bdnz    2b
3:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr

        COMMPAGE_DESCRIPTOR(bcopy_g4,_COMM_PAGE_BCOPY,kHasAltivec,k64Bit,kCommPageDCBA)