/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, tuned for G4. The inner loops use DCBA to avoid
 * reading destination cache lines. Only the 7450 actually benefits from
 * this, and then only in the cold-cache case. On 7400s and 7455s, we
 * patch the DCBAs into NOPs.
 *
 * Register usage. Note we use R2, so this code will not run in a PEF/CFM
 * environment. Note also the rather delicate way we assign multiple uses
 * to the same register. Beware.
 *
 *   r0 = "w7" or temp (NB: cannot use r0 for any constant such as "c16")
 *   r2 = "w8" or vrsave ("rv")
 *   r3 = not used, as memcpy and memmove return 1st parameter as a value
 *   r4 = source ptr ("rs")
 *   r5 = count of bytes to move ("rc")
 *   r6 = "w1", "c16", or "cm17"
 *   r7 = "w2", "c32", or "cm33"
 *   r8 = "w3", "c48", or "cm49"
 *   r9 = "w4", or "cm1"
 *  r10 = "w5", "c96", or "cm97"
 *  r11 = "w6", "c128", or "cm129"
 *  r12 = destination ptr ("rd")
 *   v0 = permute vector ("vp")
 * v1-v4 = qw's loaded from source
 * v5-v7 = permuted qw's ("vw", "vx", "vy")
 */
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r2

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9
#define w5      r10
#define w6      r11
#define w7      r0
#define w8      r2

#define c16     r6
#define cm17    r6
#define c32     r7
#define cm33    r7
#define c48     r8
#define cm49    r8
#define cm1     r9
#define c96     r10
#define cm97    r10
#define c128    r11
#define cm129   r11

#define vp      v0
#define vw      v5
#define vx      v6
#define vy      v7

#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text

#define kMedium 32                      // too long for inline loopless code
#define kLong   96                      // long enough to justify use of Altivec


// Main entry points.

        .align  5
bcopy_g4:                               // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kMedium              // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        mr      rd,r4                   // start to move registers to canonic spot
        mr      rs,r3
        blt+    LShort                  // handle short operands
        dcbt    0,r3                    // touch in first line of source
        b       LMedium                 // join medium/long operand code
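
// Note on the direction test: the unsigned compare (rd-rs)<rc is true exactly
// when the destination starts inside the source buffer [rs,rs+rc), the only
// case where a forward copy would overwrite source bytes before they are read.
// When rd<rs the subtraction wraps to a huge unsigned value, so the compare
// fails and the faster forward path is taken.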

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

        .align  5
Lmemcpy_g4:                             // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_g4:                            // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kMedium              // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        dcbt    0,r4                    // touch in the first line of source
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge-    LMedium                 // handle medium or long operands

// Handle short operands.

LShort:
        andi.   r0,rc,0x10              // test bit 27 separately (faster on G4)
        mtcrf   0x01,rc                 // put length bits 28-31 in cr7
        blt-    cr1,LShortReverse
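
// The length dispatch works through condition-register bits rather than
// arithmetic: mtcrf 0x01,rc copies bits 28-31 of rc (the 8/4/2/1 bits of the
// length) into cr7, while the andi. above tests the 16's bit in cr0.  Each
// "bf n" below then skips the block for one power of two.  For example,
// rc=13 (0b1101) takes the doubleword, word, and byte blocks: 8+4+1 bytes.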

// Forward short operands. This is the most frequent case, so it is inline.

        beq     LShort16                // quadword to move?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        addi    rs,rs,8
        stw     w1,0(rd)
        stw     w2,4(rd)
        addi    rd,rd,8
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f                   // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31                      // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr


// Handle short reverse operands.
// cr0 = bne if bit 27 of length is set
// cr7 = bits 28-31 of length

LShortReverse:
        add     rs,rs,rc                // adjust ptrs for reverse move
        add     rd,rd,rc
        beq     LShortReverse16         // quadword to move?
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        lwz     w1,-4(rs)
        lwzu    w2,-8(rs)
        stw     w1,-4(rd)
        stwu    w2,-8(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f                   // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
        stb     w1,-1(rd)
        blr


// Medium and long operands. Use Altivec if long enough, else scalar loops.
// w1 = (rd-rs), used to check for alignment
// cr1 = blt iff we must move reverse

        .align  4
LMedium:
        dcbtst  0,rd                    // touch in destination
        cmplwi  cr7,rc,kLong            // long enough for vectors?
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  r0,w1,0,0x7             // check relative 8-byte alignment
        andi.   w6,w3,7                 // w6 <- #bytes to 8-byte align destination
        blt     cr1,LMediumReverse      // handle reverse moves
        rlwinm  w4,w3,0,0x1F            // w4 <- #bytes to 32-byte align destination
        cmpwi   cr6,r0,0                // set cr6 beq if relatively aligned
        bge     cr7,LFwdLong            // long enough for vectors

// Medium length: use scalar loops.
// w6/cr0 = #bytes to 8-byte align destination
// cr6 = beq if relatively doubleword aligned

        sub     rc,rc,w6                // decrement length remaining
        beq     1f                      // skip if dest already doubleword aligned
        mtxer   w6                      // set up count for move
        lswx    w1,0,rs                 // move w6 bytes to align destination
        stswx   w1,0,rd
        add     rs,rs,w6                // bump ptrs past
        add     rd,rd,w6
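
// lswx/stswx are the load/store-string-indexed instructions: they move the
// byte count held in the low seven bits of XER (set by the mtxer above), so
// the 0-7 alignment bytes are transferred by one instruction pair with no
// per-byte branching.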
1:
        srwi    r0,rc,4                 // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc                 // save remaining byte count here for LShort16
        mtctr   r0                      // set up 16-byte loop
        bne     cr6,3f                  // source not relatively doubleword aligned
        b       2f
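
// When the operands are relatively doubleword aligned, the 16-byte chunks are
// moved through the FPRs: each lfd/stfd pair moves 8 bytes, so the loop issues
// half as many loads and stores as the word-at-a-time loop used otherwise.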

        .align  4
2:                                      // loop over 16-byte aligned chunks
        lfd     f0,0(rs)
        lfd     f1,8(rs)
        addi    rs,rs,16
        stfd    f0,0(rd)
        stfd    f1,8(rd)
        addi    rd,rd,16
        bdnz    2b

        b       LShort16

        .align  4
3:                                      // loop over 16-byte unaligned chunks
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16
        bdnz    3b

        b       LShort16


// Vector loops. First, we must 32-byte align the destination.
// w1 = (rd-rs), used to check for reverse and alignment
// w4 = #bytes to 32-byte align destination
// rc = long enough for at least one vector loop

LFwdLong:
        cmpwi   w4,0                    // dest already aligned?
        sub     rc,rc,w4                // adjust length
        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF             // relatively 16-byte aligned?
        mtcrf   0x02,w4                 // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6                 // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        beq     LFwdAligned             // dest is already aligned
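
// w4 (0-31 bytes needed to 32-byte align the destination) is spread across two
// condition-register fields: mtcrf 0x01 put bits 28-31 (the 8/4/2/1 bits) into
// cr7, and mtcrf 0x02 put bits 24-27 into cr6, so "bf 27" below tests the 16's
// bit.  The peel-off copies 1, 2, 4, 8, then 16 bytes as needed; doing the
// sizes in increasing order keeps every store to the destination naturally
// aligned.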

// 32-byte align destination.

        bf      31,1f                   // byte to move?
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
1:
        bf      30,2f                   // halfword?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      28,4f                   // doubleword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        addi    rs,rs,8
        stw     w1,0(rd)
        stw     w2,4(rd)
        addi    rd,rd,8
4:
        bf      27,LFwdAligned          // quadword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16


// Destination is 32-byte aligned.
// r0 = count of 64-byte chunks to move (not 0)
// rd = 32-byte aligned
// rc = bytes remaining
// cr5 = beq if source is 16-byte aligned
// We set up many registers:
// ctr = number of 64-byte chunks to move
// r0/cr0 = leftover QWs to move
// cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// cr6 = beq if leftover byte count is 0
// rv = original value of vrsave
// c16 etc = loaded

LFwdAligned:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w1,rv,0xFF00            // we use v0-v7
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1               // update mask
        li      c16,16                  // get constants used in lvx/stvx
        li      c32,32
        li      c48,48
        li      c96,96
        li      c128,128
        bne     cr5,LForwardVecUnal     // handle unaligned operands
        b       1f
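
// At this point ctr holds rc>>6 (the 64-byte chunk count) and r0 holds
// (rc>>4)&3, the 0-3 leftover quadwords; the rlwinm. also set cr0 for the
// "beq 4f" after the loop.  The oris marked v0-v7 live in the top eight bits
// of vrsave.  Inside the loop, dcbt prefetches 96 and 128 bytes ahead of the
// source, and dcba establishes each destination cache line without reading it
// from memory (patched to NOP where that is a loss, per the header comment).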

        .align  4
1:                                      // loop over 64-byte chunks
        dcbt    c96,rs
        dcbt    c128,rs
        lvx     v1,0,rs
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        addi    rs,rs,64
        dcba    0,rd                    // patched to NOP on some machines
        stvx    v1,0,rd
        stvx    v2,c16,rd
        dcba    c32,rd                  // patched to NOP on some machines
        stvx    v3,c32,rd
        stvx    v4,c48,rd
        addi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords (1-3)
        lvx     v1,0,rs
        addi    rs,rs,16
        stvx    v1,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Long, forward, unaligned vector loop.
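//
// This is the standard Altivec misalignment technique: lvx loads the aligned
// quadword containing the effective address (the low four address bits are
// ignored), and lvsl builds a permute control vector from those same low four
// bits.  vperm of two adjacent aligned quadwords through that control vector
// produces the 16 unaligned source bytes that span them.  v1 carries the last
// quadword of each chunk into the next iteration, so every source quadword is
// loaded only once.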

LForwardVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
        b       1f

        .align  4                       // align inner loops
1:                                      // loop over 64-byte chunks
        lvx     v2,c16,rs
        dcbt    c96,rs
        lvx     v3,c32,rs
        dcbt    c128,rs
        lvx     v4,c48,rs
        addi    rs,rs,64
        vperm   vw,v1,v2,vp
        lvx     v1,0,rs
        vperm   vx,v2,v3,vp
        dcba    0,rd                    // patched to NOP on some machines
        stvx    vw,0,rd
        vperm   vy,v3,v4,vp
        stvx    vx,c16,rd
        vperm   vw,v4,v1,vp
        dcba    c32,rd                  // patched to NOP on some machines
        stvx    vy,c32,rd
        stvx    vw,c48,rd
        addi    rd,rd,64
        bdnz    1b

        beq-    4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords
        lvx     v2,c16,rs
        addi    rs,rs,16
        vperm   vx,v1,v2,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Medium and long, reverse moves. We use Altivec if the operands are long enough,
// else a lwz/stw loop.
// w1 = (rd-rs), used to check for reverse and alignment
// cr7 = bge if long

LMediumReverse:
        add     rd,rd,rc                // point to end of operands
        add     rs,rs,rc
        andi.   w4,rd,0x1F              // w4 <- #bytes to 32-byte align destination
        rlwinm  w6,rd,0,0x3             // w6 <- #bytes to 4-byte align destination
        bge     cr7,LLongReverse        // long enough for vectors

// Scalar loop.
// w6 = #bytes to 4-byte align destination

        sub     rc,rc,w6                // decrement length remaining
        mtxer   w6                      // set up count for move
        sub     rs,rs,w6                // back up ptrs
        sub     rd,rd,w6
        srwi    r0,rc,4                 // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc                 // set remaining byte count here for LShortReverse16
        lswx    w1,0,rs                 // move w6 bytes to align destination
        stswx   w1,0,rd
        mtctr   r0                      // set up 16-byte loop
        b       1f

        .align  4
1:                                      // loop over 16-byte aligned chunks
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)
        bdnz    1b

        b       LShortReverse16


// Reverse vector loops. First, we must 32-byte align the destination.
// w1 = (rd-rs), used to check for reverse and alignment
// w4/cr0 = #bytes to 32-byte align destination
// rc = long enough for at least one vector loop

LLongReverse:
        sub     rc,rc,w4                // adjust length
        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF             // relatively 16-byte aligned?
        mtcrf   0x02,w4                 // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6                 // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        beq     LReverseAligned         // dest is already aligned

// 32-byte align destination.

        bf      31,1f                   // byte to move?
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
1:
        bf      30,2f                   // halfword?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      28,4f                   // doubleword?
        lwz     w1,-4(rs)
        lwzu    w2,-8(rs)
        stw     w1,-4(rd)
        stwu    w2,-8(rd)
4:
        bf      27,LReverseAligned      // quadword?
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)

// Destination is 32-byte aligned.
// r0 = count of 64-byte chunks to move (not 0)
// rd = 32-byte aligned
// rc = bytes remaining
// cr5 = beq if source is 16-byte aligned
// We set up many registers:
// ctr = number of 64-byte chunks to move
// r0/cr0 = leftover QWs to move
// cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// cr6 = beq if leftover byte count is 0
// rv = original value of vrsave
// cm1 etc = loaded

LReverseAligned:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w1,rv,0xFF00            // we use v0-v7
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1               // update mask
        li      cm1,-1                  // get constants used in lvx/stvx
        li      cm17,-17
        li      cm33,-33
        li      cm49,-49
        li      cm97,-97
        li      cm129,-129
        bne     cr5,LReverseVecUnal     // handle unaligned operands
        b       1f
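
// In the reverse loops rs and rd point just past the data still to be moved,
// so the negative constants cm1, cm17, cm33, ... index successively lower
// quadwords.  lvx/stvx ignore the low four bits of the effective address, so
// "lvx v1,cm1,rs" fetches the aligned quadword containing byte rs-1, which in
// this aligned path is exactly the highest 16 source bytes not yet copied.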

        .align  4                       // align inner loops
1:                                      // loop over 64-byte chunks
        dcbt    cm97,rs
        dcbt    cm129,rs
        lvx     v1,cm1,rs
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        stvx    v1,cm1,rd
        stvx    v2,cm17,rd
        stvx    v3,cm33,rd
        stvx    v4,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords (1-3)
        lvx     v1,cm1,rs
        subi    rs,rs,16
        stvx    v1,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr


// Long, reverse, unaligned vector loop.
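//
// Same permute technique as the forward unaligned loop, but the loads walk
// downward and v1 carries the higher-addressed quadword from one iteration to
// the next, so the vperm operands appear in the opposite register order; the
// lower-addressed quadword is still the first operand.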

LReverseVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
        b       1f

        .align  4                       // align the inner loops
1:                                      // loop over 64-byte chunks
        lvx     v2,cm17,rs
        dcbt    cm97,rs
        lvx     v3,cm33,rs
        dcbt    cm129,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        vperm   vw,v2,v1,vp
        lvx     v1,cm1,rs
        vperm   vx,v3,v2,vp
        stvx    vw,cm1,rd
        vperm   vy,v4,v3,vp
        stvx    vx,cm17,rd
        vperm   vw,v1,v4,vp
        stvx    vy,cm33,rd
        stvx    vw,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     3f                      // no leftover quadwords
        mtctr   r0
2:                                      // loop over 1-3 quadwords
        lvx     v2,cm17,rs
        subi    rs,rs,16
        vperm   vx,v2,v1,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,cm1,rd
        subi    rd,rd,16
        bdnz    2b
3:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr

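// The COMMPAGE_DESCRIPTOR below registers this routine with the commpage
// populator: kHasAltivec and k64Bit are the required and excluded CPU feature
// bits (this is the 32-bit G4 variant; 64-bit processors get a different
// version), kCommPageDCBA flags the DCBA instructions for patching to NOPs as
// described in the header comment, and kCommPage32 restricts the routine to
// the 32-bit commpage.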
        COMMPAGE_DESCRIPTOR(bcopy_g4,_COMM_PAGE_BCOPY,kHasAltivec,k64Bit,kCommPageDCBA+kCommPage32)