/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, tuned for G4. The inner loops use DCBA to avoid
 * reading destination cache lines. Only the 7450 actually benefits from
 * this, and then only in the cold-cache case. On 7400s and 7455s, we
 * patch the DCBAs into NOPs.
 *
 * Register usage. Note we use R2, so this code will not run in a PEF/CFM
 * environment. Note also the rather delicate way we assign multiple uses
 * to the same register. Beware.
 *
 *   r0  = "w7" or temp (NB: cannot use r0 for any constant such as "c16")
 *   r2  = "w8" or vrsave ("rv")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r9  = "w4", or "cm1"
 *   r10 = "w5", "c96", or "cm97"
 *   r11 = "w6", "c128", or "cm129"
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 *   v1-v4 = qw's loaded from source
 *   v5-v7 = permuted qw's ("vw", "vx", "vy")
 */
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r2

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9
#define w5      r10
#define w6      r11
#define w7      r0
#define w8      r2

#define c16     r6
#define cm17    r6
#define c32     r7
#define cm33    r7
#define c48     r8
#define cm49    r8
#define cm1     r9
#define c96     r10
#define cm97    r10
#define c128    r11
#define cm129   r11

#define vp      v0
#define vw      v5
#define vx      v6
#define vy      v7

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>
        .text

#define kMedium 32                      // too long for inline loopless code
#define kLong   96                      // long enough to justify use of Altivec

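// As an aid to reading the assembly, here is a minimal C model of the dispatch
// below. It is only an illustrative sketch with invented names (copy_model etc);
// the real code makes these decisions with cr1 and the kMedium/kLong compares.
//
//      #include <stddef.h>
//      #include <stdint.h>
//
//      static void fwd(unsigned char *d, const unsigned char *s, size_t n) {
//          while (n--) *d++ = *s++;                // stands in for the forward paths
//      }
//      static void rev(unsigned char *d, const unsigned char *s, size_t n) {
//          d += n; s += n;
//          while (n--) *--d = *--s;                // stands in for the reverse paths
//      }
//      void copy_model(void *dst, const void *src, size_t len) {
//          // cr1 "blt": move in reverse only if dst starts inside src, i.e.
//          // (dst - src) compared as an unsigned number is smaller than len.
//          int reverse = (uintptr_t)dst - (uintptr_t)src < (uintptr_t)len;
//          // len < kMedium (32): inline loopless code; len < kLong (96):
//          // scalar 16-byte loops; otherwise: Altivec 64-byte loops.
//          if (reverse) rev(dst, src, len);
//          else         fwd(dst, src, len);
//      }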
// Main entry points.

        .align  5
bcopy_g4:                               // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kMedium              // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        mr      rd,r4                   // start to move registers to canonic spot
        mr      rs,r3
        blt+    LShort                  // handle short operands
        dcbt    0,r3                    // touch in first line of source
        b       LMedium                 // join medium/long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

        .align  5
Lmemcpy_g4:                             // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_g4:                            // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kMedium              // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        dcbt    0,r4                    // touch in the first line of source
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge-    LMedium                 // handle medium or long operands

// Handle short operands.

LShort:
        andi.   r0,rc,0x10              // test bit 27 separately (faster on G4)
        mtcrf   0x01,rc                 // put length bits 28-31 in cr7
        blt-    cr1,LShortReverse

// Forward short operands. This is the most frequent case, so it is inline.

        beq     LShort16                // quadword to move?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        addi    rs,rs,8
        stw     w1,0(rd)
        stw     w2,4(rd)
        addi    rd,rd,8
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f                   // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31                      // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr

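// The short forward path above never loops: "mtcrf 0x01,rc" drops the low four
// length bits into cr7, "andi. r0,rc,0x10" tests bit 27, and the bf tests peel
// off 16-, 8-, 4-, 2- and 1-byte pieces. A C sketch of the same idea (the
// helper name is invented):
//
//      #include <string.h>
//
//      static void short_copy_model(unsigned char *d, const unsigned char *s,
//                                   unsigned len)          // len is 0..31 here
//      {
//          if (len & 16) { memcpy(d, s, 16); d += 16; s += 16; }   // cr0 test
//          if (len & 8)  { memcpy(d, s, 8);  d += 8;  s += 8;  }   // bf 28
//          if (len & 4)  { memcpy(d, s, 4);  d += 4;  s += 4;  }   // bf 29
//          if (len & 2)  { memcpy(d, s, 2);  d += 2;  s += 2;  }   // bf 30
//          if (len & 1)  { *d = *s; }                              // bf 31
//      }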

// Handle short reverse operands.
// cr0 = bne if bit 27 of length is set
// cr7 = bits 28-31 of length

LShortReverse:
        add     rs,rs,rc                // adjust ptrs for reverse move
        add     rd,rd,rc
        beq     LShortReverse16         // quadword to move?
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        lwz     w1,-4(rs)
        lwzu    w2,-8(rs)
        stw     w1,-4(rd)
        stwu    w2,-8(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f                   // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
        stb     w1,-1(rd)
        blr


// Medium and long operands. Use Altivec if long enough, else scalar loops.
// w1 = (rd-rs), used to check for alignment
// cr1 = blt iff we must move reverse

        .align  4
LMedium:
        dcbtst  0,rd                    // touch in destination
        cmplwi  cr7,rc,kLong            // long enough for vectors?
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  r0,w1,0,0x7             // check relative 8-byte alignment
        andi.   w6,w3,7                 // w6 <- #bytes to 8-byte align destination
        blt     cr1,LMediumReverse      // handle reverse moves
        rlwinm  w4,w3,0,0x1F            // w4 <- #bytes to 32-byte align destination
        cmpwi   cr6,r0,0                // set cr6 beq if relatively aligned
        bge     cr7,LFwdLong            // long enough for vectors

// Medium length: use scalar loops.
// w6/cr0 = #bytes to 8-byte align destination
// cr6 = beq if relatively doubleword aligned

        sub     rc,rc,w6                // decrement length remaining
        beq     1f                      // skip if dest already doubleword aligned
        mtxer   w6                      // set up count for move
        lswx    w1,0,rs                 // move w6 bytes to align destination
        stswx   w1,0,rd
        add     rs,rs,w6                // bump ptrs past
        add     rd,rd,w6
1:
        srwi    r0,rc,4                 // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc                 // save remaining byte count here for LShort16
        mtctr   r0                      // set up 16-byte loop
        bne     cr6,3f                  // source not doubleword aligned
        b       2f

        .align  4
2:                                      // loop over 16-byte aligned chunks
        lfd     f0,0(rs)
        lfd     f1,8(rs)
        addi    rs,rs,16
        stfd    f0,0(rd)
        stfd    f1,8(rd)
        addi    rd,rd,16
        bdnz    2b

        b       LShort16

        .align  4
3:                                      // loop over 16-byte unaligned chunks
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16
        bdnz    3b

        b       LShort16

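// In C terms, the medium forward path above amounts to the sketch below: peel
// off the bytes needed to 8-byte align the destination (done with lswx/stswx),
// then move 16 bytes per iteration, using doubleword FP copies when source and
// destination are mutually 8-byte aligned and word copies otherwise, and let
// LShort16 finish the 0-15 byte tail. The helper name is invented:
//
//      #include <stdint.h>
//      #include <string.h>
//
//      static void medium_copy_model(unsigned char *d, const unsigned char *s,
//                                    size_t len)           // 32 <= len < 96
//      {
//          size_t align = (0 - (uintptr_t)d) & 7;   // neg w3,rd / andi. w6,w3,7
//          int rel8 = (((uintptr_t)d - (uintptr_t)s) & 7) == 0;
//          memcpy(d, s, align);  d += align;  s += align;  len -= align;
//          for (size_t chunks = len >> 4; chunks; chunks--) {
//              if (rel8) { memcpy(d, s, 8); memcpy(d + 8, s + 8, 8); }  // lfd/stfd
//              else      { memcpy(d, s, 4);         memcpy(d + 4,  s + 4,  4);
//                          memcpy(d + 8, s + 8, 4); memcpy(d + 12, s + 12, 4); }
//              d += 16;  s += 16;
//          }
//          memcpy(d, s, len & 15);                  // tail, as in LShort16
//      }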

// Vector loops. First, we must 32-byte align the destination.
// w1 = (rd-rs), used to check for reverse and alignment
// w4 = #bytes to 32-byte align destination
// rc = long enough for at least one vector loop

LFwdLong:
        cmpwi   w4,0                    // dest already aligned?
        sub     rc,rc,w4                // adjust length
        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF             // relatively 16-byte aligned?
        mtcrf   0x02,w4                 // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6                 // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        beq     LFwdAligned             // dest is already aligned

// 32-byte align destination.

        bf      31,1f                   // byte to move?
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
1:
        bf      30,2f                   // halfword?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      28,4f                   // doubleword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        addi    rs,rs,8
        stw     w1,0(rd)
        stw     w2,4(rd)
        addi    rd,rd,8
4:
        bf      27,LFwdAligned          // quadword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16


// Destination is 32-byte aligned.
// r0 = count of 64-byte chunks to move (not 0)
// rd = 32-byte aligned
// rc = bytes remaining
// cr5 = beq if source is 16-byte aligned
// We set up many registers:
// ctr = number of 64-byte chunks to move
// r0/cr0 = leftover QWs to move
// cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// cr6 = beq if leftover byte count is 0
// rv = original value of vrsave
// c16 etc = loaded

LFwdAligned:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w1,rv,0xFF00            // we use v0-v7
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1               // update mask
        li      c16,16                  // get constants used in lvx/stvx
        li      c32,32
        li      c48,48
        li      c96,96
        li      c128,128
        bne     cr5,LForwardVecUnal     // handle unaligned operands
        b       1f

        .align  4
1:                                      // loop over 64-byte chunks
        dcbt    c96,rs
        dcbt    c128,rs
        lvx     v1,0,rs
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        addi    rs,rs,64
        dcba    0,rd                    // patched to NOP on some machines
        stvx    v1,0,rd
        stvx    v2,c16,rd
        dcba    c32,rd                  // patched to NOP on some machines
        stvx    v3,c32,rd
        stvx    v4,c48,rd
        addi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords (1-3)
        lvx     v1,0,rs
        addi    rs,rs,16
        stvx    v1,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr

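// The setup at LFwdAligned decomposes the remaining length exactly as in this
// rough C sketch (the dcbt prefetches and dcba hints have no C equivalent and
// are omitted; the helper name is invented):
//
//      #include <string.h>
//
//      static void fwd_aligned_model(unsigned char *d, const unsigned char *s,
//                                    size_t len)    // d is 32-byte aligned here
//      {
//          size_t chunks64 = len >> 6;              // ctr: 64-byte chunks (>= 1)
//          size_t quads    = (len >> 4) & 3;        // r0:  leftover quadwords, 0-3
//          size_t tail     = len & 15;              // cr7: last 0-15 bytes
//          while (chunks64--) { memcpy(d, s, 64); d += 64; s += 64; }
//          while (quads--)    { memcpy(d, s, 16); d += 16; s += 16; }
//          memcpy(d, s, tail);                      // handled by LShort16
//      }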

// Long, forward, unaligned vector loop.

LForwardVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
        b       1f

        .align  4                       // align inner loops
1:                                      // loop over 64-byte chunks
        lvx     v2,c16,rs
        dcbt    c96,rs
        lvx     v3,c32,rs
        dcbt    c128,rs
        lvx     v4,c48,rs
        addi    rs,rs,64
        vperm   vw,v1,v2,vp
        lvx     v1,0,rs
        vperm   vx,v2,v3,vp
        dcba    0,rd                    // patched to NOP on some machines
        stvx    vw,0,rd
        vperm   vy,v3,v4,vp
        stvx    vx,c16,rd
        vperm   vw,v4,v1,vp
        dcba    c32,rd                  // patched to NOP on some machines
        stvx    vy,c32,rd
        stvx    vw,c48,rd
        addi    rd,rd,64
        bdnz    1b

        beq-    4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords
        lvx     v2,c16,rs
        addi    rs,rs,16
        vperm   vx,v1,v2,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr

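// The lvsl/vperm trick above keeps every lvx and stvx aligned even though the
// source is not: vp selects a 16-byte window, starting at (rs & 15), out of
// the 32-byte concatenation of two adjacent aligned source quadwords, and each
// new quadword is reused as the low half of the next pair (kept live in v1).
// A rough C model of one such merge step (the function name is invented):
//
//      #include <stdint.h>
//      #include <string.h>
//
//      // lo/hi are two adjacent 16-byte quadwords read at aligned addresses;
//      // shift is (src & 15). The result is the 16 unaligned source bytes,
//      // which is what vperm produces with the lvsl-generated control vector.
//      static void merge_quadwords(uint8_t out[16], const uint8_t lo[16],
//                                  const uint8_t hi[16], unsigned shift)
//      {
//          uint8_t pair[32];
//          memcpy(pair, lo, 16);
//          memcpy(pair + 16, hi, 16);
//          memcpy(out, pair + shift, 16);
//      }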

// Medium and long, reverse moves. We use Altivec if the operands are long enough,
// else a lwz/stw loop.
// w1 = (rd-rs), used to check for reverse and alignment
// cr7 = bge if long

LMediumReverse:
        add     rd,rd,rc                // point to end of operands
        add     rs,rs,rc
        andi.   w4,rd,0x1F              // w4 <- #bytes to 32-byte align destination
        rlwinm  w6,rd,0,0x3             // w6 <- #bytes to 4-byte align destination
        bge     cr7,LLongReverse        // long enough for vectors

// Scalar loop.
// w6 = #bytes to 4-byte align destination

        sub     rc,rc,w6                // decrement length remaining
        mtxer   w6                      // set up count for move
        sub     rs,rs,w6                // back up ptrs
        sub     rd,rd,w6
        srwi    r0,rc,4                 // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc                 // set remaining byte count here for LShortReverse16
        lswx    w1,0,rs                 // move w6 bytes to align destination
        stswx   w1,0,rd
        mtctr   r0                      // set up 16-byte loop
        b       1f

        .align  4
1:                                      // loop over 16-byte aligned chunks
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)
        bdnz    1b

        b       LShortReverse16


// Reverse vector loops. First, we must 32-byte align the destination.
// w1 = (rd-rs), used to check for reverse and alignment
// w4/cr0 = #bytes to 32-byte align destination
// rc = long enough for at least one vector loop

LLongReverse:
        sub     rc,rc,w4                // adjust length
        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF             // relatively 16-byte aligned?
        mtcrf   0x02,w4                 // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6                 // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        beq     LReverseAligned         // dest is already aligned

// 32-byte align destination.

        bf      31,1f                   // byte to move?
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
1:
        bf      30,2f                   // halfword?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      28,4f                   // doubleword?
        lwz     w1,-4(rs)
        lwzu    w2,-8(rs)
        stw     w1,-4(rd)
        stwu    w2,-8(rd)
4:
        bf      27,LReverseAligned      // quadword?
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)

// Destination is 32-byte aligned.
// r0 = count of 64-byte chunks to move (not 0)
// rd = 32-byte aligned
// rc = bytes remaining
// cr5 = beq if source is 16-byte aligned
// We set up many registers:
// ctr = number of 64-byte chunks to move
// r0/cr0 = leftover QWs to move
// cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// cr6 = beq if leftover byte count is 0
// rv = original value of vrsave
// cm1 etc = loaded

LReverseAligned:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w1,rv,0xFF00            // we use v0-v7
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1               // update mask
        li      cm1,-1                  // get constants used in lvx/stvx
        li      cm17,-17
        li      cm33,-33
        li      cm49,-49
        li      cm97,-97
        li      cm129,-129
        bne     cr5,LReverseVecUnal     // handle unaligned operands
        b       1f

        .align  4                       // align inner loops
1:                                      // loop over 64-byte chunks
        dcbt    cm97,rs
        dcbt    cm129,rs
        lvx     v1,cm1,rs
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        stvx    v1,cm1,rd
        stvx    v2,cm17,rd
        stvx    v3,cm33,rd
        stvx    v4,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords (1-3)
        lvx     v1,cm1,rs
        subi    rs,rs,16
        stvx    v1,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr


// Long, reverse, unaligned vector loop.

LReverseVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
        b       1f

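// Note that the vperm operand order in this loop is swapped relative to the
// forward loop (v2,v1 rather than v1,v2): the quadwords are fetched at
// descending addresses, so v2 holds the lower-addressed half of each pair, and
// listing it first keeps the concatenation in ascending byte order so that the
// same lvsl-derived vp (rs & 15 never changes) still selects the right window.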
        .align  4                       // align the inner loops
1:                                      // loop over 64-byte chunks
        lvx     v2,cm17,rs
        dcbt    cm97,rs
        lvx     v3,cm33,rs
        dcbt    cm129,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        vperm   vw,v2,v1,vp
        lvx     v1,cm1,rs
        vperm   vx,v3,v2,vp
        stvx    vw,cm1,rd
        vperm   vy,v4,v3,vp
        stvx    vx,cm17,rd
        vperm   vw,v1,v4,vp
        stvx    vy,cm33,rd
        stvx    vw,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     3f                      // no leftover quadwords
        mtctr   r0
2:                                      // loop over 1-3 quadwords
        lvx     v2,cm17,rs
        subi    rs,rs,16
        vperm   vx,v2,v1,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,cm1,rd
        subi    rd,rd,16
        bdnz    2b
3:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr

        COMMPAGE_DESCRIPTOR(bcopy_g4,_COMM_PAGE_BCOPY,kHasAltivec,k64Bit,kCommPageDCBA+kCommPage32)