/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the
 * License may not be used to create, or enable the creation or
 * redistribution of, unlawful or unlicensed copies of an Apple operating
 * system, or to circumvent, violate, or enable the circumvention or
 * violation of, any terms of an Apple operating system software license
 * agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, tuned for G4. The inner loops use DCBA to avoid
 * reading destination cache lines. Only the 7450 actually benefits from
 * this, and then only in the cold-cache case. On 7400s and 7455s, we
 * patch the DCBAs into NOPs.
 *
 * Register usage. Note we use R2, so this code will not run in a PEF/CFM
 * environment. Note also the rather delicate way we assign multiple uses
 * to the same register. Beware.
 *
 *   r0  = "w7" or temp (NB: cannot use r0 for any constant such as "c16")
 *   r2  = "w8" or vrsave ("rv")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r9  = "w4", or "cm1"
 *   r10 = "w5", "c96", or "cm97"
 *   r11 = "w6", "c128", or "cm129"
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 *   v1-v4 = qw's loaded from source
 *   v5-v7 = permuted qw's ("vw", "vx", "vy")
 */
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r2

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9
#define w5      r10
#define w6      r11
#define w7      r0
#define w8      r2

#define c16     r6
#define cm17    r6
#define c32     r7
#define cm33    r7
#define c48     r8
#define cm49    r8
#define cm1     r9
#define c96     r10
#define cm97    r10
#define c128    r11
#define cm129   r11

#define vp      v0
#define vw      v5
#define vx      v6
#define vy      v7

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text

#define kMedium         32      // too long for inline loopless code
#define kLong           96      // long enough to justify use of Altivec


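// The two thresholds above pick one of three strategies. A hedged C sketch of
// the dispatch (illustration only; the helper names are hypothetical):
//
//      #include <stddef.h>
//
//      void copy_short(void *d, const void *s, size_t n);   // inline, loopless
//      void copy_medium(void *d, const void *s, size_t n);  // scalar loops
//      void copy_long(void *d, const void *s, size_t n);    // Altivec loops
//
//      static void copy_dispatch(void *d, const void *s, size_t n) {
//          if (n < 32)                 // kMedium
//              copy_short(d, s, n);
//          else if (n < 96)            // kLong
//              copy_medium(d, s, n);
//          else
//              copy_long(d, s, n);
//      }
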
// Main entry points.

        .align  5
bcopy_g4:                               // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kMedium              // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        mr      rd,r4                   // start to move registers to canonic spot
        mr      rs,r3
        blt+    LShort                  // handle short operands
        dcbt    0,r3                    // touch in first line of source
        b       LMedium                 // join medium/long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

        .align  5
Lmemcpy_g4:                             // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_g4:                            // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kMedium              // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        dcbt    0,r4                    // touch in the first line of source
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge-    LMedium                 // handle medium or long operands

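// The single unsigned compare above decides copy direction: because (rd-rs)
// wraps modulo 2^32, "(rd-rs) < rc" is true exactly when the destination
// starts inside the source region, i.e. when a forward copy would clobber
// not-yet-read bytes. A hedged C sketch of the same test (illustration only,
// not part of the build):
//
//      #include <stdint.h>
//      #include <stddef.h>
//
//      int must_copy_backward(const void *src, void *dst, size_t len) {
//          // unsigned wraparound makes one compare cover both the
//          // "dst below src" and "dst past the end of src" (safe) cases
//          return (uintptr_t)dst - (uintptr_t)src < len;
//      }
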
// Handle short operands.

LShort:
        andi.   r0,rc,0x10              // test bit 27 separately (faster on G4)
        mtcrf   0x01,rc                 // put length bits 28-31 in cr7
        blt-    cr1,LShortReverse

// Forward short operands. This is the most frequent case, so it is inline.

        beq     LShort16                // quadword to move?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        addi    rs,rs,8
        stw     w1,0(rd)
        stw     w2,4(rd)
        addi    rd,rd,8
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f                   // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31                      // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr

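// The mtcrf/bf sequence above peels off the length's low bits one at a time:
// bit 27 selects a 16-byte block, bits 28-31 select 8-, 4-, 2-, and 1-byte
// tails. A hedged C sketch of the same idea (illustration only; the sketch
// assumes non-overlapping buffers, while the real code's load-then-store
// groups also tolerate forward-safe overlap):
//
//      #include <string.h>
//      #include <stddef.h>
//
//      static void copy_short_forward(char *d, const char *s, size_t len) {
//          if (len & 16) { memcpy(d, s, 16); d += 16; s += 16; }
//          if (len & 8)  { memcpy(d, s, 8);  d += 8;  s += 8;  }
//          if (len & 4)  { memcpy(d, s, 4);  d += 4;  s += 4;  }
//          if (len & 2)  { memcpy(d, s, 2);  d += 2;  s += 2;  }
//          if (len & 1)  { *d = *s; }
//      }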

// Handle short reverse operands.
//      cr0 = bne if bit 27 of length is set
//      cr7 = bits 28-31 of length

LShortReverse:
        add     rs,rs,rc                // adjust ptrs for reverse move
        add     rd,rd,rc
        beq     LShortReverse16         // quadword to move?
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        lwz     w1,-4(rs)
        lwzu    w2,-8(rs)
        stw     w1,-4(rd)
        stwu    w2,-8(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f                   // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
        stb     w1,-1(rd)
        blr


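// In the reverse path the pointers are first advanced one past the end of each
// operand, so the same bit tests run with negative displacements and the
// update-form loads/stores (lwzu/stwu etc.) walk the pointers back down.
// A hedged C sketch of that shape (illustration only; the real code peels
// 16/8/4/2/1-byte chunks via the cr bits rather than looping bytewise):
//
//      #include <stddef.h>
//
//      static void copy_short_reverse(unsigned char *d, const unsigned char *s,
//                                     size_t len) {
//          d += len;  s += len;        // point one past the end
//          while (len--)
//              *--d = *--s;
//      }
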
// Medium and long operands. Use Altivec if long enough, else scalar loops.
//      w1 = (rd-rs), used to check for alignment
//      cr1 = blt iff we must move reverse

        .align  4
LMedium:
        dcbtst  0,rd                    // touch in destination
        cmplwi  cr7,rc,kLong            // long enough for vectors?
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  r0,w1,0,0x7             // check relative 8-byte alignment
        andi.   w6,w3,7                 // w6 <- #bytes to 8-byte align destination
        blt     cr1,LMediumReverse      // handle reverse moves
        rlwinm  w4,w3,0,0x1F            // w4 <- #bytes to 32-byte align destination
        cmpwi   cr6,r0,0                // set cr6 beq if relatively aligned
        bge     cr7,LFwdLong            // long enough for vectors

// Medium length: use scalar loops.
//      w6/cr0 = #bytes to 8-byte align destination
//      cr6 = beq if relatively doubleword aligned

        sub     rc,rc,w6                // decrement length remaining
        beq     1f                      // skip if dest already doubleword aligned
        mtxer   w6                      // set up count for move
        lswx    w1,0,rs                 // move w6 bytes to align destination
        stswx   w1,0,rd
        add     rs,rs,w6                // bump ptrs past
        add     rd,rd,w6
1:
        srwi    r0,rc,4                 // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc                 // save remaining byte count here for LShort16
        mtctr   r0                      // set up 16-byte loop
        bne     cr6,3f                  // not relatively doubleword aligned, use word loop
        b       2f

        .align  4
2:                                      // loop over 16-byte aligned chunks
        lfd     f0,0(rs)
        lfd     f1,8(rs)
        addi    rs,rs,16
        stfd    f0,0(rd)
        stfd    f1,8(rd)
        addi    rd,rd,16
        bdnz    2b

        b       LShort16

        .align  4
3:                                      // loop over 16-byte unaligned chunks
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16
        bdnz    3b

        b       LShort16


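// The medium path first copies 0-7 bytes with lswx/stswx to 8-byte align the
// destination, then moves 16 bytes per iteration: through the FPRs as two
// doublewords when source and destination are relatively doubleword aligned,
// else as four words. A hedged C sketch of that decision (illustration only;
// the helpers are hypothetical):
//
//      #include <stdint.h>
//      #include <stddef.h>
//
//      void copy_bytes(void *d, const void *s, size_t n);            // hypothetical
//      void copy_16s_with_fp_doubles(void *d, const void *s, size_t n);
//      void copy_16s_with_words(void *d, const void *s, size_t n);
//
//      static void copy_medium_forward(unsigned char *d, const unsigned char *s,
//                                      size_t len) {
//          size_t head = (8 - ((uintptr_t)d & 7)) & 7;     // align dest to 8
//          copy_bytes(d, s, head);  d += head;  s += head;  len -= head;
//          if ((((uintptr_t)d - (uintptr_t)s) & 7) == 0)   // relatively aligned?
//              copy_16s_with_fp_doubles(d, s, len & ~(size_t)15);
//          else
//              copy_16s_with_words(d, s, len & ~(size_t)15);
//          copy_bytes(d + (len & ~(size_t)15), s + (len & ~(size_t)15), len & 15);
//      }
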
// Vector loops. First, we must 32-byte align the destination.
//      w1 = (rd-rs), used to check for reverse and alignment
//      w4 = #bytes to 32-byte align destination
//      rc = long enough for at least one vector loop

LFwdLong:
        cmpwi   w4,0                    // dest already aligned?
        sub     rc,rc,w4                // adjust length
        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF             // relatively 16-byte aligned?
        mtcrf   0x02,w4                 // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6                 // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        beq     LFwdAligned             // dest is already aligned

// 32-byte align destination.

        bf      31,1f                   // byte to move?
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
1:
        bf      30,2f                   // halfword?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      28,4f                   // doubleword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        addi    rs,rs,8
        stw     w1,0(rd)
        stw     w2,4(rd)
        addi    rd,rd,8
4:
        bf      27,LFwdAligned          // quadword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16


// Destination is 32-byte aligned.
//      r0 = count of 64-byte chunks to move (not 0)
//      rd = 32-byte aligned
//      rc = bytes remaining
//      cr5 = beq if source is 16-byte aligned
// We set up many registers:
//      ctr = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
//      c16 etc = loaded

LFwdAligned:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w1,rv,0xFF00            // we use v0-v7
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1               // update mask
        li      c16,16                  // get constants used in lvx/stvx
        li      c32,32
        li      c48,48
        li      c96,96
        li      c128,128
        bne     cr5,LForwardVecUnal     // handle unaligned operands
        b       1f

        .align  4
1:                                      // loop over 64-byte chunks
        dcbt    c96,rs
        dcbt    c128,rs
        lvx     v1,0,rs
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        addi    rs,rs,64
        dcba    0,rd                    // patched to NOP on some machines
        stvx    v1,0,rd
        stvx    v2,c16,rd
        dcba    c32,rd                  // patched to NOP on some machines
        stvx    v3,c32,rd
        stvx    v4,c48,rd
        addi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords (1-3)
        lvx     v1,0,rs
        addi    rs,rs,16
        stvx    v1,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr

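// When source and destination are relatively 16-byte aligned (and the
// destination is 32-byte aligned), each iteration streams 64 bytes through
// v1-v4 while touching ahead with dcbt and pre-claiming destination lines
// with dcba. A hedged sketch of the aligned loop using AltiVec C intrinsics
// (illustration only, not the shipped implementation):
//
//      #include <altivec.h>
//
//      static void copy_64byte_chunks_aligned(unsigned char *d,
//                                             const unsigned char *s,
//                                             unsigned long chunks) {
//          while (chunks--) {
//              __builtin_prefetch(s + 96);         // ~ dcbt c96,rs
//              __builtin_prefetch(s + 128);        // ~ dcbt c128,rs
//              vector unsigned char v1 = vec_ld(0,  s);
//              vector unsigned char v2 = vec_ld(16, s);
//              vector unsigned char v3 = vec_ld(32, s);
//              vector unsigned char v4 = vec_ld(48, s);
//              s += 64;
//              vec_st(v1, 0,  d);                  // dcba hints omitted here
//              vec_st(v2, 16, d);
//              vec_st(v3, 32, d);
//              vec_st(v4, 48, d);
//              d += 64;
//          }
//      }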

// Long, forward, unaligned vector loop.

LForwardVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
        b       1f

        .align  4                       // align inner loops
1:                                      // loop over 64-byte chunks
        lvx     v2,c16,rs
        dcbt    c96,rs
        lvx     v3,c32,rs
        dcbt    c128,rs
        lvx     v4,c48,rs
        addi    rs,rs,64
        vperm   vw,v1,v2,vp
        lvx     v1,0,rs
        vperm   vx,v2,v3,vp
        dcba    0,rd                    // patched to NOP on some machines
        stvx    vw,0,rd
        vperm   vy,v3,v4,vp
        stvx    vx,c16,rd
        vperm   vw,v4,v1,vp
        dcba    c32,rd                  // patched to NOP on some machines
        stvx    vy,c32,rd
        stvx    vw,c48,rd
        addi    rd,rd,64
        bdnz    1b

        beq-    4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords
        lvx     v2,c16,rs
        addi    rs,rs,16
        vperm   vx,v1,v2,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


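// The unaligned loop never issues a misaligned load: lvx always fetches the
// enclosing aligned quadword, and vperm shifts each adjacent pair of source
// quadwords by the source's misalignment (captured once by lvsl) to rebuild
// the bytes destined for each aligned store. A hedged sketch of the idea with
// AltiVec intrinsics (illustration only):
//
//      #include <altivec.h>
//
//      static void copy_quads_unaligned(unsigned char *d,        // 16-byte aligned
//                                       const unsigned char *s,  // arbitrary
//                                       unsigned long quads) {
//          vector unsigned char vp   = vec_lvsl(0, s);   // permute control
//          vector unsigned char prev = vec_ld(0, s);     // first aligned quad
//          while (quads--) {
//              vector unsigned char next = vec_ld(16, s);
//              vec_st(vec_perm(prev, next, vp), 0, d);   // merge the two halves
//              prev = next;
//              s += 16;
//              d += 16;
//          }
//      }
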
// Medium and long, reverse moves. We use Altivec if the operands are long enough,
// else a lwz/stw loop.
//      w1 = (rd-rs), used to check for reverse and alignment
//      cr7 = bge if long

LMediumReverse:
        add     rd,rd,rc                // point to end of operands
        add     rs,rs,rc
        andi.   w4,rd,0x1F              // w4 <- #bytes to 32-byte align destination
        rlwinm  w6,rd,0,0x3             // w6 <- #bytes to 4-byte align destination
        bge     cr7,LLongReverse        // long enough for vectors

// Scalar loop.
//      w6 = #bytes to 4-byte align destination

        sub     rc,rc,w6                // decrement length remaining
        mtxer   w6                      // set up count for move
        sub     rs,rs,w6                // back up ptrs
        sub     rd,rd,w6
        srwi    r0,rc,4                 // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc                 // set remaining byte count here for LShortReverse16
        lswx    w1,0,rs                 // move w6 bytes to align destination
        stswx   w1,0,rd
        mtctr   r0                      // set up 16-byte loop
        b       1f

        .align  4
1:                                      // loop over 16-byte aligned chunks
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)
        bdnz    1b

        b       LShortReverse16


// Reverse vector loops. First, we must 32-byte align the destination.
//      w1 = (rd-rs), used to check for reverse and alignment
//      w4/cr0 = #bytes to 32-byte align destination
//      rc = long enough for at least one vector loop

LLongReverse:
        sub     rc,rc,w4                // adjust length
        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF             // relatively 16-byte aligned?
        mtcrf   0x02,w4                 // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6                 // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        beq     LReverseAligned         // dest is already aligned

// 32-byte align destination.

        bf      31,1f                   // byte to move?
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
1:
        bf      30,2f                   // halfword?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      28,4f                   // doubleword?
        lwz     w1,-4(rs)
        lwzu    w2,-8(rs)
        stw     w1,-4(rd)
        stwu    w2,-8(rd)
4:
        bf      27,LReverseAligned      // quadword?
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)

// Destination is 32-byte aligned.
//      r0 = count of 64-byte chunks to move (not 0)
//      rd = 32-byte aligned
//      rc = bytes remaining
//      cr5 = beq if source is 16-byte aligned
// We set up many registers:
//      ctr = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
//      cm1 etc = loaded

LReverseAligned:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w1,rv,0xFF00            // we use v0-v7
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1               // update mask
        li      cm1,-1                  // get constants used in lvx/stvx
        li      cm17,-17
        li      cm33,-33
        li      cm49,-49
        li      cm97,-97
        li      cm129,-129
        bne     cr5,LReverseVecUnal     // handle unaligned operands
        b       1f

        .align  4                       // align inner loops
1:                                      // loop over 64-byte chunks
        dcbt    cm97,rs
        dcbt    cm129,rs
        lvx     v1,cm1,rs
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        stvx    v1,cm1,rd
        stvx    v2,cm17,rd
        stvx    v3,cm33,rd
        stvx    v4,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords (1-3)
        lvx     v1,cm1,rs
        subi    rs,rs,16
        stvx    v1,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr


// Long, reverse, unaligned vector loop.

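// As in the forward case, lvsl/vperm realign the source in registers rather
// than issuing misaligned loads. Because the copy walks downward, v1 carries
// the previously loaded, higher-addressed quadword while each new lvx brings
// in the next one below it, so the vperm operands appear as (v2,v1): still
// lower-addressed quadword first, with the carried value on the right.
// A hedged intrinsic sketch of the reverse merge (illustration only; assumes
// the source end is not 16-byte aligned, else the aligned loop applies):
//
//      #include <altivec.h>
//
//      static void copy_quads_unaligned_rev(unsigned char *d_end,  // 16-byte aligned end
//                                           const unsigned char *s_end,
//                                           unsigned long quads) {
//          vector unsigned char vp   = vec_lvsl(0, s_end);  // permute control
//          vector unsigned char high = vec_ld(-1, s_end);   // quad just below the end
//          while (quads--) {
//              vector unsigned char low = vec_ld(-17, s_end);   // next quad down
//              vec_st(vec_perm(low, high, vp), -16, d_end);     // one aligned store
//              high = low;
//              s_end -= 16;
//              d_end -= 16;
//          }
//      }
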
LReverseVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
        b       1f

        .align  4                       // align the inner loops
1:                                      // loop over 64-byte chunks
        lvx     v2,cm17,rs
        dcbt    cm97,rs
        lvx     v3,cm33,rs
        dcbt    cm129,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        vperm   vw,v2,v1,vp
        lvx     v1,cm1,rs
        vperm   vx,v3,v2,vp
        stvx    vw,cm1,rd
        vperm   vy,v4,v3,vp
        stvx    vx,cm17,rd
        vperm   vw,v1,v4,vp
        stvx    vy,cm33,rd
        stvx    vw,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     3f                      // no leftover quadwords
        mtctr   r0
2:                                      // loop over 1-3 quadwords
        lvx     v2,cm17,rs
        subi    rs,rs,16
        vperm   vx,v2,v1,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,cm1,rd
        subi    rd,rd,16
        bdnz    2b
3:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr

        COMMPAGE_DESCRIPTOR(bcopy_g4,_COMM_PAGE_BCOPY,kHasAltivec,k64Bit,kCommPageDCBA+kCommPage32)
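// The descriptor above registers this routine for the _COMM_PAGE_BCOPY slot.
// As the surrounding commpage machinery uses these fields, the trailing
// arguments appear to name capability bits the kernel checks when picking a
// variant (here: Altivec required, 64-bit processors excluded) plus
// special-handling flags; kCommPageDCBA is what allows the DCBA instructions
// above to be patched into NOPs on processors where DCBA does not help, as
// noted in the header comment.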