// apple/xnu: osfmk/ppc/commpage/bcopy_970.s
/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 * Register usage.  Note the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *   r0  = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r9  = "w4", or "cm1"
 *   r10 = vrsave ("rv")
 *   r11 = unused
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 *   v1-v8  = qw's loaded from source
 *   v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r10

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9

#define c16     r6
#define cm17    r6
#define c32     r7
#define cm33    r7
#define c48     r8
#define cm49    r8
#define cm1     r9

#define vp      v0
#define vw      v9
#define vx      v10
#define vy      v11
#define vz      v12

#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage.  This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done.  For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas.  It is perhaps surprising how well in practice
 * this simple method works.
 */
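// For example, when the kernel populates the 64-bit commpage the port executes a word
// compare such as "cmplwi rc,kShort" as "cmpldi rc,kShort", and "srwi r0,rc,7" as
// "srdi r0,rc,7", while mode-invariant forms like "andi." and "rlwinm." are left unchanged.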

#define kShort          64
#define kVeryLong       (128*1024)


// Main entry points.

        .align  5
bcopy_970:                              // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        mr      rd,r4                   // move registers to canonic spot
        mr      rs,r3
        blt     LShort                  // handle short operands
        dcbt    0,rs                    // touch in the first line of source
        dcbtst  0,rd                    // touch in destination
        b       LLong1                  // join long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

        .align  5
Lmemcpy_970:                            // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_970:                           // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge     LLong0                  // handle long operands

// Handle short operands.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LShort:
        cmplw   cr1,w1,rc               // set cr1 blt if we must move reverse
        mtcrf   0x02,rc                 // move length to cr6 and cr7 one at a time
        mtcrf   0x01,rc
        blt--   cr1,LShortReverse

// Forward short operands.  This is the most frequent case, so it is inline.

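// The two mtcrf's above copy the low-order bits of rc into CR6-CR7, so the "bf 26..31"
// tests below peel off 32-, 16-, 8-, 4-, 2-, and 1-byte pieces in turn.  For instance,
// a length of 45 (0b101101) moves 32+8+4+1 bytes and skips the 16- and 2-byte steps.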
        bf      26,0f                   // 32-byte chunk to move?
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
0:
LShort32:
        bf      27,1f                   // quadword to move?
        ld      w1,0(rs)
        ld      w3,8(rs)
        addi    rs,rs,16
        std     w1,0(rd)
        std     w3,8(rd)
        addi    rd,rd,16
1:
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f                   // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31                      // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr


// Handle short reverse operands.
//      cr = length in bits 26-31

LShortReverse:
        add     rs,rs,rc                // adjust ptrs for reverse move
        add     rd,rd,rc
        bf      26,0f                   // 32 bytes to move?
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ldu     w4,-32(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        stdu    w4,-32(rd)
0:
        bf      27,1f                   // quadword to move?
        ld      w1,-8(rs)
        ldu     w2,-16(rs)
        std     w1,-8(rd)
        stdu    w2,-16(rd)
1:
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        ldu     w1,-8(rs)
        stdu    w1,-8(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f                   // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
        stb     w1,-1(rd)
        blr


// Long operands, use Altivec in most cases.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LLong0:                                 // entry from memmove()
        dcbt    0,rs                    // touch in source
        dcbtst  0,rd                    // touch in destination
LLong1:                                 // entry from bcopy() with operands already touched in
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  w2,w1,0,0xF             // 16-byte aligned? (w2==0 if so)
        andi.   w4,w3,0xF               // w4 <- #bytes to 16-byte align destination
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        blt--   cr1,LLongReverse        // handle reverse moves
        sub     rc,rc,w4                // adjust length for aligning destination
        srwi    r0,rc,7                 // get #cache lines to copy (may be 0)
        cmpwi   cr1,r0,0                // set cr1 on #chunks
        beq     LFwdAligned             // dest is already aligned

// 16-byte align destination.

        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest (nonzero)
        bf      31,1f                   // byte to move?
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
1:
        bf      30,2f                   // halfword?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      28,LFwdAligned          // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8


// Forward, destination is 16-byte aligned.  There are five cases:
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls out all the stops.  This is the fastest
//     case for cold-cache operands, as any this long will likely be.
//  2. If length>=128 and source is 16-byte aligned, then use the
//     lvx/stvx loop over 128-byte chunks.  This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//  3. If length>=128 and source is not 16-byte aligned, then use the
//     lvx/vperm/stvx loop over 128-byte chunks.
//  4. If length<128 and source is 8-byte aligned, then use the
//     ld/std loop over 32-byte chunks.
//  5. If length<128 and source is not 8-byte aligned, then use the
//     lvx/vperm/stvx loop over 32-byte chunks.  This is the slowest case.
// Registers at this point:
//  r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//      rs = alignment unknown
//      rd = 16-byte aligned
//      rc = bytes remaining
//      w2 = low 4 bits of (rd-rs), used to check alignment
//     cr5 = beq if source is also 16-byte aligned

LFwdAligned:
        andi.   w3,w2,7                 // is source at least 8-byte aligned?
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        bne     cr1,LFwdLongVectors     // at least one 128-byte chunk, so use vectors
        srwi    w1,rc,5                 // get 32-byte chunk count
        mtcrf   0x02,rc                 // move bit 27 of length to cr6 for LShort32
        mtctr   w1                      // set up 32-byte loop (w1!=0)
        beq     LFwdMedAligned          // source is 8-byte aligned, so use ld/std loop
        mfspr   rv,vrsave               // get bitmap of live vector registers
        oris    w4,rv,0xFFF8            // we use v0-v12
        li      c16,16                  // get constant used in lvx
        li      c32,32
        mtspr   vrsave,w4               // update mask
        lvx     v1,0,rs                 // prefetch 1st source quadword
        lvsl    vp,0,rs                 // get permute vector to shift left


// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.

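// How the realignment works: lvx always loads the 16-byte-aligned quadword containing
// the EA, and lvsl yields the permute control (n, n+1, ..., n+15) where n = (rs & 0xF).
// vperm of two adjacent source quadwords with that control therefore selects the 16
// consecutive bytes starting at the (unaligned) source address, which can then be
// stored with an aligned stvx.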
1:                                      // loop over 32-byte chunks
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        addi    rs,rs,32
        vperm   vx,v1,v2,vp
        vperm   vy,v2,v3,vp
        vor     v1,v3,v3                // v1 <- v3
        stvx    vx,0,rd
        stvx    vy,c16,rd
        addi    rd,rd,32
        bdnz    1b

        mtspr   vrsave,rv               // restore bitmap of live vr's
        b       LShort32


// Fewer than 128 bytes and doubleword aligned: use ld/std.

        .align  5
LFwdMedAligned:                         // loop over 32-byte chunks
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
        bdnz    LFwdMedAligned

        b       LShort32


// Forward, 128 bytes or more: use vectors.  When entered:
//      r0 = 128-byte chunks to move (>0)
//      rd = 16-byte aligned
//     cr5 = beq if source is 16-byte aligned
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//     ctr = number of 128-byte chunks to move
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
//  c16,c32,c48 = loaded

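// For example (worked from the setup below), rc = 300 decomposes as ctr = 2 chunks
// of 128 bytes, r0 = (rc>>4)&7 = 2 leftover quadwords, and rc&0xF = 12 trailing
// bytes handled by LShort16: 256 + 32 + 12 = 300.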
LFwdLongVectors:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        lis     w3,kVeryLong>>16        // cutoff for very-long-operand special case path
        cmplw   cr1,rc,w3               // very long operand?
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        bge--   cr1,LBigCopy            // handle big copies separately
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w4,rv,0xFFF8            // we use v0-v12
        rlwinm. r0,rc,28,29,31          // get number of quadword leftovers (0-7) and set cr0
        li      c16,16                  // get constants used in lvx/stvx
        mtspr   vrsave,w4               // update mask
        li      c32,32
        li      c48,48
        beq     cr5,LFwdLongAligned     // source is also 16-byte aligned, no need for vperm
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
        b       LFwdLongUnaligned


// Forward, long, unaligned vector loop.

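// Note that v1 is pre-loaded before entering the loop and re-loaded from the next
// chunk in mid-loop, so every vperm always has both halves of the straddling source
// quadword available; the final vperm of each iteration (vz) pairs v8 with that
// look-ahead quadword.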
        .align  5                       // align inner loops
LFwdLongUnaligned:                      // loop over 128-byte chunks
        addi    w4,rs,64
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        lvx     v5,0,w4
        lvx     v6,c16,w4
        vperm   vw,v1,v2,vp
        lvx     v7,c32,w4
        lvx     v8,c48,w4
        addi    rs,rs,128
        vperm   vx,v2,v3,vp
        addi    w4,rd,64
        lvx     v1,0,rs
        stvx    vw,0,rd
        vperm   vy,v3,v4,vp
        stvx    vx,c16,rd
        vperm   vz,v4,v5,vp
        stvx    vy,c32,rd
        vperm   vw,v5,v6,vp
        stvx    vz,c48,rd
        vperm   vx,v6,v7,vp
        addi    rd,rd,128
        stvx    vw,0,w4
        vperm   vy,v7,v8,vp
        stvx    vx,c16,w4
        vperm   vz,v8,v1,vp
        stvx    vy,c32,w4
        stvx    vz,c48,w4
        bdnz    LFwdLongUnaligned

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords
        lvx     v2,c16,rs
        addi    rs,rs,16
        vperm   vx,v1,v2,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Forward, long, 16-byte aligned vector loop.

        .align  5
LFwdLongAligned:                        // loop over 128-byte chunks
        addi    w4,rs,64
        lvx     v1,0,rs
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        lvx     v5,0,w4
        lvx     v6,c16,w4
        lvx     v7,c32,w4
        lvx     v8,c48,w4
        addi    rs,rs,128
        addi    w4,rd,64
        stvx    v1,0,rd
        stvx    v2,c16,rd
        stvx    v3,c32,rd
        stvx    v4,c48,rd
        stvx    v5,0,w4
        stvx    v6,c16,w4
        stvx    v7,c32,w4
        stvx    v8,c48,w4
        addi    rd,rd,128
        bdnz    LFwdLongAligned

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords (1-7)
        lvx     v1,0,rs
        addi    rs,rs,16
        stvx    v1,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Long, reverse moves.
//      rs = source
//      rd = destination
//      rc = count
//     cr5 = beq if relatively 16-byte aligned

LLongReverse:
        add     rd,rd,rc                // point to end of operands
        add     rs,rs,rc
        andi.   r0,rd,0xF               // #bytes to 16-byte align destination
        beq     2f                      // already aligned

// 16-byte align destination.

        mtctr   r0                      // set up for loop
        sub     rc,rc,r0
1:
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
        bdnz    1b

// Prepare for reverse vector loop.  When entered:
//      rd = 16-byte aligned
//     cr5 = beq if source also 16-byte aligned
// We set up many registers:
//  ctr/cr1 = number of 64-byte chunks to move (may be 0)
//   r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      cm1 = -1
//       rv = original value of vrsave

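// The reverse path works in 64-byte rather than 128-byte chunks, so for example
// rc = 100 splits into 1 chunk (64 bytes), (rc>>4)&3 = 2 leftover quadwords (32 bytes),
// and rc&0xF = 4 trailing bytes handled by LShortReverse16.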
2:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        srwi    r0,rc,6                 // get count of 64-byte chunks to move (may be 0)
        oris    w1,rv,0xFFF8            // we use v0-v12
        mtcrf   0x01,rc                 // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3 too
        cmpwi   cr1,r0,0                // set cr1 on chunk count
        mtspr   vrsave,w1               // update mask
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        li      cm1,-1                  // get constants used in lvx/stvx

        bne     cr5,LReverseVecUnal     // handle unaligned operands
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        li      cm17,-17
        li      cm33,-33
        li      cm49,-49
        b       1f

// Long, reverse 16-byte-aligned vector loop.

        .align  5                       // align inner loops
1:                                      // loop over 64-byte chunks
        lvx     v1,cm1,rs
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        stvx    v1,cm1,rd
        stvx    v2,cm17,rd
        stvx    v3,cm33,rd
        stvx    v4,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, cr7=(rc & F), cr6 set on cr7
        mtctr   r0
3:                                      // loop over remaining quadwords (1-3)
        lvx     v1,cm1,rs
        subi    rs,rs,16
        stvx    v1,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr


// Long, reverse, unaligned vector loop.
//  ctr/cr1 = number of 64-byte chunks to move (may be 0)
//   r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//       rv = original value of vrsave
//      cm1 = -1

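// As in the forward unaligned loop, lvsl/vperm stitch each stored quadword together
// from the two aligned source quadwords that straddle it; because the copy runs
// downward, each vperm pairs (lower, higher) addressed quadwords, eg "vperm vx,v2,v1,vp"
// below, and v1 is kept loaded one quadword ahead of the descending source pointer.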
LReverseVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
        li      cm17,-17
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        li      cm33,-33
        li      cm49,-49
        b       1f

        .align  5                       // align the inner loops
1:                                      // loop over 64-byte chunks
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        vperm   vx,v2,v1,vp
        lvx     v1,cm1,rs
        vperm   vy,v3,v2,vp
        stvx    vx,cm1,rd
        vperm   vz,v4,v3,vp
        stvx    vy,cm17,rd
        vperm   vx,v1,v4,vp
        stvx    vz,cm33,rd
        stvx    vx,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & F), cr6 set on cr7
        mtctr   r0
3:                                      // loop over 1-3 quadwords
        lvx     v2,cm17,rs
        subi    rs,rs,16
        vperm   vx,v2,v1,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr


// Very Big Copy Path.  Save our return address in the stack for help decoding backtraces.
// The conditions bigcopy expects are:
//      r0 = return address (also stored in caller's SF)
//      r4 = source ptr
//      r5 = length (at least several pages)
//     r12 = dest ptr

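// How the mode test below works: r2 is set to 0x40000000 and doubled with a record-form
// add, producing 0x80000000, which is negative when viewed as a 32-bit result but
// positive as a 64-bit result; hence cr0_lt is set only in 32-bit mode, and the return
// address is saved in the slot appropriate to that ABI (8(r1) vs 16(r1)).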
LBigCopy:
        lis     r2,0x4000               // r2 <- 0x40000000
        mflr    r0                      // get our return address
        add.    r2,r2,r2                // set cr0_lt if running in 32-bit mode
        stw     r0,8(r1)                // save return, assuming 32-bit mode ("crsave" if 64-bit mode)
        blta    _COMM_PAGE_BIGCOPY      // 32-bit mode, join big operand copy
        std     r0,16(r1)               // save return in correct spot for 64-bit mode
        ba      _COMM_PAGE_BIGCOPY      // then join big operand code


        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0, \
                kCommPageMTCRF+kCommPageBoth+kPort32to64)