/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the
 * License may not be used to create, or enable the creation or
 * redistribution of, unlawful or unlicensed copies of an Apple operating
 * system, or to circumvent, violate, or enable the circumvention or
 * violation of, any terms of an Apple operating system software license
 * agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 * Register usage. Note the rather delicate way we assign multiple uses
 * to the same register. Beware.
 *   r0  = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r9  = "w4" or "cm1"
 *   r10 = vrsave ("rv")
 *   r11 = unused
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 *   v1-v8  = qw's loaded from source
 *   v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r10

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9

#define c16     r6
#define cm17    r6
#define c32     r7
#define cm33    r7
#define c48     r8
#define cm49    r8
#define cm1     r9

#define vp      v0
#define vw      v9
#define vx      v10
#define vy      v11
#define vz      v12

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done. For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas. It is perhaps surprising how well in practice
 * this simple method works.
 */
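
// As an illustration of the porting model above (not code that is assembled
// here), a 32-bit sequence such as
//
//      cmplwi  rc,kShort       // word compare in 32-bit mode
//      srwi    r0,rc,7         // word shift right
//
// would appear in the 64-bit commpage as
//
//      cmpldi  rc,kShort       // doubleword compare
//      srdi    r0,rc,7         // doubleword shift right
//
// with everything else left untouched.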

#define kShort          64
#define kVeryLong       (128*1024)


// Main entry points.

        .align 5
bcopy_970: // void bcopy(const void *src, void *dst, size_t len)
        cmplwi rc,kShort // short or long?
        sub w1,r4,r3 // must move in reverse if (rd-rs)<rc
        mr rd,r4 // move registers to canonic spot
        mr rs,r3
        blt LShort // handle short operands
        dcbt 0,rs // touch in the first line of source
        dcbtst 0,rd // touch in destination
        b LLong1 // join long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

        .align 5
Lmemcpy_970: // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_970: // void* memmove(void *dst, const void *src, size_t len)
        cmplwi rc,kShort // short or long?
        sub w1,r3,r4 // must move in reverse if (rd-rs)<rc
        mr rd,r3 // must leave r3 alone, it is return value for memcpy etc
        bge LLong0 // handle long operands

// Handle short operands.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LShort:
        cmplw cr1,w1,rc // set cr1 blt if we must move reverse
        mtcrf 0x02,rc // move length to cr6 and cr7 one at a time
        mtcrf 0x01,rc
        blt-- cr1,LShortReverse

// Forward short operands. This is the most frequent case, so it is inline.
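// (The two mtcrf's above copied length bits 24-31 into cr6 and cr7, so each
// "bf" below tests one binary digit of the length: bit 26 selects the 32-byte
// chunk, 27 the quadword, 28 the doubleword, 29 the word, 30 the halfword,
// and 31 the odd byte.)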

        bf 26,0f // 32-byte chunk to move?
        ld w1,0(rs)
        ld w2,8(rs)
        ld w3,16(rs)
        ld w4,24(rs)
        addi rs,rs,32
        std w1,0(rd)
        std w2,8(rd)
        std w3,16(rd)
        std w4,24(rd)
        addi rd,rd,32
0:
LShort32:
        bf 27,1f // quadword to move?
        ld w1,0(rs)
        ld w3,8(rs)
        addi rs,rs,16
        std w1,0(rd)
        std w3,8(rd)
        addi rd,rd,16
1:
LShort16: // join here to xfer 0-15 bytes
        bf 28,2f // doubleword?
        ld w1,0(rs)
        addi rs,rs,8
        std w1,0(rd)
        addi rd,rd,8
2:
        bf 29,3f // word?
        lwz w1,0(rs)
        addi rs,rs,4
        stw w1,0(rd)
        addi rd,rd,4
3:
        bf 30,4f // halfword to move?
        lhz w1,0(rs)
        addi rs,rs,2
        sth w1,0(rd)
        addi rd,rd,2
4:
        bflr 31 // skip if no odd byte
        lbz w1,0(rs)
        stb w1,0(rd)
        blr


// Handle short reverse operands.
//      cr = length in bits 26-31

LShortReverse:
        add rs,rs,rc // adjust ptrs for reverse move
        add rd,rd,rc
        bf 26,0f // 32 bytes to move?
        ld w1,-8(rs)
        ld w2,-16(rs)
        ld w3,-24(rs)
        ldu w4,-32(rs)
        std w1,-8(rd)
        std w2,-16(rd)
        std w3,-24(rd)
        stdu w4,-32(rd)
0:
        bf 27,1f // quadword to move?
        ld w1,-8(rs)
        ldu w2,-16(rs)
        std w1,-8(rd)
        stdu w2,-16(rd)
1:
LShortReverse16: // join here to xfer 0-15 bytes and return
        bf 28,2f // doubleword?
        ldu w1,-8(rs)
        stdu w1,-8(rd)
2:
        bf 29,3f // word?
        lwzu w1,-4(rs)
        stwu w1,-4(rd)
3:
        bf 30,4f // halfword to move?
        lhzu w1,-2(rs)
        sthu w1,-2(rd)
4:
        bflr 31 // done if no odd byte
        lbz w1,-1(rs) // no update
        stb w1,-1(rd)
        blr


// Long operands, use Altivec in most cases.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LLong0: // entry from memmove()
        dcbt 0,rs // touch in source
        dcbtst 0,rd // touch in destination
LLong1: // entry from bcopy() with operands already touched in
        cmplw cr1,w1,rc // set cr1 blt iff we must move reverse
        neg w3,rd // start to compute #bytes to align destination
        rlwinm w2,w1,0,0xF // 16-byte aligned? (w2==0 if so)
        andi. w4,w3,0xF // w4 <- #bytes to 16-byte align destination
        cmpwi cr5,w2,0 // set cr5 beq if relatively 16-byte aligned
        blt-- cr1,LLongReverse // handle reverse moves
        sub rc,rc,w4 // adjust length for aligning destination
        srwi r0,rc,7 // get #cache lines to copy (may be 0)
        cmpwi cr1,r0,0 // set cr1 on #chunks
        beq LFwdAligned // dest is already aligned

// 16-byte align destination.

        mtcrf 0x01,w4 // cr7 <- #bytes to align dest (nonzero)
        bf 31,1f // byte to move?
        lbz w1,0(rs)
        addi rs,rs,1
        stb w1,0(rd)
        addi rd,rd,1
1:
        bf 30,2f // halfword?
        lhz w1,0(rs)
        addi rs,rs,2
        sth w1,0(rd)
        addi rd,rd,2
2:
        bf 29,3f // word?
        lwz w1,0(rs)
        addi rs,rs,4
        stw w1,0(rd)
        addi rd,rd,4
3:
        bf 28,LFwdAligned // doubleword?
        ld w1,0(rs)
        addi rs,rs,8
        std w1,0(rd)
        addi rd,rd,8


// Forward, destination is 16-byte aligned. There are five cases:
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls out all the stops. This is the fastest
//     case for cold-cache operands, as operands this long are likely to be.
//  2. If length>=128 and source is 16-byte aligned, then use the
//     lvx/stvx loop over 128-byte chunks. This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//  3. If length>=128 and source is not 16-byte aligned, then use the
//     lvx/vperm/stvx loop over 128-byte chunks.
//  4. If length<128 and source is 8-byte aligned, then use the
//     ld/std loop over 32-byte chunks.
//  5. If length<128 and source is not 8-byte aligned, then use the
//     lvx/vperm/stvx loop over 32-byte chunks. This is the slowest case.
// Registers at this point:
//      r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//      rs = alignment unknown
//      rd = 16-byte aligned
//      rc = bytes remaining
//      w2 = low 4 bits of (rd-rs), used to check alignment
//      cr5 = beq if source is also 16-byte aligned
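//
// For example (illustrative only): a 96-byte forward copy whose source ends up
// 8-byte aligned once the destination has been 16-byte aligned has no full
// 128-byte chunk, so it takes case 4 (the ld/std loop at LFwdMedAligned); the
// same length with a source that is not 8-byte aligned takes case 5.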

LFwdAligned:
        andi. w3,w2,7 // is source at least 8-byte aligned?
        mtcrf 0x01,rc // move leftover count to cr7 for LShort16
        bne cr1,LFwdLongVectors // at least one 128-byte chunk, so use vectors
        srwi w1,rc,5 // get 32-byte chunk count
        mtcrf 0x02,rc // move bit 27 of length to cr6 for LShort32
        mtctr w1 // set up 32-byte loop (w1!=0)
        beq LFwdMedAligned // source is 8-byte aligned, so use ld/std loop
        mfspr rv,vrsave // get bitmap of live vector registers
        oris w4,rv,0xFFF8 // we use v0-v12
        li c16,16 // get constant used in lvx
        li c32,32
        mtspr vrsave,w4 // update mask
        lvx v1,0,rs // prefetch 1st source quadword
        lvsl vp,0,rs // get permute vector to shift left
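        // NB: lvsl builds vp from the low 4 bits of rs; in the loops below,
        // vperm applied to two adjacent source quadwords through vp produces
        // one contiguous, realigned quadword of source data for the 16-byte
        // aligned destination (the usual Altivec misaligned-copy technique).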


// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.

1: // loop over 32-byte chunks
        lvx v2,c16,rs
        lvx v3,c32,rs
        addi rs,rs,32
        vperm vx,v1,v2,vp
        vperm vy,v2,v3,vp
        vor v1,v3,v3 // v1 <- v3
        stvx vx,0,rd
        stvx vy,c16,rd
        addi rd,rd,32
        bdnz 1b

        mtspr vrsave,rv // restore bitmap of live vr's
        b LShort32


// Fewer than 128 bytes and doubleword aligned: use ld/std.

        .align 5
LFwdMedAligned: // loop over 32-byte chunks
        ld w1,0(rs)
        ld w2,8(rs)
        ld w3,16(rs)
        ld w4,24(rs)
        addi rs,rs,32
        std w1,0(rd)
        std w2,8(rd)
        std w3,16(rd)
        std w4,24(rd)
        addi rd,rd,32
        bdnz LFwdMedAligned

        b LShort32


// Forward, 128 bytes or more: use vectors. When entered:
//      r0 = 128-byte chunks to move (>0)
//      rd = 16-byte aligned
//      cr5 = beq if source is 16-byte aligned
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//      ctr = number of 128-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
//      c16,c32,c48 = loaded

LFwdLongVectors:
        mfspr rv,vrsave // get bitmap of live vector registers
        lis w3,kVeryLong>>16 // cutoff for very-long-operand special case path
        cmplw cr1,rc,w3 // very long operand?
        rlwinm w3,rc,0,28,31 // move last 0-15 byte count to w3
        bge-- cr1,LBigCopy // handle big copies separately
        mtctr r0 // set up loop count
        cmpwi cr6,w3,0 // set cr6 on leftover byte count
        oris w4,rv,0xFFF8 // we use v0-v12
        rlwinm. r0,rc,28,29,31 // get number of quadword leftovers (0-7) and set cr0
        li c16,16 // get constants used in lvx/stvx
        mtspr vrsave,w4 // update mask
        li c32,32
        li c48,48
        beq cr5,LFwdLongAligned // source is also 16-byte aligned, no need for vperm
        lvsl vp,0,rs // get permute vector to shift left
        lvx v1,0,rs // prefetch 1st source quadword
        b LFwdLongUnaligned


// Forward, long, unaligned vector loop.
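// v1 is kept loaded one quadword ahead of the stores (it is preloaded above and
// reloaded from the next chunk in mid-loop), so each vperm below always has both
// of the source quadwords that straddle the aligned quadword being stored.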

        .align 5 // align inner loops
LFwdLongUnaligned: // loop over 128-byte chunks
        addi w4,rs,64
        lvx v2,c16,rs
        lvx v3,c32,rs
        lvx v4,c48,rs
        lvx v5,0,w4
        lvx v6,c16,w4
        vperm vw,v1,v2,vp
        lvx v7,c32,w4
        lvx v8,c48,w4
        addi rs,rs,128
        vperm vx,v2,v3,vp
        addi w4,rd,64
        lvx v1,0,rs
        stvx vw,0,rd
        vperm vy,v3,v4,vp
        stvx vx,c16,rd
        vperm vz,v4,v5,vp
        stvx vy,c32,rd
        vperm vw,v5,v6,vp
        stvx vz,c48,rd
        vperm vx,v6,v7,vp
        addi rd,rd,128
        stvx vw,0,w4
        vperm vy,v7,v8,vp
        stvx vx,c16,w4
        vperm vz,v8,v1,vp
        stvx vy,c32,w4
        stvx vz,c48,w4
        bdnz LFwdLongUnaligned

        beq 4f // no leftover quadwords
        mtctr r0
3: // loop over remaining quadwords
        lvx v2,c16,rs
        addi rs,rs,16
        vperm vx,v1,v2,vp
        vor v1,v2,v2 // v1 <- v2
        stvx vx,0,rd
        addi rd,rd,16
        bdnz 3b
4:
        mtspr vrsave,rv // restore bitmap of live vr's
        bne cr6,LShort16 // handle last 0-15 bytes if any
        blr


// Forward, long, 16-byte aligned vector loop.

        .align 5
LFwdLongAligned: // loop over 128-byte chunks
        addi w4,rs,64
        lvx v1,0,rs
        lvx v2,c16,rs
        lvx v3,c32,rs
        lvx v4,c48,rs
        lvx v5,0,w4
        lvx v6,c16,w4
        lvx v7,c32,w4
        lvx v8,c48,w4
        addi rs,rs,128
        addi w4,rd,64
        stvx v1,0,rd
        stvx v2,c16,rd
        stvx v3,c32,rd
        stvx v4,c48,rd
        stvx v5,0,w4
        stvx v6,c16,w4
        stvx v7,c32,w4
        stvx v8,c48,w4
        addi rd,rd,128
        bdnz LFwdLongAligned

        beq 4f // no leftover quadwords
        mtctr r0
3: // loop over remaining quadwords (1-7)
        lvx v1,0,rs
        addi rs,rs,16
        stvx v1,0,rd
        addi rd,rd,16
        bdnz 3b
4:
        mtspr vrsave,rv // restore bitmap of live vr's
        bne cr6,LShort16 // handle last 0-15 bytes if any
        blr


// Long, reverse moves.
//      rs = source
//      rd = destination
//      rc = count
//      cr5 = beq if relatively 16-byte aligned

LLongReverse:
        add rd,rd,rc // point to end of operands
        add rs,rs,rc
        andi. r0,rd,0xF // #bytes to 16-byte align destination
        beq 2f // already aligned

// 16-byte align destination.

        mtctr r0 // set up for loop
        sub rc,rc,r0
1:
        lbzu w1,-1(rs)
        stbu w1,-1(rd)
        bdnz 1b

// Prepare for reverse vector loop. When entered:
//      rd = 16-byte aligned
//      cr5 = beq if source also 16-byte aligned
// We set up many registers:
//      ctr/cr1 = number of 64-byte chunks to move (may be 0)
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      cm1 = -1
//      rv = original value of vrsave

2:
        mfspr rv,vrsave // get bitmap of live vector registers
        srwi r0,rc,6 // get count of 64-byte chunks to move (may be 0)
        oris w1,rv,0xFFF8 // we use v0-v12
        mtcrf 0x01,rc // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm w3,rc,0,28,31 // move last 0-15 byte count to w3 too
        cmpwi cr1,r0,0 // set cr1 on chunk count
        mtspr vrsave,w1 // update mask
        mtctr r0 // set up loop count
        cmpwi cr6,w3,0 // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31 // get number of quadword leftovers (0-3) and set cr0
        li cm1,-1 // get constants used in lvx/stvx

        bne cr5,LReverseVecUnal // handle unaligned operands
        beq cr1,2f // no chunks (if no chunks, must be leftover QWs)
        li cm17,-17
        li cm33,-33
        li cm49,-49
        b 1f

// Long, reverse 16-byte-aligned vector loop.

        .align 5 // align inner loops
1: // loop over 64-byte chunks
        lvx v1,cm1,rs
        lvx v2,cm17,rs
        lvx v3,cm33,rs
        lvx v4,cm49,rs
        subi rs,rs,64
        stvx v1,cm1,rd
        stvx v2,cm17,rd
        stvx v3,cm33,rd
        stvx v4,cm49,rd
        subi rd,rd,64
        bdnz 1b

        beq 4f // no leftover quadwords
2: // r0=#QWs, rv=vrsave, cr7=(rc & F), cr6 set on cr7
        mtctr r0
3: // loop over remaining quadwords (1-3)
        lvx v1,cm1,rs
        subi rs,rs,16
        stvx v1,cm1,rd
        subi rd,rd,16
        bdnz 3b
4:
        mtspr vrsave,rv // restore bitmap of live vr's
        bne cr6,LShortReverse16 // handle last 0-15 bytes if any
        blr


// Long, reverse, unaligned vector loop.
//      ctr/cr1 = number of 64-byte chunks to move (may be 0)
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
//      cm1 = -1

LReverseVecUnal:
        lvsl vp,0,rs // get permute vector to shift left
        lvx v1,cm1,rs // v1 always looks ahead
        li cm17,-17
        beq cr1,2f // no chunks (if no chunks, must be leftover QWs)
        li cm33,-33
        li cm49,-49
        b 1f

        .align 5 // align the inner loops
1: // loop over 64-byte chunks
        lvx v2,cm17,rs
        lvx v3,cm33,rs
        lvx v4,cm49,rs
        subi rs,rs,64
        vperm vx,v2,v1,vp
        lvx v1,cm1,rs
        vperm vy,v3,v2,vp
        stvx vx,cm1,rd
        vperm vz,v4,v3,vp
        stvx vy,cm17,rd
        vperm vx,v1,v4,vp
        stvx vz,cm33,rd
        stvx vx,cm49,rd
        subi rd,rd,64
        bdnz 1b

        beq 4f // no leftover quadwords
2: // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & F), cr6 set on cr7
        mtctr r0
3: // loop over 1-3 quadwords
        lvx v2,cm17,rs
        subi rs,rs,16
        vperm vx,v2,v1,vp
        vor v1,v2,v2 // v1 <- v2
        stvx vx,cm1,rd
        subi rd,rd,16
        bdnz 3b
4:
        mtspr vrsave,rv // restore bitmap of live vr's
        bne cr6,LShortReverse16 // handle last 0-15 bytes if any
        blr


// Very Big Copy Path. Save our return address on the stack to help with decoding backtraces.
// The conditions bigcopy expects are:
//      r0 = return address (also stored in caller's SF)
//      r4 = source ptr
//      r5 = length (at least several pages)
//      r12 = dest ptr
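//
// (Background for the stores below, from the Darwin PowerPC ABIs: in 32-bit
// mode the caller's linkage area holds the saved LR at 8(r1), while in 64-bit
// mode 8(r1) is the CR save slot and the saved LR lives at 16(r1).)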

LBigCopy:
        lis r2,0x4000 // r2 <- 0x40000000
        mflr r0 // get our return address
        add. r2,r2,r2 // set cr0_lt if running in 32-bit mode
        stw r0,8(r1) // save return, assuming 32-bit mode ("crsave" if 64-bit mode)
        blta _COMM_PAGE_BIGCOPY // 32-bit mode, join big operand copy
        std r0,16(r1) // save return in correct spot for 64-bit mode
        ba _COMM_PAGE_BIGCOPY // then join big operand code


        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0, \
                kCommPageMTCRF+kCommPageBoth+kPort32to64)