/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 * Register usage.  Note the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *   r0  = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r9  = "w4", or "cm1"
 *   r10 = vrsave ("rv")
 *   r11 = unused
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 *   v1-v8 = qw's loaded from source
 *   v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r10

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9

#define c16     r6
#define cm17    r6
#define c32     r7
#define cm33    r7
#define c48     r8
#define cm49    r8
#define cm1     r9

#define vp      v0
#define vw      v9
#define vx      v10
#define vy      v11
#define vz      v12

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage.  This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done.  For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas.  It is perhaps surprising how well in practice
 * this simple method works.
 */

#define kShort          64
#define kVeryLong       (128*1024)


// Main entry points.

        .align  5
bcopy_970:                              // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        mr      rd,r4                   // move registers to canonic spot
        mr      rs,r3
        blt     LShort                  // handle short operands
        dcbt    0,rs                    // touch in the first line of source
        dcbtst  0,rd                    // touch in destination
        b       LLong1                  // join long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

        .align  5
Lmemcpy_970:                            // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_970:                           // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge     LLong0                  // handle long operands

// Handle short operands.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

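// A minimal C sketch of the forward/reverse decision noted above (the helper name is
// hypothetical, not part of this file).  Because w1 = (rd-rs) is compared unsigned
// against rc, a destination that starts inside the source operand forces a reverse
// move, while all other cases (including rd < rs) wrap around, compare high, and take
// the forward path:
//
//      #include <stddef.h>
//      #include <stdint.h>
//
//      static int must_move_reverse(const void *src, void *dst, size_t len)
//      {
//          // unsigned wraparound makes (dst - src) huge when dst < src
//          return (uintptr_t)dst - (uintptr_t)src < (uintptr_t)len;
//      }
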
LShort:
        cmplw   cr1,w1,rc               // set cr1 blt if we must move reverse
        mtcrf   0x02,rc                 // move length to cr6 and cr7 one at a time
        mtcrf   0x01,rc
        blt--   cr1,LShortReverse

// Forward short operands.  This is the most frequent case, so it is inline.

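// The CR bit tests below (bf 26..31) walk the low six bits of the length, moving a
// 32-byte chunk, then 16, 8, 4, 2, and finally 1 byte.  A rough C equivalent of that
// cascade (illustrative sketch only; the helper name is hypothetical and memcpy is
// used just to show the structure):
//
//      #include <string.h>
//
//      static void short_forward(unsigned char *d, const unsigned char *s, unsigned len)
//      {
//          // len < 64: test each bit of the count, from 32 down to 1
//          for (unsigned chunk = 32; chunk >= 1; chunk >>= 1) {
//              if (len & chunk) { memcpy(d, s, chunk); d += chunk; s += chunk; }
//          }
//      }
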
        bf      26,0f                   // 32-byte chunk to move?
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
0:
LShort32:
        bf      27,1f                   // quadword to move?
        ld      w1,0(rs)
        ld      w3,8(rs)
        addi    rs,rs,16
        std     w1,0(rd)
        std     w3,8(rd)
        addi    rd,rd,16
1:
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f                   // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31                      // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr


// Handle short reverse operands.
//      cr = length in bits 26-31

LShortReverse:
        add     rs,rs,rc                // adjust ptrs for reverse move
        add     rd,rd,rc
        bf      26,0f                   // 32 bytes to move?
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ldu     w4,-32(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        stdu    w4,-32(rd)
0:
        bf      27,1f                   // quadword to move?
        ld      w1,-8(rs)
        ldu     w2,-16(rs)
        std     w1,-8(rd)
        stdu    w2,-16(rd)
1:
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        ldu     w1,-8(rs)
        stdu    w1,-8(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f                   // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
        stb     w1,-1(rd)
        blr


// Long operands, use Altivec in most cases.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LLong0:                                 // entry from memmove()
        dcbt    0,rs                    // touch in source
        dcbtst  0,rd                    // touch in destination
LLong1:                                 // entry from bcopy() with operands already touched in
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  w2,w1,0,0xF             // 16-byte aligned? (w2==0 if so)
        andi.   w4,w3,0xF               // w4 <- #bytes to 16-byte align destination
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        blt--   cr1,LLongReverse        // handle reverse moves
        sub     rc,rc,w4                // adjust length for aligning destination
        srwi    r0,rc,7                 // get #cache lines to copy (may be 0)
        cmpwi   cr1,r0,0                // set cr1 on #chunks
        beq     LFwdAligned             // dest is already aligned

// 16-byte align destination.

        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest (nonzero)
        bf      31,1f                   // byte to move?
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
1:
        bf      30,2f                   // halfword?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      28,LFwdAligned          // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8


// Forward, destination is 16-byte aligned.  There are five cases:
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls out all the stops.  This is the fastest
//     case for cold-cache operands, as operands this long are likely to be.
//  2. If length>=128 and source is 16-byte aligned, then use the
//     lvx/stvx loop over 128-byte chunks.  This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//  3. If length>=128 and source is not 16-byte aligned, then use the
//     lvx/vperm/stvx loop over 128-byte chunks.
//  4. If length<128 and source is 8-byte aligned, then use the
//     ld/std loop over 32-byte chunks.
//  5. If length<128 and source is not 8-byte aligned, then use the
//     lvx/vperm/stvx loop over 32-byte chunks.  This is the slowest case.
// A C sketch of this dispatch follows below.
// Registers at this point:
//  r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//      rs = alignment unknown
//      rd = 16-byte aligned
//      rc = bytes remaining
//      w2 = low 4 bits of (rd-rs), used to check alignment
//     cr5 = beq if source is also 16-byte aligned

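// A minimal C sketch of the five-way dispatch described above (names are hypothetical;
// the real selection below is done with cr1, cr5, and cr0):
//
//      #include <stddef.h>
//
//      enum fwd_path { BIGCOPY, LVX_STVX_128, LVX_VPERM_STVX_128, LD_STD_32, LVX_VPERM_STVX_32 };
//
//      // dst is already 16-byte aligned here; delta is the low 4 bits of (dst - src)
//      static enum fwd_path pick_forward_path(size_t len, unsigned delta)
//      {
//          if (len >= 128*1024)            // kVeryLong: several pages
//              return BIGCOPY;
//          if (len >= 128)
//              return (delta % 16 == 0) ? LVX_STVX_128 : LVX_VPERM_STVX_128;
//          return (delta % 8 == 0) ? LD_STD_32 : LVX_VPERM_STVX_32;
//      }
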
LFwdAligned:
        andi.   w3,w2,7                 // is source at least 8-byte aligned?
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        bne     cr1,LFwdLongVectors     // at least one 128-byte chunk, so use vectors
        srwi    w1,rc,5                 // get 32-byte chunk count
        mtcrf   0x02,rc                 // move bit 27 of length to cr6 for LShort32
        mtctr   w1                      // set up 32-byte loop (w1!=0)
        beq     LFwdMedAligned          // source is 8-byte aligned, so use ld/std loop
        mfspr   rv,vrsave               // get bitmap of live vector registers
        oris    w4,rv,0xFFF8            // we use v0-v12
        li      c16,16                  // get constant used in lvx
        li      c32,32
        mtspr   vrsave,w4               // update mask
        lvx     v1,0,rs                 // prefetch 1st source quadword
        lvsl    vp,0,rs                 // get permute vector to shift left


// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.

1:                                      // loop over 32-byte chunks
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        addi    rs,rs,32
        vperm   vx,v1,v2,vp
        vperm   vy,v2,v3,vp
        vor     v1,v3,v3                // v1 <- v3
        stvx    vx,0,rd
        stvx    vy,c16,rd
        addi    rd,rd,32
        bdnz    1b

        mtspr   vrsave,rv               // restore bitmap of live vr's
        b       LShort32


// Fewer than 128 bytes and doubleword aligned: use ld/std.

        .align  5
LFwdMedAligned:                         // loop over 32-byte chunks
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
        bdnz    LFwdMedAligned

        b       LShort32


// Forward, 128 bytes or more: use vectors.  When entered:
//      r0 = 128-byte chunks to move (>0)
//      rd = 16-byte aligned
//     cr5 = beq if source is 16-byte aligned
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//     ctr = number of 128-byte chunks to move
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
// c16,c32,c48 = loaded

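// How the counts above derive from rc, as an illustrative C view of the srwi/rlwinm
// encodings used in this path (helper and variable names are hypothetical):
//
//      #include <stddef.h>
//
//      static void split_forward_counts(size_t rc, size_t *chunks,
//                                       size_t *leftover_qws, size_t *leftover_bytes)
//      {
//          *chunks         = rc >> 7;          // 128-byte chunks  (srwi r0,rc,7)
//          *leftover_qws   = (rc >> 4) & 7;    // 0-7 quadwords    (rlwinm. r0,rc,28,29,31)
//          *leftover_bytes = rc & 15;          // 0-15 bytes       (rlwinm w3,rc,0,28,31)
//      }
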
LFwdLongVectors:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        lis     w3,kVeryLong>>16        // cutoff for very-long-operand special case path
        cmplw   cr1,rc,w3               // very long operand?
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        bge--   cr1,LBigCopy            // handle big copies separately
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w4,rv,0xFFF8            // we use v0-v12
        rlwinm. r0,rc,28,29,31          // get number of quadword leftovers (0-7) and set cr0
        li      c16,16                  // get constants used in lvx/stvx
        mtspr   vrsave,w4               // update mask
        li      c32,32
        li      c48,48
        beq     cr5,LFwdLongAligned     // source is also 16-byte aligned, no need for vperm
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
        b       LFwdLongUnaligned


// Forward, long, unaligned vector loop.

        .align  5                       // align inner loops
LFwdLongUnaligned:                      // loop over 128-byte chunks
        addi    w4,rs,64
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        lvx     v5,0,w4
        lvx     v6,c16,w4
        vperm   vw,v1,v2,vp
        lvx     v7,c32,w4
        lvx     v8,c48,w4
        addi    rs,rs,128
        vperm   vx,v2,v3,vp
        addi    w4,rd,64
        lvx     v1,0,rs
        stvx    vw,0,rd
        vperm   vy,v3,v4,vp
        stvx    vx,c16,rd
        vperm   vz,v4,v5,vp
        stvx    vy,c32,rd
        vperm   vw,v5,v6,vp
        stvx    vz,c48,rd
        vperm   vx,v6,v7,vp
        addi    rd,rd,128
        stvx    vw,0,w4
        vperm   vy,v7,v8,vp
        stvx    vx,c16,w4
        vperm   vz,v8,v1,vp
        stvx    vy,c32,w4
        stvx    vz,c48,w4
        bdnz    LFwdLongUnaligned

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords
        lvx     v2,c16,rs
        addi    rs,rs,16
        vperm   vx,v1,v2,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Forward, long, 16-byte aligned vector loop.

        .align  5
LFwdLongAligned:                        // loop over 128-byte chunks
        addi    w4,rs,64
        lvx     v1,0,rs
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        lvx     v5,0,w4
        lvx     v6,c16,w4
        lvx     v7,c32,w4
        lvx     v8,c48,w4
        addi    rs,rs,128
        addi    w4,rd,64
        stvx    v1,0,rd
        stvx    v2,c16,rd
        stvx    v3,c32,rd
        stvx    v4,c48,rd
        stvx    v5,0,w4
        stvx    v6,c16,w4
        stvx    v7,c32,w4
        stvx    v8,c48,w4
        addi    rd,rd,128
        bdnz    LFwdLongAligned

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords (1-7)
        lvx     v1,0,rs
        addi    rs,rs,16
        stvx    v1,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Long, reverse moves.
//      rs = source
//      rd = destination
//      rc = count
//     cr5 = beq if relatively 16-byte aligned

LLongReverse:
        add     rd,rd,rc                // point to end of operands
        add     rs,rs,rc
        andi.   r0,rd,0xF               // #bytes to 16-byte align destination
        beq     2f                      // already aligned

// 16-byte align destination.

        mtctr   r0                      // set up for loop
        sub     rc,rc,r0
1:
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
        bdnz    1b

// Prepare for reverse vector loop.  When entered:
//      rd = 16-byte aligned
//     cr5 = beq if source also 16-byte aligned
// We set up many registers:
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//     cm1 = -1
//      rv = original value of vrsave

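// As in the forward case, an illustrative C view of the counts set up here
// (hypothetical helper; compare split_forward_counts above):
//
//      static void split_reverse_counts(size_t rc, size_t *chunks,
//                                       size_t *leftover_qws, size_t *leftover_bytes)
//      {
//          *chunks         = rc >> 6;          // 64-byte chunks   (srwi r0,rc,6)
//          *leftover_qws   = (rc >> 4) & 3;    // 0-3 quadwords    (rlwinm. r0,rc,28,30,31)
//          *leftover_bytes = rc & 15;          // 0-15 bytes       (rlwinm w3,rc,0,28,31)
//      }
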
2:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        srwi    r0,rc,6                 // get count of 64-byte chunks to move (may be 0)
        oris    w1,rv,0xFFF8            // we use v0-v12
        mtcrf   0x01,rc                 // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3 too
        cmpwi   cr1,r0,0                // set cr1 on chunk count
        mtspr   vrsave,w1               // update mask
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        li      cm1,-1                  // get constants used in lvx/stvx

        bne     cr5,LReverseVecUnal     // handle unaligned operands
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        li      cm17,-17
        li      cm33,-33
        li      cm49,-49
        b       1f

// Long, reverse 16-byte-aligned vector loop.

        .align  5                       // align inner loops
1:                                      // loop over 64-byte chunks
        lvx     v1,cm1,rs
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        stvx    v1,cm1,rd
        stvx    v2,cm17,rd
        stvx    v3,cm33,rd
        stvx    v4,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, cr7=(rc & F), cr6 set on cr7
        mtctr   r0
3:                                      // loop over remaining quadwords (1-3)
        lvx     v1,cm1,rs
        subi    rs,rs,16
        stvx    v1,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr


// Long, reverse, unaligned vector loop.
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
//     cm1 = -1

LReverseVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
        li      cm17,-17
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        li      cm33,-33
        li      cm49,-49
        b       1f

        .align  5                       // align the inner loops
1:                                      // loop over 64-byte chunks
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        vperm   vx,v2,v1,vp
        lvx     v1,cm1,rs
        vperm   vy,v3,v2,vp
        stvx    vx,cm1,rd
        vperm   vz,v4,v3,vp
        stvx    vy,cm17,rd
        vperm   vx,v1,v4,vp
        stvx    vz,cm33,rd
        stvx    vx,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & F), cr6 set on cr7
        mtctr   r0
3:                                      // loop over 1-3 quadwords
        lvx     v2,cm17,rs
        subi    rs,rs,16
        vperm   vx,v2,v1,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr


// Very Big Copy Path.  Save our return address on the stack to help decode backtraces.
// The conditions bigcopy expects are:
//      r0 = return address (also stored in caller's SF)
//      r4 = source ptr
//      r5 = length (at least several pages)
//     r12 = dest ptr

LBigCopy:
        lis     r2,0x4000               // r2 <- 0x40000000
        mflr    r0                      // get our return address
        add.    r2,r2,r2                // set cr0_lt if running in 32-bit mode
        stw     r0,8(r1)                // save return, assuming 32-bit mode ("crsave" if 64-bit mode)
        blta    _COMM_PAGE_BIGCOPY      // 32-bit mode, join big operand copy
        std     r0,16(r1)               // save return in correct spot for 64-bit mode
        ba      _COMM_PAGE_BIGCOPY      // then join big operand code


        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0, \
                kCommPageMTCRF+kCommPageBoth+kPort32to64)