// apple/xnu (xnu-344.21.74): osfmk/ppc/commpage/bcopy_970.s
/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 *
 * Register usage.  Note the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *   r0     = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3     = not used, as memcpy and memmove return 1st parameter as a value
 *   r4     = source ptr ("rs")
 *   r5     = count of bytes to move ("rc")
 *   r6     = "w1", "c16", or "cm17"
 *   r7     = "w2", "c32", or "cm33"
 *   r8     = "w3", "c48", or "cm49"
 *   r9     = "w4", or "cm1"
 *   r10    = vrsave ("rv")
 *   r11    = unused
 *   r12    = destination ptr ("rd")
 *   v0     = permute vector ("vp")
 *   v1-v8  = qw's loaded from source
 *   v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r10

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9

#define c16     r6
#define cm17    r6
#define c32     r7
#define cm33    r7
#define c48     r8
#define cm49    r8
#define cm1     r9

#define vp      v0
#define vw      v9
#define vx      v10
#define vy      v11
#define vz      v12

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
        .globl  EXT(bcopy_970)


#define kShort          64
#define kVeryLong       (128*1024)


// Main entry points.

        .align  5
bcopy_970:                              // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        mr      rd,r4                   // move registers to canonical spot
        mr      rs,r3
        blt     LShort                  // handle short operands
        dcbt    0,rs                    // touch in the first line of source
        dcbtst  0,rd                    // touch in destination
        b       LLong1                  // join long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

        .align  5
Lmemcpy_970:                            // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_970:                           // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge     LLong0                  // handle long operands

// Handle short operands.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

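// Why a single unsigned compare decides the direction: if the destination
// starts inside the source (ie, dst > src and dst-src < len), a forward copy
// would overwrite source bytes before they are read, so we copy high-to-low
// instead.  A minimal C sketch of the same dispatch (the name "toy_memmove"
// is illustrative only, not something defined elsewhere in xnu):
//
//      #include <stddef.h>
//
//      void *toy_memmove(void *dstv, const void *srcv, size_t len)
//      {
//          unsigned char *dst = dstv;
//          const unsigned char *src = srcv;
//          if ((size_t)(dst - src) < len) {        // dst overlaps src from above
//              while (len--)
//                  dst[len] = src[len];            // reverse, high-to-low
//          } else {
//              for (size_t i = 0; i < len; i++)
//                  dst[i] = src[i];                // forward, low-to-high
//          }
//          return dstv;
//      }
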
LShort:
        cmplw   cr1,w1,rc               // set cr1 blt if we must move reverse
        mtcrf   0x02,rc                 // move length to cr6 and cr7 one at a time
        mtcrf   0x01,rc
        blt--   cr1,LShortReverse

// Forward short operands.  This is the most frequent case, so it is inline.

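// The mtcrf/bf sequence below is simply a decoded binary count: CR bit 26 of
// the length selects a 32-byte group, bit 27 a 16-byte group, and so on down
// to bit 31 for the last odd byte.  A C sketch of the forward case (the name
// "short_copy" is illustrative; memcpy stands in for the inline ld/std groups):
//
//      #include <stddef.h>
//      #include <string.h>
//
//      static void short_copy(unsigned char *dst, const unsigned char *src, size_t len)
//      {
//          for (size_t chunk = 32; chunk >= 1; chunk >>= 1) {  // len < 64 here
//              if (len & chunk) {                              // this power of two is present
//                  memcpy(dst, src, chunk);
//                  dst += chunk;
//                  src += chunk;
//              }
//          }
//      }
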
        bf      26,0f                   // 32-byte chunk to move?
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
0:
LShort32:
        bf      27,1f                   // quadword to move?
        ld      w1,0(rs)
        ld      w3,8(rs)
        addi    rs,rs,16
        std     w1,0(rd)
        std     w3,8(rd)
        addi    rd,rd,16
1:
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f                   // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31                      // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr


// Handle short reverse operands.
//      cr = length in bits 26-31

LShortReverse:
        add     rs,rs,rc                // adjust ptrs for reverse move
        add     rd,rd,rc
        bf      26,0f                   // 32 bytes to move?
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ldu     w4,-32(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        stdu    w4,-32(rd)
0:
        bf      27,1f                   // quadword to move?
        ld      w1,-8(rs)
        ldu     w2,-16(rs)
        std     w1,-8(rd)
        stdu    w2,-16(rd)
1:
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        ldu     w1,-8(rs)
        stdu    w1,-8(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f                   // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
        stb     w1,-1(rd)
        blr


// Long operands, use Altivec in most cases.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LLong0:                                 // entry from memmove()
        dcbt    0,rs                    // touch in source
        dcbtst  0,rd                    // touch in destination
LLong1:                                 // entry from bcopy() with operands already touched in
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  w2,w1,0,0xF             // 16-byte aligned? (w2==0 if so)
        andi.   w4,w3,0xF               // w4 <- #bytes to 16-byte align destination
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        blt--   cr1,LLongReverse        // handle reverse moves
        sub     rc,rc,w4                // adjust length for aligning destination
        srwi    r0,rc,7                 // get #cache lines to copy (may be 0)
        cmpwi   cr1,r0,0                // set cr1 on #chunks
        beq     LFwdAligned             // dest is already aligned

// 16-byte align destination.

        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest (nonzero)
        bf      31,1f                   // byte to move?
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
1:
        bf      30,2f                   // halfword?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      28,LFwdAligned          // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8


// Forward, destination is 16-byte aligned.  There are five cases:
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls out all the stops.  This is the fastest
//     case for cold-cache operands, as any this long will likely be.
//  2. If length>=128 and source is 16-byte aligned, then use the
//     lvx/stvx loop over 128-byte chunks.  This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//  3. If length>=128 and source is not 16-byte aligned, then use the
//     lvx/vperm/stvx loop over 128-byte chunks.
//  4. If length<128 and source is 8-byte aligned, then use the
//     ld/std loop over 32-byte chunks.
//  5. If length<128 and source is not 8-byte aligned, then use the
//     lvx/vperm/stvx loop over 32-byte chunks.  This is the slowest case.
// Registers at this point:
//      r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//      rs = alignment unknown
//      rd = 16-byte aligned
//      rc = bytes remaining
//      w2 = low 4 bits of (rd-rs), used to check alignment
//      cr5 = beq if source is also 16-byte aligned

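// The same case selection in C, as a sketch (the enum and "pick_case" are
// illustrative names only; the constants follow the comments above, and the
// destination is assumed to be 16-byte aligned already, so the relative
// alignment tested via cr5/w2 equals the source's own alignment):
//
//      enum fwd_case { BIGCOPY, LVX_STVX_128, LVX_VPERM_128, LD_STD_32, LVX_VPERM_32 };
//
//      static enum fwd_case pick_case(unsigned long len, unsigned long src_addr)
//      {
//          if (len >= 128ul * 1024ul)                          // kVeryLong
//              return BIGCOPY;                                 // case 1
//          if (len >= 128)
//              return (src_addr & 0xF) ? LVX_VPERM_128         // case 3
//                                      : LVX_STVX_128;         // case 2
//          return (src_addr & 0x7) ? LVX_VPERM_32              // case 5
//                                  : LD_STD_32;                // case 4
//      }
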
LFwdAligned:
        andi.   w3,w2,7                 // is source at least 8-byte aligned?
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        bne     cr1,LFwdLongVectors     // at least one 128-byte chunk, so use vectors
        srwi    w1,rc,5                 // get 32-byte chunk count
        mtcrf   0x02,rc                 // move bit 27 of length to cr6 for LShort32
        mtctr   w1                      // set up 32-byte loop (w1!=0)
        beq     LFwdMedAligned          // source is 8-byte aligned, so use ld/std loop
        mfspr   rv,vrsave               // get bitmap of live vector registers
        oris    w4,rv,0xFFF8            // we use v0-v12
        li      c16,16                  // get constants used in lvx
        li      c32,32
        mtspr   vrsave,w4               // update mask
        lvx     v1,0,rs                 // prefetch 1st source quadword
        lvsl    vp,0,rs                 // get permute vector to shift left


// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.

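// The same 32-byte inner loop in AltiVec C intrinsics, as a sketch (the name
// "copy32_unaligned" is illustrative; like the assembly, it assumes the
// destination is 16-byte aligned, n32 > 0, and that reading the quadword just
// past the last source byte is harmless):
//
//      #include <altivec.h>
//
//      static void copy32_unaligned(unsigned char *dst, const unsigned char *src, int n32)
//      {
//          vector unsigned char perm = vec_lvsl(0, src);   // left-shift permute for src offset
//          vector unsigned char v1 = vec_ld(0, src);       // quadword containing src (aligned down)
//          while (n32--) {
//              vector unsigned char v2 = vec_ld(16, src);  // next two aligned quadwords
//              vector unsigned char v3 = vec_ld(32, src);
//              vec_st(vec_perm(v1, v2, perm), 0, dst);     // splice adjacent quadwords into
//              vec_st(vec_perm(v2, v3, perm), 16, dst);    //   the 32 bytes starting at src
//              v1 = v3;                                    // carry the lookahead quadword
//              src += 32;
//              dst += 32;
//          }
//      }
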
1:                                      // loop over 32-byte chunks
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        addi    rs,rs,32
        vperm   vx,v1,v2,vp
        vperm   vy,v2,v3,vp
        vor     v1,v3,v3                // v1 <- v3
        stvx    vx,0,rd
        stvx    vy,c16,rd
        addi    rd,rd,32
        bdnz    1b

        mtspr   vrsave,rv               // restore bitmap of live vr's
        b       LShort32


// Fewer than 128 bytes and doubleword aligned: use ld/std.

        .align  5
LFwdMedAligned:                         // loop over 32-byte chunks
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
        bdnz    LFwdMedAligned

        b       LShort32


// Forward, 128 bytes or more: use vectors.  When entered:
//      r0 = 128-byte chunks to move (>0)
//      rd = 16-byte aligned
//      cr5 = beq if source is 16-byte aligned
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//      ctr = number of 128-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
//      c16,c32,c48 = loaded

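// In C terms, the length still to move is split three ways here, as a sketch
// of the arithmetic the srwi/rlwinm instructions perform ("split_length" is an
// illustrative name):
//
//      static void split_length(unsigned long len, unsigned long *chunks128,
//                               unsigned long *leftover_qw, unsigned long *leftover_bytes)
//      {
//          *chunks128      = len >> 7;         // whole 128-byte chunks, loaded into CTR
//          *leftover_qw    = (len >> 4) & 7;   // 0-7 leftover quadwords (r0, tested via cr0)
//          *leftover_bytes = len & 15;         // 0-15 trailing bytes, finished by LShort16
//      }
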
LFwdLongVectors:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        lis     w3,kVeryLong>>16        // cutoff for very-long-operand special case path
        cmplw   cr1,rc,w3               // very long operand?
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        bgea--  cr1,_COMM_PAGE_BIGCOPY  // handle big copies separately
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w4,rv,0xFFF8            // we use v0-v12
        rlwinm. r0,rc,28,29,31          // get number of quadword leftovers (0-7) and set cr0
        li      c16,16                  // get constants used in lvx/stvx
        mtspr   vrsave,w4               // update mask
        li      c32,32
        li      c48,48
        beq     cr5,LFwdLongAligned     // source is also 16-byte aligned, no need for vperm
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
        b       LFwdLongUnaligned


// Forward, long, unaligned vector loop.

        .align  5                       // align inner loops
LFwdLongUnaligned:                      // loop over 128-byte chunks
        addi    w4,rs,64
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        lvx     v5,0,w4
        lvx     v6,c16,w4
        vperm   vw,v1,v2,vp
        lvx     v7,c32,w4
        lvx     v8,c48,w4
        addi    rs,rs,128
        vperm   vx,v2,v3,vp
        addi    w4,rd,64
        lvx     v1,0,rs
        stvx    vw,0,rd
        vperm   vy,v3,v4,vp
        stvx    vx,c16,rd
        vperm   vz,v4,v5,vp
        stvx    vy,c32,rd
        vperm   vw,v5,v6,vp
        stvx    vz,c48,rd
        vperm   vx,v6,v7,vp
        addi    rd,rd,128
        stvx    vw,0,w4
        vperm   vy,v7,v8,vp
        stvx    vx,c16,w4
        vperm   vz,v8,v1,vp
        stvx    vy,c32,w4
        stvx    vz,c48,w4
        bdnz    LFwdLongUnaligned

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords
        lvx     v2,c16,rs
        addi    rs,rs,16
        vperm   vx,v1,v2,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Forward, long, 16-byte aligned vector loop.

        .align  5
LFwdLongAligned:                        // loop over 128-byte chunks
        addi    w4,rs,64
        lvx     v1,0,rs
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        lvx     v5,0,w4
        lvx     v6,c16,w4
        lvx     v7,c32,w4
        lvx     v8,c48,w4
        addi    rs,rs,128
        addi    w4,rd,64
        stvx    v1,0,rd
        stvx    v2,c16,rd
        stvx    v3,c32,rd
        stvx    v4,c48,rd
        stvx    v5,0,w4
        stvx    v6,c16,w4
        stvx    v7,c32,w4
        stvx    v8,c48,w4
        addi    rd,rd,128
        bdnz    LFwdLongAligned

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords (1-7)
        lvx     v1,0,rs
        addi    rs,rs,16
        stvx    v1,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Long, reverse moves.
//      rs = source
//      rd = destination
//      rc = count
//      cr5 = beq if relatively 16-byte aligned

LLongReverse:
        add     rd,rd,rc                // point to end of operands
        add     rs,rs,rc
        andi.   r0,rd,0xF               // #bytes to 16-byte align destination
        beq     2f                      // already aligned

// 16-byte align destination.

        mtctr   r0                      // set up for loop
        sub     rc,rc,r0
1:
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
        bdnz    1b

// Prepare for reverse vector loop.  When entered:
//      rd = 16-byte aligned
//      cr5 = beq if source also 16-byte aligned
// We set up many registers:
//      ctr/cr1 = number of 64-byte chunks to move (may be 0)
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      cm1 = -1
//      rv = original value of vrsave

2:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        srwi    r0,rc,6                 // get count of 64-byte chunks to move (may be 0)
        oris    w1,rv,0xFFF8            // we use v0-v12
        mtcrf   0x01,rc                 // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3 too
        cmpwi   cr1,r0,0                // set cr1 on chunk count
        mtspr   vrsave,w1               // update mask
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        li      cm1,-1                  // get constants used in lvx/stvx

        bne     cr5,LReverseVecUnal     // handle unaligned operands
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        li      cm17,-17
        li      cm33,-33
        li      cm49,-49
        b       1f

// Long, reverse 16-byte-aligned vector loop.

        .align  5                       // align inner loops
1:                                      // loop over 64-byte chunks
        lvx     v1,cm1,rs
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        stvx    v1,cm1,rd
        stvx    v2,cm17,rd
        stvx    v3,cm33,rd
        stvx    v4,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, cr7=(rc & F), cr6 set on cr7
        mtctr   r0
3:                                      // loop over remaining quadwords (1-3)
        lvx     v1,cm1,rs
        subi    rs,rs,16
        stvx    v1,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr


// Long, reverse, unaligned vector loop.
//      ctr/cr1 = number of 64-byte chunks to move (may be 0)
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
//      cm1 = -1

LReverseVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
        li      cm17,-17
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        li      cm33,-33
        li      cm49,-49
        b       1f

        .align  5                       // align the inner loops
1:                                      // loop over 64-byte chunks
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        vperm   vx,v2,v1,vp
        lvx     v1,cm1,rs
        vperm   vy,v3,v2,vp
        stvx    vx,cm1,rd
        vperm   vz,v4,v3,vp
        stvx    vy,cm17,rd
        vperm   vx,v1,v4,vp
        stvx    vz,cm33,rd
        stvx    vx,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & F), cr6 set on cr7
        mtctr   r0
3:                                      // loop over 1-3 quadwords
        lvx     v2,cm17,rs
        subi    rs,rs,16
        vperm   vx,v2,v1,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr

        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0,kCommPageMTCRF)