/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 *
 * Register usage.  Note the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *   r0  = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r9  = "w4", or "cm1"
 *   r10 = vrsave ("rv")
 *   r11 = unused
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 *   v1-v8  = qw's loaded from source
 *   v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r10

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9

#define c16     r6
#define cm17    r6
#define c32     r7
#define cm33    r7
#define c48     r8
#define cm49    r8
#define cm1     r9

#define vp      v0
#define vw      v9
#define vx      v10
#define vy      v11
#define vz      v12

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
        .globl  EXT(bcopy_970)


#define kShort          64
#define kVeryLong       (128*1024)


// Main entry points.

        .align  5
bcopy_970:                              // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        mr      rd,r4                   // move registers to canonical spot
        mr      rs,r3
        blt     LShort                  // handle short operands
        dcbt    0,rs                    // touch in the first line of source
        dcbtst  0,rd                    // touch in destination
        b       LLong1                  // join long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

        .align  5
Lmemcpy_970:                            // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_970:                           // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge     LLong0                  // handle long operands

// Handle short operands.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc
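//
// As a C sketch (illustrative only, not part of this file): because the
// comparison is unsigned, a single compare of (rd-rs) against the length
// covers both the non-overlapping case and the dst-below-src case, so only
// a destination that starts inside the source forces a reverse move.
//
//      #include <stdint.h>
//      #include <stddef.h>
//
//      static int must_move_reverse(const void *src, void *dst, size_t len)
//      {
//          // wraps to a huge value when dst < src, so the test fails
//          return (uintptr_t)dst - (uintptr_t)src < len;
//      }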

LShort:
        cmplw   cr1,w1,rc               // set cr1 blt if we must move reverse
        mtcrf   0x02,rc                 // move length to cr6 and cr7 one at a time
        mtcrf   0x01,rc
        blt--   cr1,LShortReverse

// Forward short operands.  This is the most frequent case, so it is inline.
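//
// The mtcrf pair above places the low bits of the length in cr6 and cr7, and
// the "bf" tests below peel off one power-of-two chunk per bit, largest (32)
// first.  A minimal C sketch of the same decomposition (helper name is
// illustrative only, not part of this file):
//
//      #include <stddef.h>
//      #include <string.h>
//
//      static void copy_short_forward(unsigned char *d, const unsigned char *s,
//                                     size_t len)          // len < 64
//      {
//          for (size_t chunk = 32; chunk != 0; chunk >>= 1) {
//              if (len & chunk) {                           // bit set: move chunk
//                  memcpy(d, s, chunk);
//                  d += chunk;
//                  s += chunk;
//              }
//          }
//      }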

        bf      26,0f                   // 32-byte chunk to move?
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
0:
LShort32:
        bf      27,1f                   // quadword to move?
        ld      w1,0(rs)
        ld      w3,8(rs)
        addi    rs,rs,16
        std     w1,0(rd)
        std     w3,8(rd)
        addi    rd,rd,16
1:
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f                   // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31                      // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr


// Handle short reverse operands.
//      cr = length in bits 26-31

LShortReverse:
        add     rs,rs,rc                // adjust ptrs for reverse move
        add     rd,rd,rc
        bf      26,0f                   // 32 bytes to move?
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ldu     w4,-32(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        stdu    w4,-32(rd)
0:
        bf      27,1f                   // quadword to move?
        ld      w1,-8(rs)
        ldu     w2,-16(rs)
        std     w1,-8(rd)
        stdu    w2,-16(rd)
1:
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        ldu     w1,-8(rs)
        stdu    w1,-8(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f                   // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
        stb     w1,-1(rd)
        blr


// Long operands, use Altivec in most cases.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LLong0:                                 // entry from memmove()
        dcbt    0,rs                    // touch in source
        dcbtst  0,rd                    // touch in destination
LLong1:                                 // entry from bcopy() with operands already touched in
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  w2,w1,0,0xF             // 16-byte aligned? (w2==0 if so)
        andi.   w4,w3,0xF               // w4 <- #bytes to 16-byte align destination
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        blt--   cr1,LLongReverse        // handle reverse moves
        sub     rc,rc,w4                // adjust length for aligning destination
        srwi    r0,rc,7                 // get #cache lines to copy (may be 0)
        cmpwi   cr1,r0,0                // set cr1 on #chunks
        beq     LFwdAligned             // dest is already aligned

// 16-byte align destination.

        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest (nonzero)
        bf      31,1f                   // byte to move?
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
1:
        bf      30,2f                   // halfword?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      28,LFwdAligned          // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8


// Forward, destination is 16-byte aligned.  There are five cases
// (sketched in C in the comment below):
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls out all the stops.  This is the fastest
//     case for cold-cache operands, as operands this long are likely to be.
//  2. If length>=128 and source is 16-byte aligned, then use the
//     lvx/stvx loop over 128-byte chunks.  This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//  3. If length>=128 and source is not 16-byte aligned, then use the
//     lvx/vperm/stvx loop over 128-byte chunks.
//  4. If length<128 and source is 8-byte aligned, then use the
//     ld/std loop over 32-byte chunks.
//  5. If length<128 and source is not 8-byte aligned, then use the
//     lvx/vperm/stvx loop over 32-byte chunks.  This is the slowest case.
// Registers at this point:
//      r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//          rs = alignment unknown
//          rd = 16-byte aligned
//          rc = bytes remaining
//          w2 = low 4 bits of (rd-rs), used to check alignment
//         cr5 = beq if source is also 16-byte aligned
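//
// The same dispatch as a C sketch (illustrative only; the enum, the helper
// name, and "delta" (the low bits of rd-rs) are not part of this file;
// assumes <stddef.h> and the kVeryLong constant defined above):
//
//      enum path { BIGCOPY, VEC128, VEC128_PERM, LDSTD32, VEC32_PERM };
//
//      static enum path choose_forward_path(unsigned delta, size_t len)
//      {
//          if (len >= kVeryLong)                           // case 1
//              return BIGCOPY;
//          if (len >= 128)                                 // cases 2 and 3
//              return (delta & 0xF) ? VEC128_PERM : VEC128;
//          return (delta & 0x7) ? VEC32_PERM : LDSTD32;    // cases 5 and 4
//      }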

LFwdAligned:
        andi.   w3,w2,7                 // is source at least 8-byte aligned?
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        bne     cr1,LFwdLongVectors     // at least one 128-byte chunk, so use vectors
        srwi    w1,rc,5                 // get 32-byte chunk count
        mtcrf   0x02,rc                 // move bit 27 of length to cr6 for LShort32
        mtctr   w1                      // set up 32-byte loop (w1!=0)
        beq     LFwdMedAligned          // source is 8-byte aligned, so use ld/std loop
        mfspr   rv,vrsave               // get bitmap of live vector registers
        oris    w4,rv,0xFFF8            // we use v0-v12
        li      c16,16                  // get constant used in lvx
        li      c32,32
        mtspr   vrsave,w4               // update mask
        lvx     v1,0,rs                 // prefetch 1st source quadword
        lvsl    vp,0,rs                 // get permute vector to shift left


// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.
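//
// The permute technique, as a C/AltiVec sketch (illustrative only; assumes
// <altivec.h>, a 16-byte aligned dst, and a multiple-of-16 length).  Like the
// loop below, it keeps a one-quadword lookahead in v1, so it may touch the
// aligned quadword that straddles the end of the source:
//
//      #include <altivec.h>
//      #include <stddef.h>
//
//      static void copy_misaligned_src(unsigned char *dst,
//                                      const unsigned char *src, size_t n)
//      {
//          vector unsigned char vp = vec_lvsl(0, src);     // shift-left permute
//          vector unsigned char v1 = vec_ld(0, src);       // 1st aligned quadword
//          for (; n >= 16; n -= 16, src += 16, dst += 16) {
//              vector unsigned char v2 = vec_ld(16, src);  // next aligned quadword
//              vec_st(vec_perm(v1, v2, vp), 0, dst);       // merge and store
//              v1 = v2;                                    // slide the window
//          }
//      }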

1:                                      // loop over 32-byte chunks
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        addi    rs,rs,32
        vperm   vx,v1,v2,vp
        vperm   vy,v2,v3,vp
        vor     v1,v3,v3                // v1 <- v3
        stvx    vx,0,rd
        stvx    vy,c16,rd
        addi    rd,rd,32
        bdnz    1b

        mtspr   vrsave,rv               // restore bitmap of live vr's
        b       LShort32


// Fewer than 128 bytes and doubleword aligned: use ld/std.

        .align  5
LFwdMedAligned:                         // loop over 32-byte chunks
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
        bdnz    LFwdMedAligned

        b       LShort32


// Forward, 128 bytes or more: use vectors.  When entered:
//      r0 = 128-byte chunks to move (>0)
//      rd = 16-byte aligned
//     cr5 = beq if source is 16-byte aligned
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//     ctr = number of 128-byte chunks to move
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
//      c16,c32,c48 = loaded
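//
// How the setup below carves up the remaining length rc (a C fragment,
// illustrative only; the variable names are not part of this file):
//
//      size_t chunks      = rc >> 7;           // 128-byte chunks -> ctr
//      size_t leftover_qw = (rc >> 4) & 7;     // 0-7 leftover quadwords -> r0
//      size_t tail_bytes  = rc & 0xF;          // 0-15 bytes -> LShort16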

LFwdLongVectors:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        lis     w3,kVeryLong>>16        // cutoff for very-long-operand special case path
        cmplw   cr1,rc,w3               // very long operand?
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        bgea--  cr1,_COMM_PAGE_BIGCOPY  // handle big copies separately
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w4,rv,0xFFF8            // we use v0-v12
        rlwinm. r0,rc,28,29,31          // get number of quadword leftovers (0-7) and set cr0
        li      c16,16                  // get constants used in lvx/stvx
        mtspr   vrsave,w4               // update mask
        li      c32,32
        li      c48,48
        beq     cr5,LFwdLongAligned     // source is also 16-byte aligned, no need for vperm
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
        b       LFwdLongUnaligned


// Forward, long, unaligned vector loop.

        .align  5                       // align inner loops
LFwdLongUnaligned:                      // loop over 128-byte chunks
        addi    w4,rs,64
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        lvx     v5,0,w4
        lvx     v6,c16,w4
        vperm   vw,v1,v2,vp
        lvx     v7,c32,w4
        lvx     v8,c48,w4
        addi    rs,rs,128
        vperm   vx,v2,v3,vp
        addi    w4,rd,64
        lvx     v1,0,rs
        stvx    vw,0,rd
        vperm   vy,v3,v4,vp
        stvx    vx,c16,rd
        vperm   vz,v4,v5,vp
        stvx    vy,c32,rd
        vperm   vw,v5,v6,vp
        stvx    vz,c48,rd
        vperm   vx,v6,v7,vp
        addi    rd,rd,128
        stvx    vw,0,w4
        vperm   vy,v7,v8,vp
        stvx    vx,c16,w4
        vperm   vz,v8,v1,vp
        stvx    vy,c32,w4
        stvx    vz,c48,w4
        bdnz    LFwdLongUnaligned

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords
        lvx     v2,c16,rs
        addi    rs,rs,16
        vperm   vx,v1,v2,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Forward, long, 16-byte aligned vector loop.

        .align  5
LFwdLongAligned:                        // loop over 128-byte chunks
        addi    w4,rs,64
        lvx     v1,0,rs
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        lvx     v5,0,w4
        lvx     v6,c16,w4
        lvx     v7,c32,w4
        lvx     v8,c48,w4
        addi    rs,rs,128
        addi    w4,rd,64
        stvx    v1,0,rd
        stvx    v2,c16,rd
        stvx    v3,c32,rd
        stvx    v4,c48,rd
        stvx    v5,0,w4
        stvx    v6,c16,w4
        stvx    v7,c32,w4
        stvx    v8,c48,w4
        addi    rd,rd,128
        bdnz    LFwdLongAligned

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords (1-7)
        lvx     v1,0,rs
        addi    rs,rs,16
        stvx    v1,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr

// Long, reverse moves.
//      rs = source
//      rd = destination
//      rc = count
//     cr5 = beq if relatively 16-byte aligned

LLongReverse:
        add     rd,rd,rc                // point to end of operands
        add     rs,rs,rc
        andi.   r0,rd,0xF               // #bytes to 16-byte align destination
        beq     2f                      // already aligned

// 16-byte align destination.

        mtctr   r0                      // set up for loop
        sub     rc,rc,r0
1:
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
        bdnz    1b

// Prepare for reverse vector loop.  When entered:
//      rd = 16-byte aligned
//     cr5 = beq if source also 16-byte aligned
// We set up many registers:
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//     cm1 = -1
//      rv = original value of vrsave
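//
// Reverse-copy carve-up of rc (a C fragment, illustrative only; rs and rd
// already point one past the end of their operands and rd is 16-byte
// aligned, so every loop below walks downward):
//
//      size_t chunks      = rc >> 6;           // 64-byte chunks -> ctr
//      size_t leftover_qw = (rc >> 4) & 3;     // 0-3 leftover quadwords -> r0
//      size_t tail_bytes  = rc & 0xF;          // 0-15 bytes -> LShortReverse16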

2:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        srwi    r0,rc,6                 // get count of 64-byte chunks to move (may be 0)
        oris    w1,rv,0xFFF8            // we use v0-v12
        mtcrf   0x01,rc                 // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3 too
        cmpwi   cr1,r0,0                // set cr1 on chunk count
        mtspr   vrsave,w1               // update mask
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        li      cm1,-1                  // get constants used in lvx/stvx

        bne     cr5,LReverseVecUnal     // handle unaligned operands
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        li      cm17,-17
        li      cm33,-33
        li      cm49,-49
        b       1f

// Long, reverse 16-byte-aligned vector loop.

        .align  5                       // align inner loops
1:                                      // loop over 64-byte chunks
        lvx     v1,cm1,rs
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        stvx    v1,cm1,rd
        stvx    v2,cm17,rd
        stvx    v3,cm33,rd
        stvx    v4,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, cr7=(rc & F), cr6 set on cr7
        mtctr   r0
3:                                      // loop over remaining quadwords (1-3)
        lvx     v1,cm1,rs
        subi    rs,rs,16
        stvx    v1,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr


// Long, reverse, unaligned vector loop.
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
//     cm1 = -1

LReverseVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
        li      cm17,-17
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        li      cm33,-33
        li      cm49,-49
        b       1f

        .align  5                       // align the inner loops
1:                                      // loop over 64-byte chunks
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        vperm   vx,v2,v1,vp
        lvx     v1,cm1,rs
        vperm   vy,v3,v2,vp
        stvx    vx,cm1,rd
        vperm   vz,v4,v3,vp
        stvx    vy,cm17,rd
        vperm   vx,v1,v4,vp
        stvx    vz,cm33,rd
        stvx    vx,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & F), cr6 set on cr7
        mtctr   r0
3:                                      // loop over 1-3 quadwords
        lvx     v2,cm17,rs
        subi    rs,rs,16
        vperm   vx,v2,v1,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr

        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0,kCommPageMTCRF)