/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 * Register usage. Note the rather delicate way we assign multiple uses
 * to the same register. Beware.
 *   r0  = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r9  = "w4", or "cm1"
 *   r10 = vrsave ("rv")
 *   r11 = unused
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 *   v1-v8  = qw's loaded from source
 *   v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r10

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9

#define c16     r6
#define cm17    r6
#define c32     r7
#define cm33    r7
#define c48     r8
#define cm49    r8
#define cm1     r9

#define vp      v0
#define vw      v9
#define vx      v10
#define vy      v11
#define vz      v12

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done. For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas. It is perhaps surprising how well in practice
 * this simple method works.
 */
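
// As an illustration only (not part of the build): under the porting model
// above, a 32-bit sequence such as
//      cmplwi  rc,kShort               // word compare
//      srwi    r0,rc,7                 // word shift right
// is mechanically rewritten for the 64-bit commpage as
//      cmpldi  rc,kShort               // doubleword compare
//      srdi    r0,rc,7                 // doubleword shift right
// with every other instruction left untouched.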

#define kShort          64
#define kVeryLong       (128*1024)
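
// kShort is the cutoff (in bytes) below which operands are handled entirely by
// the scalar LShort/LShortReverse code. kVeryLong is the cutoff at or above
// which forward copies leave this routine and branch to the separate
// "bigcopy" commpage routine (see LBigCopy below).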

// Main entry points.

        .align  5
bcopy_970:                              // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        mr      rd,r4                   // move registers to canonic spot
        mr      rs,r3
        blt     LShort                  // handle short operands
        dcbt    0,rs                    // touch in the first line of source
        dcbtst  0,rd                    // touch in destination
        b       LLong1                  // join long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.
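// (Note: bcopy_970 above starts on a 32-byte boundary and is exactly eight
// instructions, ie 32 bytes, so the ".align 5" below adds no padding and
// Lmemcpy_970 lands at the required 8-word offset.)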

        .align  5
Lmemcpy_970:                            // void* memcpy(void *dst, void *src, size_t len)
Lmemmove_970:                           // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge     LLong0                  // handle long operands

// Handle short operands.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LShort:
        cmplw   cr1,w1,rc               // set cr1 blt if we must move reverse
        mtcrf   0x02,rc                 // move length to cr6 and cr7 one at a time
        mtcrf   0x01,rc
        blt--   cr1,LShortReverse

// Forward short operands. This is the most frequent case, so it is inline.

        bf      26,0f                   // 32-byte chunk to move?
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
0:
LShort32:
        bf      27,1f                   // quadword to move?
        ld      w1,0(rs)
        ld      w3,8(rs)
        addi    rs,rs,16
        std     w1,0(rd)
        std     w3,8(rd)
        addi    rd,rd,16
1:
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f                   // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31                      // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr


// Handle short reverse operands.
//      cr = length in bits 26-31

LShortReverse:
        add     rs,rs,rc                // adjust ptrs for reverse move
        add     rd,rd,rc
        bf      26,0f                   // 32 bytes to move?
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ldu     w4,-32(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        stdu    w4,-32(rd)
0:
        bf      27,1f                   // quadword to move?
        ld      w1,-8(rs)
        ldu     w2,-16(rs)
        std     w1,-8(rd)
        stdu    w2,-16(rd)
1:
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        ldu     w1,-8(rs)
        stdu    w1,-8(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f                   // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
        stb     w1,-1(rd)
        blr


// Long operands, use Altivec in most cases.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LLong0:                                 // entry from memmove()
        dcbt    0,rs                    // touch in source
        dcbtst  0,rd                    // touch in destination
LLong1:                                 // entry from bcopy() with operands already touched in
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  w2,w1,0,0xF             // 16-byte aligned? (w2==0 if so)
        andi.   w4,w3,0xF               // w4 <- #bytes to 16-byte align destination
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        blt--   cr1,LLongReverse        // handle reverse moves
        sub     rc,rc,w4                // adjust length for aligning destination
        srwi    r0,rc,7                 // get #cache lines to copy (may be 0)
        cmpwi   cr1,r0,0                // set cr1 on #chunks
        beq     LFwdAligned             // dest is already aligned

// 16-byte align destination.

        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest (nonzero)
        bf      31,1f                   // byte to move?
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
1:
        bf      30,2f                   // halfword?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      28,LFwdAligned          // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8

// Forward, destination is 16-byte aligned. There are five cases:
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls out all the stops. This is the fastest
//     case for cold-cache operands, as any this long will likely be.
//  2. If length>=128 and source is 16-byte aligned, then use the
//     lvx/stvx loop over 128-byte chunks. This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//  3. If length>=128 and source is not 16-byte aligned, then use the
//     lvx/vperm/stvx loop over 128-byte chunks.
//  4. If length<128 and source is 8-byte aligned, then use the
//     ld/std loop over 32-byte chunks.
//  5. If length<128 and source is not 8-byte aligned, then use the
//     lvx/vperm/stvx loop over 32-byte chunks. This is the slowest case.
// Registers at this point:
//      r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//          rs = alignment unknown
//          rd = 16-byte aligned
//          rc = bytes remaining
//          w2 = low 4 bits of (rd-rs), used to check alignment
//         cr5 = beq if source is also 16-byte aligned

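// Illustrative pseudocode (comments only, not assembled) for how the five
// cases above map onto the branches that follow; "chunks" is the 128-byte
// chunk count in r0/cr1, and alignment is relative to the 16-byte-aligned
// destination:
//      if (chunks != 0)                // at LFwdLongVectors:
//          if (rc >= kVeryLong)        -> LBigCopy            (case 1)
//          else if (rel. 16-byte)      -> LFwdLongAligned     (case 2)
//          else                        -> LFwdLongUnaligned   (case 3)
//      else if (rel. 8-byte)           -> LFwdMedAligned      (case 4)
//      else                            -> 32-byte lvx/vperm/stvx loop (case 5)
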
LFwdAligned:
        andi.   w3,w2,7                 // is source at least 8-byte aligned?
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        bne     cr1,LFwdLongVectors     // at least one 128-byte chunk, so use vectors
        srwi    w1,rc,5                 // get 32-byte chunk count
        mtcrf   0x02,rc                 // move bit 27 of length to cr6 for LShort32
        mtctr   w1                      // set up 32-byte loop (w1!=0)
        beq     LFwdMedAligned          // source is 8-byte aligned, so use ld/std loop
        mfspr   rv,vrsave               // get bitmap of live vector registers
        oris    w4,rv,0xFFF8            // we use v0-v12
        li      c16,16                  // get constant used in lvx
        li      c32,32
        mtspr   vrsave,w4               // update mask
        lvx     v1,0,rs                 // prefetch 1st source quadword
        lvsl    vp,0,rs                 // get permute vector to shift left


// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.

1:                                      // loop over 32-byte chunks
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        addi    rs,rs,32
        vperm   vx,v1,v2,vp
        vperm   vy,v2,v3,vp
        vor     v1,v3,v3                // v1 <- v3
        stvx    vx,0,rd
        stvx    vy,c16,rd
        addi    rd,rd,32
        bdnz    1b

        mtspr   vrsave,rv               // restore bitmap of live vr's
        b       LShort32


// Fewer than 128 bytes and doubleword aligned: use ld/std.

        .align  5
LFwdMedAligned:                         // loop over 32-byte chunks
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
        bdnz    LFwdMedAligned

        b       LShort32

// Forward, 128 bytes or more: use vectors. When entered:
//      r0 = 128-byte chunks to move (>0)
//      rd = 16-byte aligned
//     cr5 = beq if source is 16-byte aligned
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//     ctr = number of 128-byte chunks to move
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
// c16,c32,c48 = loaded

LFwdLongVectors:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        lis     w3,kVeryLong>>16        // cutoff for very-long-operand special case path
        cmplw   cr1,rc,w3               // very long operand?
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        bge--   cr1,LBigCopy            // handle big copies separately
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w4,rv,0xFFF8            // we use v0-v12
        rlwinm. r0,rc,28,29,31          // get number of quadword leftovers (0-7) and set cr0
        li      c16,16                  // get constants used in lvx/stvx
        mtspr   vrsave,w4               // update mask
        li      c32,32
        li      c48,48
        beq     cr5,LFwdLongAligned     // source is also 16-byte aligned, no need for vperm
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
        b       LFwdLongUnaligned


// Forward, long, unaligned vector loop.

        .align  5                       // align inner loops
LFwdLongUnaligned:                      // loop over 128-byte chunks
        addi    w4,rs,64
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        lvx     v5,0,w4
        lvx     v6,c16,w4
        vperm   vw,v1,v2,vp
        lvx     v7,c32,w4
        lvx     v8,c48,w4
        addi    rs,rs,128
        vperm   vx,v2,v3,vp
        addi    w4,rd,64
        lvx     v1,0,rs
        stvx    vw,0,rd
        vperm   vy,v3,v4,vp
        stvx    vx,c16,rd
        vperm   vz,v4,v5,vp
        stvx    vy,c32,rd
        vperm   vw,v5,v6,vp
        stvx    vz,c48,rd
        vperm   vx,v6,v7,vp
        addi    rd,rd,128
        stvx    vw,0,w4
        vperm   vy,v7,v8,vp
        stvx    vx,c16,w4
        vperm   vz,v8,v1,vp
        stvx    vy,c32,w4
        stvx    vz,c48,w4
        bdnz    LFwdLongUnaligned

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords
        lvx     v2,c16,rs
        addi    rs,rs,16
        vperm   vx,v1,v2,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Forward, long, 16-byte aligned vector loop.

        .align  5
LFwdLongAligned:                        // loop over 128-byte chunks
        addi    w4,rs,64
        lvx     v1,0,rs
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        lvx     v5,0,w4
        lvx     v6,c16,w4
        lvx     v7,c32,w4
        lvx     v8,c48,w4
        addi    rs,rs,128
        addi    w4,rd,64
        stvx    v1,0,rd
        stvx    v2,c16,rd
        stvx    v3,c32,rd
        stvx    v4,c48,rd
        stvx    v5,0,w4
        stvx    v6,c16,w4
        stvx    v7,c32,w4
        stvx    v8,c48,w4
        addi    rd,rd,128
        bdnz    LFwdLongAligned

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords (1-7)
        lvx     v1,0,rs
        addi    rs,rs,16
        stvx    v1,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr

// Long, reverse moves.
//      rs = source
//      rd = destination
//      rc = count
//     cr5 = beq if relatively 16-byte aligned

LLongReverse:
        add     rd,rd,rc                // point to end of operands
        add     rs,rs,rc
        andi.   r0,rd,0xF               // #bytes to 16-byte align destination
        beq     2f                      // already aligned

// 16-byte align destination.

        mtctr   r0                      // set up for loop
        sub     rc,rc,r0
1:
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
        bdnz    1b

// Prepare for reverse vector loop. When entered:
//      rd = 16-byte aligned
//     cr5 = beq if source also 16-byte aligned
// We set up many registers:
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//     cm1 = -1
//      rv = original value of vrsave

2:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        srwi    r0,rc,6                 // get count of 64-byte chunks to move (may be 0)
        oris    w1,rv,0xFFF8            // we use v0-v12
        mtcrf   0x01,rc                 // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3 too
        cmpwi   cr1,r0,0                // set cr1 on chunk count
        mtspr   vrsave,w1               // update mask
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        li      cm1,-1                  // get constants used in lvx/stvx

        bne     cr5,LReverseVecUnal     // handle unaligned operands
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        li      cm17,-17
        li      cm33,-33
        li      cm49,-49
        b       1f

// Long, reverse 16-byte-aligned vector loop.

        .align  5                       // align inner loops
1:                                      // loop over 64-byte chunks
        lvx     v1,cm1,rs
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        stvx    v1,cm1,rd
        stvx    v2,cm17,rd
        stvx    v3,cm33,rd
        stvx    v4,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, cr7=(rc & F), cr6 set on cr7
        mtctr   r0
3:                                      // loop over remaining quadwords (1-3)
        lvx     v1,cm1,rs
        subi    rs,rs,16
        stvx    v1,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr

// Long, reverse, unaligned vector loop.
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
//     cm1 = -1

LReverseVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
        li      cm17,-17
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        li      cm33,-33
        li      cm49,-49
        b       1f

        .align  5                       // align the inner loops
1:                                      // loop over 64-byte chunks
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        vperm   vx,v2,v1,vp
        lvx     v1,cm1,rs
        vperm   vy,v3,v2,vp
        stvx    vx,cm1,rd
        vperm   vz,v4,v3,vp
        stvx    vy,cm17,rd
        vperm   vx,v1,v4,vp
        stvx    vz,cm33,rd
        stvx    vx,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & F), cr6 set on cr7
        mtctr   r0
3:                                      // loop over 1-3 quadwords
        lvx     v2,cm17,rs
        subi    rs,rs,16
        vperm   vx,v2,v1,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr


// Very Big Copy Path. Save our return address in the stack for help decoding backtraces.
// The conditions bigcopy expects are:
//      r0 = return address (also stored in caller's SF)
//      r4 = source ptr
//      r5 = length (at least several pages)
//     r12 = dest ptr

LBigCopy:
        lis     r2,0x4000               // r2 <- 0x40000000
        mflr    r0                      // get our return address
        add.    r2,r2,r2                // set cr0_lt if running in 32-bit mode
                                        // (0x40000000+0x40000000 = 0x80000000, which is
                                        // negative only as a 32-bit result)
        stw     r0,8(r1)                // save return, assuming 32-bit mode ("crsave" if 64-bit mode)
        blta    _COMM_PAGE_BIGCOPY      // 32-bit mode, join big operand copy
        std     r0,16(r1)               // save return in correct spot for 64-bit mode
        ba      _COMM_PAGE_BIGCOPY      // then join big operand code

        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0, \
                kCommPageMTCRF+kCommPageBoth+kPort32to64)