/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, tuned for G4. The inner loops use DCBA to avoid
 * reading destination cache lines. Only the 7450 actually benefits from
 * this, and then only in the cold-cache case. On 7400s and 7455s, we
 * patch the DCBAs into NOPs.
 *
 * Register usage. Note we use R2, so this code will not run in a PEF/CFM
 * environment. Note also the rather delicate way we assign multiple uses
 * to the same register. Beware.
 *
 *   r0  = "w7" or temp (NB: cannot use r0 for any constant such as "c16")
 *   r2  = "w8" or vrsave ("rv")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r9  = "w4", or "cm1"
 *   r10 = "w5", "c96", or "cm97"
 *   r11 = "w6", "c128", or "cm129"
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 * v1-v4 = qw's loaded from source
 * v5-v7 = permuted qw's ("vw", "vx", "vy")
 */
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r2

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9
#define w5      r10
#define w6      r11
#define w7      r0
#define w8      r2

#define c16     r6
#define cm17    r6
#define c32     r7
#define cm33    r7
#define c48     r8
#define cm49    r8
#define cm1     r9
#define c96     r10
#define cm97    r10
#define c128    r11
#define cm129   r11

#define vp      v0
#define vw      v5
#define vx      v6
#define vy      v7

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text

#define kMedium         32              // too long for inline loopless code
#define kLong           96              // long enough to justify use of Altivec

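// In outline, the length/direction dispatch below works as follows:
//   - operands shorter than kMedium (32) use the inline loopless code at LShort
//   - operands shorter than kLong (96) use the scalar loops at LMedium
//   - longer operands use the Altivec loops (LFwdLong / LLongReverse)
// Independently, a move is done in reverse whenever (dst - src) < length
// (unsigned), ie whenever a forward copy would overwrite source bytes that
// have not yet been read.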

// Main entry points.

        .align  5
bcopy_g4:                               // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kMedium              // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        mr      rd,r4                   // start to move registers to canonic spot
        mr      rs,r3
        blt+    LShort                  // handle short operands
        dcbt    0,r3                    // touch in the first line of source
        b       LMedium                 // join medium/long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.
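// (8 words = 32 bytes; the commpage exports bcopy and memcpy/memmove at fixed,
//  published addresses, so the spacing between the two entry points cannot change.)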

        .align  5
Lmemcpy_g4:                             // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_g4:                            // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kMedium              // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        dcbt    0,r4                    // touch in the first line of source
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge-    LMedium                 // handle medium or long operands

// Handle short operands.

LShort:
        andi.   r0,rc,0x10              // test bit 27 separately (faster on G4)
        mtcrf   0x01,rc                 // put length bits 28-31 in cr7
        blt-    cr1,LShortReverse
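// (Example of the length decoding: for rc = 13 = 0b1101, cr0 is "eq" since bit
//  27 (the 0x10 bit) is clear, so no 16-byte chunk is moved; cr7 holds 1101, so
//  the code below moves a doubleword, then a word, then the odd byte.)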

// Forward short operands. This is the most frequent case, so it is inline.

        beq     LShort16                // quadword to move?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        addi    rs,rs,8
        stw     w1,0(rd)
        stw     w2,4(rd)
        addi    rd,rd,8
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f                   // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31                      // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr


// Handle short reverse operands.
// cr0 = bne if bit 27 of length is set
// cr7 = bits 28-31 of length

LShortReverse:
        add     rs,rs,rc                // adjust ptrs for reverse move
        add     rd,rd,rc
        beq     LShortReverse16         // quadword to move?
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        lwz     w1,-4(rs)
        lwzu    w2,-8(rs)
        stw     w1,-4(rd)
        stwu    w2,-8(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f                   // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
        stb     w1,-1(rd)
        blr


// Medium and long operands. Use Altivec if long enough, else scalar loops.
// w1 = (rd-rs), used to check for alignment
// cr1 = blt iff we must move reverse

        .align  4
LMedium:
        dcbtst  0,rd                    // touch in destination
        cmplwi  cr7,rc,kLong            // long enough for vectors?
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  r0,w1,0,0x7             // check relative 8-byte alignment
        andi.   w6,w3,7                 // w6 <- #bytes to 8-byte align destination
        blt     cr1,LMediumReverse      // handle reverse moves
        rlwinm  w4,w3,0,0x1F            // w4 <- #bytes to 32-byte align destination
        cmpwi   cr6,r0,0                // set cr6 beq if relatively aligned
        bge     cr7,LFwdLong            // long enough for vectors

// Medium length: use scalar loops.
// w6/cr0 = #bytes to 8-byte align destination
// cr6 = beq if relatively doubleword aligned

        sub     rc,rc,w6                // decrement length remaining
        beq     1f                      // skip if dest already doubleword aligned
        mtxer   w6                      // set up count for move
        lswx    w1,0,rs                 // move w6 bytes to align destination
        stswx   w1,0,rd
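// (lswx/stswx move the byte count previously placed in XER, filling successive
//  registers starting at w1; here that count is at most 7 bytes.)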
        add     rs,rs,w6                // bump ptrs past
        add     rd,rd,w6
1:
        srwi    r0,rc,4                 // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc                 // save remaining byte count here for LShort16
        mtctr   r0                      // set up 16-byte loop
        bne     cr6,3f                  // use word loop if operands not relatively doubleword aligned
        b       2f

        .align  4
2:                                      // loop over 16-byte aligned chunks
        lfd     f0,0(rs)
        lfd     f1,8(rs)
        addi    rs,rs,16
        stfd    f0,0(rd)
        stfd    f1,8(rd)
        addi    rd,rd,16
        bdnz    2b

        b       LShort16

        .align  4
3:                                      // loop over 16-byte unaligned chunks
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16
        bdnz    3b

        b       LShort16


// Vector loops. First, we must 32-byte align the destination.
// w1 = (rd-rs), used to check for reverse and alignment
// w4 = #bytes to 32-byte align destination
// rc = long enough for at least one vector loop

LFwdLong:
        cmpwi   w4,0                    // dest already aligned?
        sub     rc,rc,w4                // adjust length
        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF             // relatively 16-byte aligned?
        mtcrf   0x02,w4                 // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6                 // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        beq     LFwdAligned             // dest is already aligned

// 32-byte align destination.

        bf      31,1f                   // byte to move?
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
1:
        bf      30,2f                   // halfword?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      28,4f                   // doubleword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        addi    rs,rs,8
        stw     w1,0(rd)
        stw     w2,4(rd)
        addi    rd,rd,8
4:
        bf      27,LFwdAligned          // quadword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16


// Destination is 32-byte aligned.
// r0 = count of 64-byte chunks to move (not 0)
// rd = 32-byte aligned
// rc = bytes remaining
// cr5 = beq if source is 16-byte aligned
// We set up many registers:
// ctr = number of 64-byte chunks to move
// r0/cr0 = leftover QWs to move
// cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// cr6 = beq if leftover byte count is 0
// rv = original value of vrsave
// c16 etc = loaded
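// (Example of the count breakdown: if rc = 117 after the alignment adjustment,
//  then ctr = 1 (one 64-byte chunk), r0 = 3 leftover quadwords, and cr7/w3 hold
//  the 5 trailing bytes: 117 = 64 + 3*16 + 5.)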

LFwdAligned:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w1,rv,0xFF00            // we use v0-v7
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1               // update mask
        li      c16,16                  // get constants used in lvx/stvx
        li      c32,32
        li      c48,48
        li      c96,96
        li      c128,128
        bne     cr5,LForwardVecUnal     // handle unaligned operands
        b       1f

        .align  4
1:                                      // loop over 64-byte chunks
        dcbt    c96,rs
        dcbt    c128,rs
        lvx     v1,0,rs
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        addi    rs,rs,64
        dcba    0,rd                    // patched to NOP on some machines
        stvx    v1,0,rd
        stvx    v2,c16,rd
        dcba    c32,rd                  // patched to NOP on some machines
        stvx    v3,c32,rd
        stvx    v4,c48,rd
        addi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords (1-3)
        lvx     v1,0,rs
        addi    rs,rs,16
        stvx    v1,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Long, forward, unaligned vector loop.

LForwardVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
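// (lvsl builds a permute control from the low four bits of rs; each vperm in the
//  loop below then extracts one aligned 16-byte destination quadword from the two
//  adjacent source quadwords that the unaligned data straddles.)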
        b       1f

        .align  4                       // align inner loops
1:                                      // loop over 64-byte chunks
        lvx     v2,c16,rs
        dcbt    c96,rs
        lvx     v3,c32,rs
        dcbt    c128,rs
        lvx     v4,c48,rs
        addi    rs,rs,64
        vperm   vw,v1,v2,vp
        lvx     v1,0,rs
        vperm   vx,v2,v3,vp
        dcba    0,rd                    // patched to NOP on some machines
        stvx    vw,0,rd
        vperm   vy,v3,v4,vp
        stvx    vx,c16,rd
        vperm   vw,v4,v1,vp
        dcba    c32,rd                  // patched to NOP on some machines
        stvx    vy,c32,rd
        stvx    vw,c48,rd
        addi    rd,rd,64
        bdnz    1b

        beq-    4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords
        lvx     v2,c16,rs
        addi    rs,rs,16
        vperm   vx,v1,v2,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Medium and long, reverse moves. We use Altivec if the operands are long enough,
// else a lwz/stw loop.
// w1 = (rd-rs), used to check for reverse and alignment
// cr7 = bge if long

LMediumReverse:
        add     rd,rd,rc                // point to end of operands
        add     rs,rs,rc
        andi.   w4,rd,0x1F              // w4 <- #bytes to 32-byte align destination
        rlwinm  w6,rd,0,0x3             // w6 <- #bytes to 4-byte align destination
        bge     cr7,LLongReverse        // long enough for vectors

// Scalar loop.
// w6 = #bytes to 4-byte align destination

        sub     rc,rc,w6                // decrement length remaining
        mtxer   w6                      // set up count for move
        sub     rs,rs,w6                // back up ptrs
        sub     rd,rd,w6
        srwi    r0,rc,4                 // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc                 // set remaining byte count here for LShortReverse16
        lswx    w1,0,rs                 // move w6 bytes to align destination
        stswx   w1,0,rd
        mtctr   r0                      // set up 16-byte loop
        b       1f

        .align  4
1:                                      // loop over 16-byte aligned chunks
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)
        bdnz    1b

        b       LShortReverse16


// Reverse vector loops. First, we must 32-byte align the destination.
// w1 = (rd-rs), used to check for reverse and alignment
// w4/cr0 = #bytes to 32-byte align destination
// rc = long enough for at least one vector loop

LLongReverse:
        sub     rc,rc,w4                // adjust length
        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF             // relatively 16-byte aligned?
        mtcrf   0x02,w4                 // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6                 // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        beq     LReverseAligned         // dest is already aligned

// 32-byte align destination.

        bf      31,1f                   // byte to move?
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
1:
        bf      30,2f                   // halfword?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      28,4f                   // doubleword?
        lwz     w1,-4(rs)
        lwzu    w2,-8(rs)
        stw     w1,-4(rd)
        stwu    w2,-8(rd)
4:
        bf      27,LReverseAligned      // quadword?
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)

// Destination is 32-byte aligned.
// r0 = count of 64-byte chunks to move (not 0)
// rd = 32-byte aligned
// rc = bytes remaining
// cr5 = beq if source is 16-byte aligned
// We set up many registers:
// ctr = number of 64-byte chunks to move
// r0/cr0 = leftover QWs to move
// cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// cr6 = beq if leftover byte count is 0
// rv = original value of vrsave
// cm1 etc = loaded

LReverseAligned:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w1,rv,0xFF00            // we use v0-v7
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1               // update mask
        li      cm1,-1                  // get constants used in lvx/stvx
        li      cm17,-17
        li      cm33,-33
        li      cm49,-49
        li      cm97,-97
        li      cm129,-129
        bne     cr5,LReverseVecUnal     // handle unaligned operands
        b       1f

        .align  4                       // align inner loops
1:                                      // loop over 64-byte chunks
        dcbt    cm97,rs
        dcbt    cm129,rs
        lvx     v1,cm1,rs
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        stvx    v1,cm1,rd
        stvx    v2,cm17,rd
        stvx    v3,cm33,rd
        stvx    v4,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords (1-3)
        lvx     v1,cm1,rs
        subi    rs,rs,16
        stvx    v1,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr


// Long, reverse, unaligned vector loop.

LReverseVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
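// (The loads walk toward lower addresses here, so the look-ahead register v1
//  holds the higher-addressed quadword and appears as the second vperm operand,
//  eg "vperm vw,v2,v1,vp"; the lvsl control computed from the unaligned end
//  pointer still selects the correct 16 source bytes.)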
        b       1f

        .align  4                       // align the inner loops
1:                                      // loop over 64-byte chunks
        lvx     v2,cm17,rs
        dcbt    cm97,rs
        lvx     v3,cm33,rs
        dcbt    cm129,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        vperm   vw,v2,v1,vp
        lvx     v1,cm1,rs
        vperm   vx,v3,v2,vp
        stvx    vw,cm1,rd
        vperm   vy,v4,v3,vp
        stvx    vx,cm17,rd
        vperm   vw,v1,v4,vp
        stvx    vy,cm33,rd
        stvx    vw,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     3f                      // no leftover quadwords
        mtctr   r0
2:                                      // loop over 1-3 quadwords
        lvx     v2,cm17,rs
        subi    rs,rs,16
        vperm   vx,v2,v1,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,cm1,rd
        subi    rd,rd,16
        bdnz    2b
3:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr

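// The descriptor below registers this routine for the commpage; roughly, the
// fields are: routine, commpage address, capability bits the processor must have
// (kHasAltivec), bits it must not have (k64Bit, since a separate version serves
// 64-bit machines), and special-handling flags (kCommPageDCBA requests NOP-patching
// of the DCBAs where needed, kCommPage32 restricts it to the 32-bit commpage).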
        COMMPAGE_DESCRIPTOR(bcopy_g4,_COMM_PAGE_BCOPY,kHasAltivec,k64Bit,kCommPageDCBA+kCommPage32)