git.saurik.com Git - apple/xnu.git/blob - osfmk/ppc/commpage/memset_g4.s
xnu-792.tar.gz
[apple/xnu.git] / osfmk / ppc / commpage / memset_g4.s
1 /*
2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22
23 #define ASSEMBLER
24 #include <sys/appleapiopts.h>
25 #include <ppc/asm.h>
26 #include <machine/cpu_capabilities.h>
27 #include <machine/commpage.h>
28
29 .text
30 .align 2
31
32
33 /* *********************
34 * * M E M S E T _ G 4 *
35 * *********************
36 *
37 * This is a subroutine called by Libc memset and memset_pattern for large nonzero
38 * operands (zero operands are funneled into bzero.) This version is for
39 * 32-bit processors with a 32-byte cache line and Altivec.
40 *
41 * Registers at entry:
42 * r4 = count of bytes to store (must be >= 32)
43 * r8 = ptr to the 1st byte to store (16-byte aligned)
44 * r9 = ptr to 16-byte pattern to store (16-byte aligned)
45 * When we return:
46 * r3 = not changed, since memset returns it
47 * r4 = bytes remaining to store (will be <32)
48 * r7 = not changed
49 * r8 = ptr to next byte to store (still 16-byte aligned)
50 * r12 = not changed (holds return value for memset)
51 */
52
53 #define kBig (3*64) // big enough to warrant using dcba (NB: must be >= 3*64)
54
55 .align 4
56 memset_g4:
57 cmplwi cr1,r4,kBig // big enough to warrant using dcbz?
58 mfspr r2,vrsave // we'll be using VRs
59 oris r0,r2,0x8000 // we use vr0
60 andi. r5,r8,0x10 // is ptr 32-byte aligned?
61 mtspr vrsave,r0
62 li r5,16 // get offsets for "stvx"
63 lvx v0,0,r9 // load the pattern into v0
64 li r6,32
65 blt cr1,LShort // not big enough to bother with dcba
66 li r9,48
67
68 // cache line align
69
70 beq 2f // already aligned
71 stvx v0,0,r8 // store another 16 bytes to align
72 addi r8,r8,16
73 subi r4,r4,16
74
75 // Set up for inner loop.
76 2:
77 srwi r0,r4,6 // get count of 64-byte chunks (>=2)
78 dcba 0,r8 // pre-allocate first cache line (possibly nop'd)
79 rlwinm r4,r4,0,0x3F // mask down to residual count (0..63)
80 subic r0,r0,1 // loop 1-too-few times
81 li r10,64 // get offsets to DCBA one chunk ahead
82 li r11,64+32
83 mtctr r0
84 dcba r6,r8 // zero 2nd cache line (possibly nop'd)
85 b 3f // enter DCBA loop
86
87 // Loop over 64-byte chunks. We DCBA one chunk ahead, which is a little faster.
88 // Note that some G4s do not benefit from the DCBAs. We nop them in that case.
89
90 .align 4
91 3:
92 dcba r10,r8 // zero one 64-byte chunk ahead (possibly nop'd)
93 dcba r11,r8
94 stvx v0,0,r8
95 stvx v0,r5,r8
96 stvx v0,r6,r8
97 stvx v0,r9,r8
98 addi r8,r8,64
99 bdnz+ 3b
100
101 // Last chunk, which we've already DCBAd.
102
103 stvx v0,0,r8
104 stvx v0,r5,r8
105 stvx v0,r6,r8
106 stvx v0,r9,r8
107 addi r8,r8,64
108
109 // loop over 32-byte chunks at end
110 LShort:
111 srwi. r0,r4,5 // get count of 32-byte chunks
112 rlwinm r4,r4,0,0x1F // mask down to residual count (0..31)
113 beq 7f // no chunks so done
114 mtctr r0
115 6:
116 stvx v0,0,r8
117 stvx v0,r5,r8
118 addi r8,r8,32
119 bdnz 6b
120 7:
121 mtspr vrsave,r2 // restore caller's vrsave
122 blr
123
124
125 COMMPAGE_DESCRIPTOR(memset_g4,_COMM_PAGE_MEMSET_PATTERN,kCache32+kHasAltivec,0, \
126 kCommPageDCBA+kCommPage32)