/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, for a hypothetical 64-bit processor without Altivec.
 * This version might be used when bringing up new processors that have known
 * Altivec bugs needing to be worked around. It is not particularly well
 * optimized.
 *
 * For 64-bit processors with a 128-byte cache line, running in either
 * 32- or 64-bit mode. This is written for 32-bit execution; the kernel
 * will translate it to 64-bit code when it compiles the 64-bit commpage.
 *
 * Register usage. Note we use r2, so this code will not run in a PEF/CFM
 * environment.
 *   r0  = "w7" or temp
 *   r2  = "w8"
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1"
 *   r7  = "w2"
 *   r8  = "w3"
 *   r9  = "w4"
 *   r10 = "w5"
 *   r11 = "w6"
 *   r12 = destination ptr ("rd")
 */
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r2

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9
#define w5      r10
#define w6      r11
#define w7      r0
#define w8      r2

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text

#define kLong   64              // too long for inline loopless code

// Main entry points.

        .align  5
bcopy_64:                       // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kLong        // short or long?
        sub     w1,r4,r3        // must move in reverse if (rd-rs)<rc
        cmplw   cr1,w1,rc       // set cr1 blt iff we must move reverse
        mr      rd,r4           // start to move registers to canonical spots
        mr      rs,r3
        blt     LShort          // handle short operands
        dcbt    0,r3            // touch in first line of source
        b       LLong           // join medium/long operand code
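
// How the one-compare overlap test works: w1 = rd-rs is computed with
// unsigned wraparound, so a single unsigned compare (w1 < rc) is true
// exactly when the destination starts inside the source operand, ie when
// a forward copy would overwrite source bytes not yet read.  A minimal C
// sketch of the same dispatch (illustrative only, not part of this file):
//
//      #include <stdint.h>
//      #include <stddef.h>
//
//      static void copy_dispatch(void *dst, const void *src, size_t len) {
//          // unsigned difference wraps when dst < src, making one test safe
//          if ((uintptr_t)dst - (uintptr_t)src < (uintptr_t)len)
//              ; /* copy backward: dst overlaps src from above */
//          else
//              ; /* copy forward: no harmful overlap */
//      }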

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

        .align  5
Lmemcpy_g4:                     // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_g4:                    // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kLong        // short or long?
        sub     w1,r3,r4        // must move in reverse if (rd-rs)<rc
        dcbt    0,r4            // touch in the first line of source
        cmplw   cr1,w1,rc       // set cr1 blt iff we must move reverse
        mr      rd,r3           // must leave r3 alone, it is return value for memcpy etc
        bge     LLong           // handle medium or long operands
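
// Note that bcopy and memcpy/memmove compute the same rd-rs difference with
// the sub operands swapped, because bcopy takes (src,dst) while memcpy and
// memmove take (dst,src).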

// Handle short operands.

LShort:
        mtcrf   0x02,rc         // put length bits 26-27 in cr6 (setting one CR field at a time is faster)
        mtcrf   0x01,rc         // put length bits 28-31 in cr7
        blt     cr1,LShortReverse
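
// With a 32-bit length in rc, bit 26 (value 32), bit 27 (16), bit 28 (8),
// bit 29 (4), bit 30 (2), and bit 31 (1) fully describe any residual count
// below 64.  After the two mtcrf's above, the "bf 26,..." etc below test
// those bits directly, so the short path needs no compare instructions.
// Example: rc = 45 = 0b101101 moves 32+8+4+1 bytes, skipping the 16- and
// 2-byte steps.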

// Forward short operands. This is the most frequent case, so it is inline.

LShort64:                       // enter to xfer last 0..63 bytes
        bf      26,0f           // 32-byte chunk to xfer?
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
0:
        bf      27,1f           // quadword to move?
        ld      w1,0(rs)
        ld      w2,8(rs)
        addi    rs,rs,16
        std     w1,0(rd)
        std     w2,8(rd)
        addi    rd,rd,16
1:
        bf      28,2f           // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8
2:
        bf      29,3f           // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f           // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31              // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr


// Handle short reverse operands.
//      cr6 = bits 26-27 of length
//      cr7 = bits 28-31 of length

LShortReverse:
        add     rs,rs,rc        // adjust ptrs for reverse move
        add     rd,rd,rc
LShortReverse64:                // enter to xfer last 0..63 bytes
        bf      26,0f           // 32-byte chunk to xfer?
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ldu     w4,-32(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        stdu    w4,-32(rd)
0:
        bf      27,1f           // quadword to move?
        ld      w1,-8(rs)
        ldu     w2,-16(rs)
        std     w1,-8(rd)
        stdu    w2,-16(rd)
1:
        bf      28,2f           // doubleword?
        ldu     w1,-8(rs)
        stdu    w1,-8(rd)
2:
        bf      29,3f           // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f           // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31              // done if no odd byte
        lbz     w1,-1(rs)       // no update
        stb     w1,-1(rd)
        blr
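
// Note the load/store-with-update forms (ldu/stdu, lwzu/stwu, ...) above:
// each folds the pointer decrement into the access, so the reverse path
// needs no separate addi's.  The final odd byte deliberately uses the
// non-update lbz/stb, since rs and rd are dead after it.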


// Long operands.
//      cr1 = blt iff we must move reverse

        .align  4
LLong:
        dcbtst  0,rd            // touch in destination
        neg     w3,rd           // start to compute #bytes to align destination
        andi.   w6,w3,7         // w6 <- #bytes to 8-byte align destination
        blt     cr1,LLongReverse // handle reverse moves
        mtctr   w6              // set up for loop to align destination
        sub     rc,rc,w6        // adjust count
        beq     LAligned        // destination already 8-byte aligned
1:
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
        bdnz    1b
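
// A worked example of the alignment math: neg w3,rd computes 0-rd, whose
// low three bits are (8 - rd) mod 8.  If rd ends in ...5, w3 ends in ...3,
// so andi. w6,w3,7 yields 3: exactly the number of bytes the byte loop
// must copy before rd is 8-byte aligned.  The andi. also sets cr0, which
// the beq uses to skip the loop entirely when rd is already aligned.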

// Destination is 8-byte aligned.

LAligned:
        srwi.   w2,rc,6         // w2 <- count of 64-byte chunks
        mtcrf   0x02,rc         // put leftover length bits 26-27 in cr6 (one CR field at a time is faster)
        mtcrf   0x01,rc         // put length bits 28-31 in cr7
        beq     LShort64        // no 64-byte chunks
        mtctr   w2
        b       1f

// Loop moving 64-byte chunks.

        .align  5
1:
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        ld      w5,32(rs)
        ld      w6,40(rs)
        ld      w7,48(rs)
        ld      w8,56(rs)
        addi    rs,rs,64
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        std     w5,32(rd)
        std     w6,40(rd)
        std     w7,48(rd)
        std     w8,56(rd)
        addi    rd,rd,64
        bdnz    1b

        b       LShort64
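
// The chunk loop issues all eight loads before any of the stores.  On the
// in-order 64-bit parts this code targets, grouping the loads is a common
// idiom to keep them pipelined and to avoid stalling a load behind a
// just-issued store; the actual benefit depends on the (here hypothetical)
// processor's load/store queues.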


// Handle reverse moves.

LLongReverse:
        add     rd,rd,rc        // point to end of operands
        add     rs,rs,rc
        andi.   r0,rd,7         // r0 <- #bytes to 8-byte align destination
        sub     rc,rc,r0        // adjust count
        mtctr   r0              // set up for byte loop
        beq     LRevAligned     // already aligned

1:
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
        bdnz    1b

// Destination is 8-byte aligned.

LRevAligned:
        srwi.   w2,rc,6         // w2 <- count of 64-byte chunks
        mtcrf   0x02,rc         // put leftover length bits 26-27 in cr6 (one CR field at a time is faster)
        mtcrf   0x01,rc         // put length bits 28-31 in cr7
        beq     LShortReverse64 // no 64-byte chunks
        mtctr   w2
        b       1f

// Loop over 64-byte chunks (reverse).

        .align  5
1:
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ld      w4,-32(rs)
        ld      w5,-40(rs)
        ld      w6,-48(rs)
        ld      w7,-56(rs)
        ldu     w8,-64(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        std     w4,-32(rd)
        std     w5,-40(rd)
        std     w6,-48(rd)
        std     w7,-56(rd)
        stdu    w8,-64(rd)
        bdnz    1b

        b       LShortReverse64

        COMMPAGE_DESCRIPTOR(bcopy_64,_COMM_PAGE_BCOPY,k64Bit,kHasAltivec,kCommPageBoth+kPort32to64)
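
// Reading the descriptor (per the COMMPAGE_DESCRIPTOR convention used by
// the other commpage routines): this routine is installed at
// _COMM_PAGE_BCOPY only on processors whose capability bits include k64Bit
// and exclude kHasAltivec.  kCommPageBoth requests both the 32- and 64-bit
// commpages, and kPort32to64 asks the kernel to translate the 32-bit
// instructions to their 64-bit forms when building the 64-bit page,
// matching the note in the header comment.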