/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, for a hypothetical 64-bit processor without Altivec.
 * This version might be used when bringing up new processors with known
 * Altivec bugs that need to be worked around.  It is not particularly well
 * optimized.
 *
 * For 64-bit processors with a 128-byte cache line, running in either
 * 32- or 64-bit mode.  This is written for 32-bit execution; the kernel
 * will translate it to 64-bit code when it compiles the 64-bit commpage.
 *
 * Register usage.  Note we use R2, so this code will not run in a PEF/CFM
 * environment.
 *   r0  = "w7" or temp
 *   r2  = "w8"
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1"
 *   r7  = "w2"
 *   r8  = "w3"
 *   r9  = "w4"
 *   r10 = "w5"
 *   r11 = "w6"
 *   r12 = destination ptr ("rd")
 */
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r2

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9
#define w5      r10
#define w6      r11
#define w7      r0
#define w8      r2

#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text

#define kLong   64              // too long for inline loopless code


// Main entry points.
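// Note that bcopy() takes (src,dst,len) while memcpy() and memmove() take
// (dst,src,len); all three entry points normalize their arguments into
// rs/rd/rc.  The unsigned compare of (rd-rs) against the length detects the
// overlap case in which the destination starts inside the source operand;
// those moves are done in descending (reverse) order.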

        .align  5
bcopy_64:                       // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kLong        // short or long?
        sub     w1,r4,r3        // must move in reverse if (rd-rs)<rc
        cmplw   cr1,w1,rc       // set cr1 blt iff we must move reverse
        mr      rd,r4           // start to move registers to canonical spot
        mr      rs,r3
        blt     LShort          // handle short operands
        dcbt    0,r3            // touch in the first line of source
        b       LLong           // join medium/long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.
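// (The commpage exports these entry points at fixed user-mode addresses, so
// their relative placement is part of the user ABI.)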

        .align  5
Lmemcpy_g4:                     // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_g4:                    // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kLong        // short or long?
        sub     w1,r3,r4        // must move in reverse if (rd-rs)<rc
        dcbt    0,r4            // touch in the first line of source
        cmplw   cr1,w1,rc       // set cr1 blt iff we must move reverse
        mr      rd,r3           // must leave r3 alone, it is return value for memcpy etc
        bge     LLong           // handle medium or long operands

// Handle short operands.

LShort:
        mtcrf   0x02,rc         // put length bits 26-27 in cr6 (faster one cr at a time)
        mtcrf   0x01,rc         // put length bits 28-31 in cr7
        blt     cr1,LShortReverse

// Forward short operands.  This is the most frequent case, so it is inline.
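//
// The two mtcrf instructions above copied the low six bits of the length into
// cr6 and cr7, so each "bf" below tests one power-of-two residue (32, 16, 8,
// 4, 2, or 1 bytes) and moves it if the corresponding bit is set.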

LShort64:                       // enter to xfer last 64 bytes
        bf      26,0f           // 32-byte chunk to xfer?
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
0:
        bf      27,1f           // quadword to move?
        ld      w1,0(rs)
        ld      w2,8(rs)
        addi    rs,rs,16
        std     w1,0(rd)
        std     w2,8(rd)
        addi    rd,rd,16
1:
        bf      28,2f           // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8
2:
        bf      29,3f           // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f           // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31              // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr


// Handle short reverse operands.
// cr6 = bits 26-27 of length
// cr7 = bits 28-31 of length
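//
// The reverse paths first advance rs and rd to one byte past the end of each
// operand, then use the load/store-with-update forms (ldu, stdu, lwzu, etc.)
// so the pointers walk downward as the residues are moved.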

LShortReverse:
        add     rs,rs,rc        // adjust ptrs for reverse move
        add     rd,rd,rc
LShortReverse64:                // enter to xfer last 64 bytes
        bf      26,0f           // 32-byte chunk to xfer?
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ldu     w4,-32(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        stdu    w4,-32(rd)
0:
        bf      27,1f           // quadword to move?
        ld      w1,-8(rs)
        ldu     w2,-16(rs)
        std     w1,-8(rd)
        stdu    w2,-16(rd)
1:
        bf      28,2f           // doubleword?
        ldu     w1,-8(rs)
        stdu    w1,-8(rd)
2:
        bf      29,3f           // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f           // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31              // done if no odd byte
        lbz     w1,-1(rs)       // no update
        stb     w1,-1(rd)
        blr


// Long operands.
// cr1 = blt iff we must move reverse
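//
// Forward long moves copy 0-7 bytes one at a time to 8-byte align the
// destination, move 64-byte chunks with the unrolled doubleword loop below,
// and finally branch into LShort64 to handle the remaining 0-63 bytes.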

        .align  4
LLong:
        dcbtst  0,rd            // touch in destination
        neg     w3,rd           // start to compute #bytes to align destination
        andi.   w6,w3,7         // w6 <- #bytes to 8-byte align destination
        blt     cr1,LLongReverse // handle reverse moves
        mtctr   w6              // set up for loop to align destination
        sub     rc,rc,w6        // adjust count
        beq     LAligned        // destination already 8-byte aligned
1:
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
        bdnz    1b

// Destination is 8-byte aligned.

LAligned:
        srwi.   w2,rc,6         // w2 <- count of 64-byte chunks
        mtcrf   0x02,rc         // leftover byte count to cr (faster one cr at a time)
        mtcrf   0x01,rc         // put length bits 28-31 in cr7
        beq     LShort64        // no 64-byte chunks
        mtctr   w2
        b       1f

// Loop moving 64-byte chunks.

        .align  5
1:
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        ld      w5,32(rs)
        ld      w6,40(rs)
        ld      w7,48(rs)
        ld      w8,56(rs)
        addi    rs,rs,64
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        std     w5,32(rd)
        std     w6,40(rd)
        std     w7,48(rd)
        std     w8,56(rd)
        addi    rd,rd,64
        bdnz    1b

        b       LShort64


// Handle reverse moves.
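//
// The reverse long path mirrors the forward one: rs and rd are pointed just
// past the ends of the operands, 0-7 bytes are moved to 8-byte align the
// destination, 64-byte chunks are copied in descending order, and the
// remaining 0-63 bytes are handled by LShortReverse64.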

LLongReverse:
        add     rd,rd,rc        // point to end of operands
        add     rs,rs,rc
        andi.   r0,rd,7         // is destination 8-byte aligned?
        sub     rc,rc,r0        // adjust count
        mtctr   r0              // set up for byte loop
        beq     LRevAligned     // already aligned

1:
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
        bdnz    1b

// Destination is 8-byte aligned.

LRevAligned:
        srwi.   w2,rc,6         // w2 <- count of 64-byte chunks
        mtcrf   0x02,rc         // leftover byte count to cr (faster one cr at a time)
        mtcrf   0x01,rc         // put length bits 28-31 in cr7
        beq     LShortReverse64 // no 64-byte chunks
        mtctr   w2
        b       1f

// Loop over 64-byte chunks (reverse).

        .align  5
1:
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ld      w4,-32(rs)
        ld      w5,-40(rs)
        ld      w6,-48(rs)
        ld      w7,-56(rs)
        ldu     w8,-64(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        std     w4,-32(rd)
        std     w5,-40(rd)
        std     w6,-48(rd)
        std     w7,-56(rd)
        stdu    w8,-64(rd)
        bdnz    1b

        b       LShortReverse64

        COMMPAGE_DESCRIPTOR(bcopy_64,_COMM_PAGE_BCOPY,k64Bit,kHasAltivec,kCommPageBoth+kPort32to64)
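
// The descriptor above registers this routine for the _COMM_PAGE_BCOPY slot.
// Per the commpage descriptor conventions, k64Bit is a capability the
// processor must have and kHasAltivec one it must not have, so this version
// is chosen only for 64-bit processors whose Altivec is absent or avoided;
// kCommPageBoth+kPort32to64 places it in both the 32- and 64-bit commpages,
// with the 32-bit source translated for the latter (see header comment above).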