/*
 * Provenance: apple/xnu.git, osfmk/ppc/commpage/bcopy_64.s
 * (release xnu-792.22.5; recovered from the git.saurik.com blame view).
 */
1/*
2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* =======================================
29 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
30 * =======================================
31 *
32 * Version of 2/20/2003, for a hypothetic 64-bit processor without Altivec.
33 * This version might be used bringing up new processors, with known
34 * Altivec bugs that need to be worked around. It is not particularly well
35 * optimized.
36 *
37 * For 64-bit processors with a 128-byte cache line, running in either
38 * 32- or 64-bit mode. This is written for 32-bit execution, the kernel
39 * will translate to 64-bit code when it compiles the 64-bit commpage.
40 *
41 * Register usage. Note we use R2, so this code will not run in a PEF/CFM
42 * environment.
43 * r0 = "w7" or temp
44 * r2 = "w8"
45 * r3 = not used, as memcpy and memmove return 1st parameter as a value
46 * r4 = source ptr ("rs")
47 * r5 = count of bytes to move ("rc")
48 * r6 = "w1"
49 * r7 = "w2"
50 * r8 = "w3"
51 * r9 = "w4"
52 * r10 = "w5"
53 * r11 = "w6"
54 * r12 = destination ptr ("rd")
55 */
56#define rs r4
57#define rd r12
58#define rc r5
59#define rv r2
60
61#define w1 r6
62#define w2 r7
63#define w3 r8
64#define w4 r9
65#define w5 r10
66#define w6 r11
67#define w7 r0
68#define w8 r2
69
70#define ASSEMBLER
71#include <sys/appleapiopts.h>
72#include <ppc/asm.h>
73#include <machine/cpu_capabilities.h>
74#include <machine/commpage.h>
75
76 .text
77
78#define kLong 64 // too long for inline loopless code
79
80
81// Main entry points.
82
83 .align 5
84bcopy_64: // void bcopy(const void *src, void *dst, size_t len)
85 cmplwi rc,kLong // short or long?
86 sub w1,r4,r3 // must move in reverse if (rd-rs)<rc
87 cmplw cr1,w1,rc // set cr1 blt iff we must move reverse
88 mr rd,r4 // start to move registers to canonic spot
89 mr rs,r3
90 blt LShort // handle short operands
91 dcbt 0,r3 // touch in destination
92 b LLong // join medium/long operand code
93
94// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.
95
96 .align 5
97Lmemcpy_g4: // void* memcpy(void *dst, void *src, size_t len)
98Lmemmove_g4: // void* memmove(void *dst, const void *src, size_t len)
99 cmplwi rc,kLong // short or long?
100 sub w1,r3,r4 // must move in reverse if (rd-rs)<rc
101 dcbt 0,r4 // touch in the first line of source
102 cmplw cr1,w1,rc // set cr1 blt iff we must move reverse
103 mr rd,r3 // must leave r3 alone, it is return value for memcpy etc
104 bge LLong // handle medium or long operands
105
106// Handle short operands.
107
108LShort:
109 mtcrf 0x02,rc // put length bits 26-27 in cr6 (faster one cr at a time)
110 mtcrf 0x01,rc // put length bits 28-31 in cr7
111 blt cr1,LShortReverse
112
113// Forward short operands. This is the most frequent case, so it is inline.
114
115LShort64: // enter to xfer last 64 bytes
116 bf 26,0f // 64-byte chunk to xfer?
117 ld w1,0(rs)
118 ld w2,8(rs)
119 ld w3,16(rs)
120 ld w4,24(rs)
121 addi rs,rs,32
122 std w1,0(rd)
123 std w2,8(rd)
124 std w3,16(rd)
125 std w4,24(rd)
126 addi rd,rd,32
1270:
128 bf 27,1f // quadword to move?
129 ld w1,0(rs)
130 ld w2,8(rs)
131 addi rs,rs,16
132 std w1,0(rd)
133 std w2,8(rd)
134 addi rd,rd,16
1351:
136 bf 28,2f // doubleword?
137 ld w1,0(rs)
138 addi rs,rs,8
139 std w1,0(rd)
140 addi rd,rd,8
1412:
142 bf 29,3f // word?
143 lwz w1,0(rs)
144 addi rs,rs,4
145 stw w1,0(rd)
146 addi rd,rd,4
1473:
148 bf 30,4f // halfword to move?
149 lhz w1,0(rs)
150 addi rs,rs,2
151 sth w1,0(rd)
152 addi rd,rd,2
1534:
154 bflr 31 // skip if no odd byte
155 lbz w1,0(rs)
156 stb w1,0(rd)
157 blr
158
159
160// Handle short reverse operands.
161// cr6 = bits 26-27 of length
162// cr7 = bits 28-31 of length
163
164LShortReverse:
165 add rs,rs,rc // adjust ptrs for reverse move
166 add rd,rd,rc
167LShortReverse64: // enter to xfer last 64 bytes
168 bf 26,0f // 64-byte chunk to xfer?
169 ld w1,-8(rs)
170 ld w2,-16(rs)
171 ld w3,-24(rs)
172 ldu w4,-32(rs)
173 std w1,-8(rd)
174 std w2,-16(rd)
175 std w3,-24(rd)
176 stdu w4,-32(rd)
1770:
178 bf 27,1f // quadword to move?
179 ld w1,-8(rs)
180 ldu w2,-16(rs)
181 std w1,-8(rd)
182 stdu w2,-16(rd)
1831:
184 bf 28,2f // doubleword?
185 ldu w1,-8(rs)
186 stdu w1,-8(rd)
1872:
188 bf 29,3f // word?
189 lwzu w1,-4(rs)
190 stwu w1,-4(rd)
1913:
192 bf 30,4f // halfword to move?
193 lhzu w1,-2(rs)
194 sthu w1,-2(rd)
1954:
196 bflr 31 // done if no odd byte
197 lbz w1,-1(rs) // no update
198 stb w1,-1(rd)
199 blr
200
201
202// Long operands.
203// cr1 = blt iff we must move reverse
204
205 .align 4
206LLong:
207 dcbtst 0,rd // touch in destination
208 neg w3,rd // start to compute #bytes to align destination
209 andi. w6,w3,7 // w6 <- #bytes to 8-byte align destination
210 blt cr1,LLongReverse // handle reverse moves
211 mtctr w6 // set up for loop to align destination
212 sub rc,rc,w6 // adjust count
213 beq LAligned // destination already 8-byte aligned
2141:
215 lbz w1,0(rs)
216 addi rs,rs,1
217 stb w1,0(rd)
218 addi rd,rd,1
219 bdnz 1b
220
221// Destination is 8-byte aligned.
222
223LAligned:
224 srwi. w2,rc,6 // w2 <- count of 64-byte chunks
225 mtcrf 0x02,rc // leftover byte count to cr (faster one cr at a time)
226 mtcrf 0x01,rc // put length bits 28-31 in cr7
227 beq LShort64 // no 64-byte chunks
228 mtctr w2
229 b 1f
230
231// Loop moving 64-byte chunks.
232
233 .align 5
2341:
235 ld w1,0(rs)
236 ld w2,8(rs)
237 ld w3,16(rs)
238 ld w4,24(rs)
239 ld w5,32(rs)
240 ld w6,40(rs)
241 ld w7,48(rs)
242 ld w8,56(rs)
243 addi rs,rs,64
244 std w1,0(rd)
245 std w2,8(rd)
246 std w3,16(rd)
247 std w4,24(rd)
248 std w5,32(rd)
249 std w6,40(rd)
250 std w7,48(rd)
251 std w8,56(rd)
252 addi rd,rd,64
253 bdnz 1b
254
255 b LShort64
256
257
258// Handle reverse moves.
259
260LLongReverse:
261 add rd,rd,rc // point to end of operands
262 add rs,rs,rc
263 andi. r0,rd,7 // is destination 8-byte aligned?
264 sub rc,rc,r0 // adjust count
265 mtctr r0 // set up for byte loop
266 beq LRevAligned // already aligned
267
2681:
269 lbzu w1,-1(rs)
270 stbu w1,-1(rd)
271 bdnz 1b
272
273// Destination is 8-byte aligned.
274
275LRevAligned:
276 srwi. w2,rc,6 // w2 <- count of 64-byte chunks
277 mtcrf 0x02,rc // leftover byte count to cr (faster one cr at a time)
278 mtcrf 0x01,rc // put length bits 28-31 in cr7
279 beq LShortReverse64 // no 64-byte chunks
280 mtctr w2
281 b 1f
282
283// Loop over 64-byte chunks (reverse).
284
285 .align 5
2861:
287 ld w1,-8(rs)
288 ld w2,-16(rs)
289 ld w3,-24(rs)
290 ld w4,-32(rs)
291 ld w5,-40(rs)
292 ld w6,-48(rs)
293 ld w7,-56(rs)
294 ldu w8,-64(rs)
295 std w1,-8(rd)
296 std w2,-16(rd)
297 std w3,-24(rd)
298 std w4,-32(rd)
299 std w5,-40(rd)
300 std w6,-48(rd)
301 std w7,-56(rd)
302 stdu w8,-64(rd)
303 bdnz 1b
304
305 b LShortReverse64
306
307 COMMPAGE_DESCRIPTOR(bcopy_64,_COMM_PAGE_BCOPY,k64Bit,kHasAltivec,kCommPageBoth+kPort32to64)