/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, for a hypothetical 64-bit processor without Altivec.
 * This version might be used when bringing up new processors with known
 * Altivec bugs that need to be worked around.  It is not particularly well
 * optimized.
 *
 * Register usage.  Note we use R2, so this code will not run in a PEF/CFM
 * environment.
 *      r0  = "w7" or temp
 *      r2  = "w8"
 *      r3  = not used, as memcpy and memmove return the 1st parameter as their value
 *      r4  = source ptr ("rs")
 *      r5  = count of bytes to move ("rc")
 *      r6  = "w1"
 *      r7  = "w2"
 *      r8  = "w3"
 *      r9  = "w4"
 *      r10 = "w5"
 *      r11 = "w6"
 *      r12 = destination ptr ("rd")
 */
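
// For orientation, a hedged C-level sketch of the overall strategy implemented
// below (the function and helper names are illustrative, not part of this file):
//
//      void *memmove_sketch(void *dst, const void *src, size_t len) {
//          if ((uintptr_t)dst - (uintptr_t)src < len)
//              copy_backward(dst, src, len);   // destructive overlap
//          else
//              copy_forward(dst, src, len);    // includes the short inline path
//          return dst;
//      }
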
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r2

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9
#define w5      r10
#define w6      r11
#define w7      r0
#define w8      r2

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
        .globl  EXT(bcopy_64)

#define kLong   64                      // too long for inline loopless code


// Main entry points.

        .align  5
bcopy_64:                               // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kLong                // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        mr      rd,r4                   // start to move registers to canonical spots
        mr      rs,r3
        blt     LShort                  // handle short operands
        dcbt    0,r3                    // touch in the first line of source
        b       LLong                   // join medium/long operand code

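// The reverse-move test above exploits unsigned wraparound: (rd-rs), compared
// as an unsigned word (cmplw), is below rc exactly when the destination starts
// inside the source operand (rs <= rd < rs+rc), the one overlap a forward copy
// would corrupt by overwriting source bytes before reading them.  Every other
// arrangement, including rd < rs, is safe to copy forward.
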
// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

        .align  5
Lmemcpy_g4:                             // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_g4:                            // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kLong                // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        dcbt    0,r4                    // touch in the first line of source
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        mr      rd,r3                   // must leave r3 alone, it is the return value for memcpy etc
        bge     LLong                   // handle medium or long operands

// Handle short operands.

LShort:
        mtcrf   0x02,rc                 // put length bits 26-27 in cr6 (faster to set one CR field at a time)
        mtcrf   0x01,rc                 // put length bits 28-31 in cr7
        blt     cr1,LShortReverse

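// The two mtcrf instructions above copy the low six bits of the length into
// CR6 and CR7, so each residual size class below costs a single "bf" (branch
// if CR bit false) test.  A hedged C analogue of the forward short path:
//
//      if (len & 32) { /* move 32 bytes */ }   // CR bit 26
//      if (len & 16) { /* move 16 bytes */ }   // CR bit 27
//      if (len & 8)  { /* move 8 bytes  */ }   // CR bit 28
//      if (len & 4)  { /* move 4 bytes  */ }   // CR bit 29
//      if (len & 2)  { /* move 2 bytes  */ }   // CR bit 30
//      if (len & 1)  { /* move 1 byte   */ }   // CR bit 31
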
// Forward short operands.  This is the most frequent case, so it is inline.

LShort64:                               // enter to xfer last 0..63 bytes
        bf      26,0f                   // 32-byte chunk to xfer?
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
0:
        bf      27,1f                   // quadword to move?
        ld      w1,0(rs)
        ld      w2,8(rs)
        addi    rs,rs,16
        std     w1,0(rd)
        std     w2,8(rd)
        addi    rd,rd,16
1:
        bf      28,2f                   // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f                   // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31                      // done if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr


// Handle short reverse operands.
//      cr6 = bits 26-27 of length
//      cr7 = bits 28-31 of length

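// A hedged C analogue of the short reverse copy (both pointers are advanced
// past the ends of the operands, then walked back down; shown byte-at-a-time
// for clarity, where the code below works in the same power-of-two chunks as
// the forward path):
//
//      src += len;  dst += len;
//      while (len--) *--dst = *--src;
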
LShortReverse:
        add     rs,rs,rc                // adjust ptrs for reverse move
        add     rd,rd,rc
LShortReverse64:                        // enter to xfer last 0..63 bytes
        bf      26,0f                   // 32-byte chunk to xfer?
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ldu     w4,-32(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        stdu    w4,-32(rd)
0:
        bf      27,1f                   // quadword to move?
        ld      w1,-8(rs)
        ldu     w2,-16(rs)
        std     w1,-8(rd)
        stdu    w2,-16(rd)
1:
        bf      28,2f                   // doubleword?
        ldu     w1,-8(rs)
        stdu    w1,-8(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f                   // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
        stb     w1,-1(rd)
        blr


// Long operands.
//      cr1 = blt iff we must move reverse

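// A hedged C sketch of the forward long path: byte-copy until the destination
// is 8-byte aligned (only the destination is aligned; any misaligned doubleword
// loads from the source are left to the hardware), move 64-byte chunks, then
// fall into the short code above for the leftovers.
//
//      size_t fixup = (-(uintptr_t)dst) & 7;   // bytes to 8-byte align dst
//      len -= fixup;
//      while (fixup--) *dst++ = *src++;
//      for (; len >= 64; len -= 64) { /* eight 8-byte loads, then stores */ }
//      /* 0..63 leftover bytes handled by LShort64 */
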
        .align  4
LLong:
        dcbtst  0,rd                    // touch in destination
        neg     w3,rd                   // start to compute #bytes to align destination
        andi.   w6,w3,7                 // w6 <- #bytes to 8-byte align destination
        blt     cr1,LLongReverse        // handle reverse moves
        mtctr   w6                      // set up for loop to align destination
        sub     rc,rc,w6                // adjust count
        beq     LAligned                // destination already 8-byte aligned
1:
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
        bdnz    1b

// Destination is 8-byte aligned.

LAligned:
        srwi.   w2,rc,6                 // w2 <- count of 64-byte chunks
        mtcrf   0x02,rc                 // put leftover count bits 26-27 in cr6 (faster to set one CR field at a time)
        mtcrf   0x01,rc                 // put leftover count bits 28-31 in cr7
        beq     LShort64                // no 64-byte chunks
        mtctr   w2
        b       1f

// Loop moving 64-byte chunks.
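// Within each chunk all eight loads are issued before any of the stores;
// presumably this is to cover load latency and avoid interleaving loads and
// stores to the same cache lines.
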
        .align  5
1:
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        ld      w5,32(rs)
        ld      w6,40(rs)
        ld      w7,48(rs)
        ld      w8,56(rs)
        addi    rs,rs,64
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        std     w5,32(rd)
        std     w6,40(rd)
        std     w7,48(rd)
        std     w8,56(rd)
        addi    rd,rd,64
        bdnz    1b

        b       LShort64


// Handle reverse moves.
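// A hedged C analogue of this path: adjust both pointers to one byte past the
// ends, byte-copy downward until the destination is 8-byte aligned, then move
// 64-byte chunks with update-form loads and stores.  Walking down, the bytes
// needed to align the pointer are simply its low three bits, with no negation
// as in the forward case.
//
//      src += len;  dst += len;
//      size_t fixup = (uintptr_t)dst & 7;      // bytes to align, walking down
//      len -= fixup;
//      while (fixup--) *--dst = *--src;
//      for (; len >= 64; len -= 64) { /* eight ldu/stdu-style 8-byte moves */ }
//      /* 0..63 leftovers handled by LShortReverse64 */
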
LLongReverse:
        add     rd,rd,rc                // point to end of operands
        add     rs,rs,rc
        andi.   r0,rd,7                 // r0 <- #bytes to 8-byte align destination
        sub     rc,rc,r0                // adjust count
        mtctr   r0                      // set up for byte loop
        beq     LRevAligned             // already aligned

1:
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
        bdnz    1b

// Destination is 8-byte aligned.

LRevAligned:
        srwi.   w2,rc,6                 // w2 <- count of 64-byte chunks
        mtcrf   0x02,rc                 // put leftover count bits 26-27 in cr6 (faster to set one CR field at a time)
        mtcrf   0x01,rc                 // put leftover count bits 28-31 in cr7
        beq     LShortReverse64         // no 64-byte chunks
        mtctr   w2
        b       1f

// Loop over 64-byte chunks (reverse).

        .align  5
1:
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ld      w4,-32(rs)
        ld      w5,-40(rs)
        ld      w6,-48(rs)
        ld      w7,-56(rs)
        ldu     w8,-64(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        std     w4,-32(rd)
        std     w5,-40(rd)
        std     w6,-48(rd)
        std     w7,-56(rd)
        stdu    w8,-64(rd)
        bdnz    1b

        b       LShortReverse64

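// The descriptor below registers this routine with the commpage populator:
// _COMM_PAGE_BCOPY is its fixed user-space address, and (as the header comment
// above suggests) k64Bit and kHasAltivec appear to be the capability bits the
// processor must have and must lack, respectively, for this non-Altivec
// variant to be installed.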
        COMMPAGE_DESCRIPTOR(bcopy_64,_COMM_PAGE_BCOPY,k64Bit,kHasAltivec,0)