]> git.saurik.com Git - apple/xnu.git/blob - osfmk/ppc/commpage/bcopy_64.s
65ad189f9d53f718d63a0c204028998dd884dcf2
[apple/xnu.git] / osfmk / ppc / commpage / bcopy_64.s
1 /*
2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30 /* =======================================
31 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
32 * =======================================
33 *
34 * Version of 2/20/2003, for a hypothetical 64-bit processor without Altivec.
35 * This version might be used bringing up new processors, with known
36 * Altivec bugs that need to be worked around. It is not particularly well
37 * optimized.
38 *
39 * For 64-bit processors with a 128-byte cache line, running in either
40 * 32- or 64-bit mode. This is written for 32-bit execution, the kernel
41 * will translate to 64-bit code when it compiles the 64-bit commpage.
42 *
43 * Register usage. Note we use R2, so this code will not run in a PEF/CFM
44 * environment.
45 * r0 = "w7" or temp
46 * r2 = "w8"
47 * r3 = not used, as memcpy and memmove return 1st parameter as a value
48 * r4 = source ptr ("rs")
49 * r5 = count of bytes to move ("rc")
50 * r6 = "w1"
51 * r7 = "w2"
52 * r8 = "w3"
53 * r9 = "w4"
54 * r10 = "w5"
55 * r11 = "w6"
56 * r12 = destination ptr ("rd")
57 */
58 #define rs r4 /* source pointer */
59 #define rd r12 /* destination pointer */
60 #define rc r5 /* count of bytes still to move */
61 #define rv r2 /* NOTE(review): no use of rv is visible in this file, and it names the same register as w8 */
62
63 #define w1 r6 /* w1-w8: work registers that carry the data being moved */
64 #define w2 r7
65 #define w3 r8
66 #define w4 r9
67 #define w5 r10
68 #define w6 r11
69 #define w7 r0 /* r0 is also used under its own name as a temp in LLongReverse */
70 #define w8 r2
71
72 #define ASSEMBLER
73 #include <sys/appleapiopts.h>
74 #include <ppc/asm.h>
75 #include <machine/cpu_capabilities.h>
76 #include <machine/commpage.h>
77
78 .text
79
80 #define kLong 64 // operands of kLong bytes or more are too long for the inline loopless code (LShort) and go to LLong
81
82
83 // Main entry points. Each one sets up rs/rd/rc and cr1 (blt => reverse move), then goes to LShort or LLong.
84 // bcopy receives (src,dst,len) in r3,r4,r5, so both pointers have to be moved into rs/rd here.
85 .align 5
86 bcopy_64: // void bcopy(const void *src, void *dst, size_t len)
87 cmplwi rc,kLong // cr0 <- len vs kLong: short or long?
88 sub w1,r4,r3 // w1 <- dst - src; must move in reverse if (rd-rs)<rc
89 cmplw cr1,w1,rc // unsigned compare: set cr1 blt iff we must move reverse
90 mr rd,r4 // rd <- dst; must be copied out first, because rs is r4
91 mr rs,r3 // rs <- src
92 blt LShort // handle short operands (len < kLong, from cr0 above)
93 dcbt 0,r3 // touch in the first line of source (r3 is src for bcopy; LLong touches the destination)
94 b LLong // join medium/long operand code
95
96 // NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.
97 // bcopy() above is exactly 8 instructions, so do not add instructions to it. NOTE(review): assumes ".align 5" means a 32-byte boundary for this assembler — confirm.
98 .align 5
99 Lmemcpy_g4: // void* memcpy(void *dst, void *src, size_t len)
100 Lmemmove_g4: // void* memmove(void *dst, const void *src, size_t len) -- same entry point as memcpy
101 cmplwi rc,kLong // cr0 <- len vs kLong: short or long?
102 sub w1,r3,r4 // w1 <- dst - src; must move in reverse if (rd-rs)<rc
103 dcbt 0,r4 // touch in the first line of source (src arrives in r4, which is already rs)
104 cmplw cr1,w1,rc // unsigned compare: set cr1 blt iff we must move reverse
105 mr rd,r3 // must leave r3 alone, it is return value for memcpy etc
106 bge LLong // handle medium or long operands; otherwise fall through into LShort
107
108 // Handle short operands (len < kLong). memcpy/memmove fall into here; bcopy branches here.
109 // On entry: rs/rd/rc are set up and cr1 is blt iff we must move in reverse.
110 LShort:
111 mtcrf 0x02,rc // put length bits 26-27 in cr6 (faster one cr at a time)
112 mtcrf 0x01,rc // put length bits 28-31 in cr7
113 blt cr1,LShortReverse
114
115 // Forward short operands. This is the most frequent case, so it is inline.
116 // Each bit tested below selects one power-of-two piece of the length: 26=32 bytes, 27=16, 28=8, 29=4, 30=2, 31=1.
117 LShort64: // enter to xfer last 0-63 bytes (the long forward path branches here with cr6/cr7 already loaded)
118 bf 26,0f // 32-byte chunk to xfer?
119 ld w1,0(rs)
120 ld w2,8(rs)
121 ld w3,16(rs)
122 ld w4,24(rs)
123 addi rs,rs,32
124 std w1,0(rd)
125 std w2,8(rd)
126 std w3,16(rd)
127 std w4,24(rd)
128 addi rd,rd,32
129 0:
130 bf 27,1f // quadword to move?
131 ld w1,0(rs)
132 ld w2,8(rs)
133 addi rs,rs,16
134 std w1,0(rd)
135 std w2,8(rd)
136 addi rd,rd,16
137 1:
138 bf 28,2f // doubleword?
139 ld w1,0(rs)
140 addi rs,rs,8
141 std w1,0(rd)
142 addi rd,rd,8
143 2:
144 bf 29,3f // word?
145 lwz w1,0(rs)
146 addi rs,rs,4
147 stw w1,0(rd)
148 addi rd,rd,4
149 3:
150 bf 30,4f // halfword to move?
151 lhz w1,0(rs)
152 addi rs,rs,2
153 sth w1,0(rd)
154 addi rd,rd,2
155 4:
156 bflr 31 // return now if no odd byte
157 lbz w1,0(rs)
158 stb w1,0(rd)
159 blr
160
161
162 // Handle short reverse operands: copy from the end of the operands toward the start.
163 // cr6 = bits 26-27 of length
164 // cr7 = bits 28-31 of length
165 // Bit weights are the same as in the forward case: 26=32 bytes, 27=16, 28=8, 29=4, 30=2, 31=1.
166 LShortReverse:
167 add rs,rs,rc // adjust ptrs for reverse move
168 add rd,rd,rc // rs/rd now point just past the last byte of each operand
169 LShortReverse64: // enter to xfer last 0-63 bytes (the long reverse path branches here with cr6/cr7 already loaded)
170 bf 26,0f // 32-byte chunk to xfer?
171 ld w1,-8(rs)
172 ld w2,-16(rs)
173 ld w3,-24(rs)
174 ldu w4,-32(rs) // update form: also steps rs back by 32
175 std w1,-8(rd)
176 std w2,-16(rd)
177 std w3,-24(rd)
178 stdu w4,-32(rd) // update form: also steps rd back by 32
179 0:
180 bf 27,1f // quadword to move?
181 ld w1,-8(rs)
182 ldu w2,-16(rs)
183 std w1,-8(rd)
184 stdu w2,-16(rd)
185 1:
186 bf 28,2f // doubleword?
187 ldu w1,-8(rs)
188 stdu w1,-8(rd)
189 2:
190 bf 29,3f // word?
191 lwzu w1,-4(rs)
192 stwu w1,-4(rd)
193 3:
194 bf 30,4f // halfword to move?
195 lhzu w1,-2(rs)
196 sthu w1,-2(rd)
197 4:
198 bflr 31 // done if no odd byte
199 lbz w1,-1(rs) // no update
200 stb w1,-1(rd)
201 blr
202
203
204 // Long operands (len >= kLong). On entry rs/rd/rc are set up.
205 // cr1 = blt iff we must move reverse
206 // Forward plan: byte-copy until rd is 8-byte aligned, move 64-byte chunks, then finish the 0-63 byte tail in LShort64.
207 .align 4
208 LLong:
209 dcbtst 0,rd // touch in destination
210 neg w3,rd // start to compute #bytes to align destination
211 andi. w6,w3,7 // w6 <- #bytes to 8-byte align destination (also sets cr0 for the beq below)
212 blt cr1,LLongReverse // handle reverse moves
213 mtctr w6 // set up for loop to align destination
214 sub rc,rc,w6 // adjust count
215 beq LAligned // destination already 8-byte aligned (w6 == 0, from the andi. above)
216 1:
217 lbz w1,0(rs)
218 addi rs,rs,1
219 stb w1,0(rd)
220 addi rd,rd,1
221 bdnz 1b
222
223 // Destination is 8-byte aligned.
224
225 LAligned:
226 srwi. w2,rc,6 // w2 <- count of 64-byte chunks (also sets cr0 for the beq below)
227 mtcrf 0x02,rc // leftover byte count to cr (faster one cr at a time)
228 mtcrf 0x01,rc // put length bits 28-31 in cr7
229 beq LShort64 // no 64-byte chunks (tests cr0 from the srwi.; the mtcrf's above only write cr6 and cr7)
230 mtctr w2 // ctr <- number of 64-byte chunks
231 b 1f // branch over any padding inserted by the .align below
232
233 // Loop moving 64-byte chunks.
234
235 .align 5
236 1:
237 ld w1,0(rs)
238 ld w2,8(rs)
239 ld w3,16(rs)
240 ld w4,24(rs)
241 ld w5,32(rs)
242 ld w6,40(rs)
243 ld w7,48(rs)
244 ld w8,56(rs)
245 addi rs,rs,64
246 std w1,0(rd)
247 std w2,8(rd)
248 std w3,16(rd)
249 std w4,24(rd)
250 std w5,32(rd)
251 std w6,40(rd)
252 std w7,48(rd)
253 std w8,56(rd)
254 addi rd,rd,64
255 bdnz 1b
256
257 b LShort64 // finish the remaining 0-63 bytes using the length bits already in cr6/cr7
258
259
260 // Handle reverse moves. Entered from LLong with rs/rd still pointing at the start of the operands.
261 // Reverse plan: byte-copy backward until rd is 8-byte aligned, move 64-byte chunks, then finish the 0-63 byte tail in LShortReverse64.
262 LLongReverse:
263 add rd,rd,rc // point to end of operands
264 add rs,rs,rc
265 andi. r0,rd,7 // is destination 8-byte aligned? r0 <- #bytes to move to align it (also sets cr0 for the beq below)
266 sub rc,rc,r0 // adjust count
267 mtctr r0 // set up for byte loop
268 beq LRevAligned // already aligned
269
270 1:
271 lbzu w1,-1(rs)
272 stbu w1,-1(rd)
273 bdnz 1b
274
275 // Destination is 8-byte aligned.
276
277 LRevAligned:
278 srwi. w2,rc,6 // w2 <- count of 64-byte chunks (also sets cr0 for the beq below)
279 mtcrf 0x02,rc // leftover byte count to cr (faster one cr at a time)
280 mtcrf 0x01,rc // put length bits 28-31 in cr7
281 beq LShortReverse64 // no 64-byte chunks (tests cr0 from the srwi.; the mtcrf's above only write cr6 and cr7)
282 mtctr w2 // ctr <- number of 64-byte chunks
283 b 1f // branch over any padding inserted by the .align below
284
285 // Loop over 64-byte chunks (reverse).
286
287 .align 5
288 1:
289 ld w1,-8(rs)
290 ld w2,-16(rs)
291 ld w3,-24(rs)
292 ld w4,-32(rs)
293 ld w5,-40(rs)
294 ld w6,-48(rs)
295 ld w7,-56(rs)
296 ldu w8,-64(rs) // update form: also steps rs back by 64
297 std w1,-8(rd)
298 std w2,-16(rd)
299 std w3,-24(rd)
300 std w4,-32(rd)
301 std w5,-40(rd)
302 std w6,-48(rd)
303 std w7,-56(rd)
304 stdu w8,-64(rd) // update form: also steps rd back by 64
305 bdnz 1b
306
307 b LShortReverse64 // finish the remaining 0-63 bytes using the length bits already in cr6/cr7
308 // NOTE(review): COMMPAGE_DESCRIPTOR is defined in an included header, not here. Presumably k64Bit is the required capability and kHasAltivec the excluded one (matching the "64-bit processor without Altivec" header comment), and kPort32to64 requests the 32-to-64-bit translation that comment mentions — confirm against machine/commpage.h.
309 COMMPAGE_DESCRIPTOR(bcopy_64,_COMM_PAGE_BCOPY,k64Bit,kHasAltivec,kCommPageBoth+kPort32to64)