/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, for a hypothetical 64-bit processor without Altivec.
 * This version might be used when bringing up new processors with known
 * Altivec bugs that need to be worked around. It is not particularly well
 * optimized.
 *
 * For 64-bit processors with a 128-byte cache line, running in either
 * 32- or 64-bit mode. This is written for 32-bit execution; the kernel
 * will translate it to 64-bit code when it compiles the 64-bit commpage.
 *
 * Register usage. Note we use R2, so this code will not run in a PEF/CFM
 * environment.
 *   r0  = "w7" or temp
 *   r2  = "w8"
 *   r3  = not used, as memcpy and memmove return the 1st parameter as their value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1"
 *   r7  = "w2"
 *   r8  = "w3"
 *   r9  = "w4"
 *   r10 = "w5"
 *   r11 = "w6"
 *   r12 = destination ptr ("rd")
 */
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r2

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9
#define w5      r10
#define w6      r11
#define w7      r0
#define w8      r2

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text

#define kLong   64                  // too long for inline loopless code


// Main entry points.

        .align  5
bcopy_64:                           // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kLong            // short or long?
        sub     w1,r4,r3            // must move in reverse if (rd-rs)<rc
        cmplw   cr1,w1,rc           // set cr1 blt iff we must move reverse
        mr      rd,r4               // start to move registers to canonical spot
        mr      rs,r3
        blt     LShort              // handle short operands
        dcbt    0,r3                // touch in first line of source
        b       LLong               // join medium/long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.
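// (The bcopy_64 code above is exactly 8 instructions, ie 32 bytes, and the
// ".align 5" below pads to the next 32-byte boundary, satisfying that layout.)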

        .align  5
Lmemcpy_g4:                         // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_g4:                        // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kLong            // short or long?
        sub     w1,r3,r4            // must move in reverse if (rd-rs)<rc
        dcbt    0,r4                // touch in the first line of source
        cmplw   cr1,w1,rc           // set cr1 blt iff we must move reverse
        mr      rd,r3               // must leave r3 alone; it is the return value for memcpy etc
        bge     LLong               // handle medium or long operands
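
// The "sub/cmplw cr1" pair above is the overlap test: the unsigned compare
// (dst - src) < len is true exactly when dst lies inside [src, src+len),
// ie when a forward copy would overwrite source bytes it has not yet read.
// A minimal C sketch of the same decision (illustrative only, not part of
// this file; copy_forward() and copy_backward() are hypothetical helpers):
//
//      if ((uintptr_t)dst - (uintptr_t)src < len)
//          copy_backward(dst, src, len);       // copy descending
//      else
//          copy_forward(dst, src, len);        // copy ascending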

// Handle short operands.

LShort:
        mtcrf   0x02,rc             // put length bits 26-27 in cr6 (faster one cr at a time)
        mtcrf   0x01,rc             // put length bits 28-31 in cr7
        blt     cr1,LShortReverse
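
// Since rc is now known to be less than kLong (64), its bits 26-31 give the
// length: CR bit 26 = 32 bytes, 27 = 16, 28 = 8, 29 = 4, 30 = 2, and 31 = 1.
// Each "bf" (branch-if-false) below skips its power-of-two-sized move when
// the corresponding bit is clear.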

// Forward short operands. This is the most frequent case, so it is inline.

LShort64:                           // enter to xfer last 64 bytes
        bf      26,0f               // 32-byte chunk to xfer?
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
0:
        bf      27,1f               // quadword to move?
        ld      w1,0(rs)
        ld      w2,8(rs)
        addi    rs,rs,16
        std     w1,0(rd)
        std     w2,8(rd)
        addi    rd,rd,16
1:
        bf      28,2f               // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8
2:
        bf      29,3f               // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f               // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31                  // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr


// Handle short reverse operands.
//      cr6 = bits 26-27 of length
//      cr7 = bits 28-31 of length

LShortReverse:
        add     rs,rs,rc            // adjust ptrs for reverse move
        add     rd,rd,rc
LShortReverse64:                    // enter to xfer last 64 bytes
        bf      26,0f               // 32-byte chunk to xfer?
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ldu     w4,-32(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        stdu    w4,-32(rd)
0:
        bf      27,1f               // quadword to move?
        ld      w1,-8(rs)
        ldu     w2,-16(rs)
        std     w1,-8(rd)
        stdu    w2,-16(rd)
1:
        bf      28,2f               // doubleword?
        ldu     w1,-8(rs)
        stdu    w1,-8(rd)
2:
        bf      29,3f               // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f               // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31                  // done if no odd byte
        lbz     w1,-1(rs)           // no update
        stb     w1,-1(rd)
        blr


// Long operands.
//      cr1 = blt iff we must move reverse

        .align  4
LLong:
        dcbtst  0,rd                // touch in destination
        neg     w3,rd               // start to compute #bytes to align destination
        andi.   w6,w3,7             // w6 <- #bytes to 8-byte align destination
        blt     cr1,LLongReverse    // handle reverse moves
        mtctr   w6                  // set up for loop to align destination
        sub     rc,rc,w6            // adjust count
        beq     LAligned            // destination already 8-byte aligned
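// Example: if rd ends in binary ...101, then w3 = -rd ends in ...011, so
// w6 = 3 and the byte loop below moves 3 bytes to reach an 8-byte boundary.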
1:
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
        bdnz    1b

// Destination is 8-byte aligned.

LAligned:
        srwi.   w2,rc,6             // w2 <- count of 64-byte chunks
        mtcrf   0x02,rc             // leftover byte count to cr (faster one cr at a time)
        mtcrf   0x01,rc             // put length bits 28-31 in cr7
        beq     LShort64            // no 64-byte chunks
        mtctr   w2
        b       1f

// Loop moving 64-byte chunks.

        .align  5
1:
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        ld      w5,32(rs)
        ld      w6,40(rs)
        ld      w7,48(rs)
        ld      w8,56(rs)
        addi    rs,rs,64
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        std     w5,32(rd)
        std     w6,40(rd)
        std     w7,48(rd)
        std     w8,56(rd)
        addi    rd,rd,64
        bdnz    1b

        b       LShort64


// Handle reverse moves.

LLongReverse:
        add     rd,rd,rc            // point to end of operands
        add     rs,rs,rc
        andi.   r0,rd,7             // is destination 8-byte aligned?
        sub     rc,rc,r0            // adjust count
        mtctr   r0                  // set up for byte loop
        beq     LRevAligned         // already aligned
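// Moving downward from the end, the distance to the previous 8-byte
// boundary is (rd & 7) itself, not (-rd & 7) as in the forward case.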

1:
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
        bdnz    1b

// Destination is 8-byte aligned.

LRevAligned:
        srwi.   w2,rc,6             // w2 <- count of 64-byte chunks
        mtcrf   0x02,rc             // leftover byte count to cr (faster one cr at a time)
        mtcrf   0x01,rc             // put length bits 28-31 in cr7
        beq     LShortReverse64     // no 64-byte chunks
        mtctr   w2
        b       1f

// Loop over 64-byte chunks (reverse).

        .align  5
1:
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ld      w4,-32(rs)
        ld      w5,-40(rs)
        ld      w6,-48(rs)
        ld      w7,-56(rs)
        ldu     w8,-64(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        std     w4,-32(rd)
        std     w5,-40(rd)
        std     w6,-48(rd)
        std     w7,-56(rd)
        stdu    w8,-64(rd)
        bdnz    1b

        b       LShortReverse64

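// Commpage descriptor: this routine requires k64Bit, is excluded when
// kHasAltivec is present, is installed in both the 32- and 64-bit comm
// pages, and is ported from 32-bit to 64-bit code (kPort32to64).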
        COMMPAGE_DESCRIPTOR(bcopy_64,_COMM_PAGE_BCOPY,k64Bit,kHasAltivec,kCommPageBoth+kPort32to64)