2 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this file.
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
21 * @APPLE_LICENSE_HEADER_END@
26 // Only built for armv6 and higher.
27 #if defined _ARM_ARCH_6
29 // If we're building for armv7, and not for DYLD, then we have a symbol
30 // resolver so we need to rename these implementations.
31 #if defined _ARM_ARCH_7 && !defined VARIANT_DYLD
32 #define _memset_pattern4 _memset_pattern4$VARIANT$Generic
33 #define _memset_pattern8 _memset_pattern8$VARIANT$Generic
34 #define _memset_pattern16 _memset_pattern16$VARIANT$Generic
37 #include <mach/machine/asm.h>
40 * This file contains the following functions:
42 * void memset_pattern4(void *b, const void *c4, size_t len)
43 * void memset_pattern8(void *b, const void *c8, size_t len)
44 * void memset_pattern16(void *b, const void *c16, size_t len)
46 * The memset() is implemented in the bzero.s file.
48 * This is a reasonably well optimized version of memset_pattern* routines
49 * implemented for the ARM9 and ARM11 processors using the ARMv6 instruction
50 * set. These routines use the ARM's core registers.
52 * The algorithm is to align the destination pointer on a 16 byte boundary
53 * and then blast data 64 bytes at a time, in two stores of 32 bytes per loop.
60 /*----------------------------------------------------------------------------*/
61 /* void memset_pattern4(void *ptr, const void *pattern4, size_t len); */
63 /* r0 << destination pointer */
64 /* r1 << pointer to 4-byte pattern */
65 /* r2 << 'len' (length of destination buffer in bytes) */
66 /*----------------------------------------------------------------------------*/
67 .globl _memset_pattern4
/* NOTE(review): this excerpt is a truncated listing -- the entry label      */
/* (_memset_pattern4:) and a number of instructions/local labels that the    */
/* visible branches reference (the L_Unaligned target, the short-length      */
/* byte loop, L_NotShort, the 32/64-byte store loop heads, and               */
/* L_AlignedLessThan64) are not visible here.  Comments describe only the    */
/* instructions that are shown; code is unchanged.                           */
/* Contract (per the header above): fill r0..r0+len with the 4-byte pattern  */
/* at *r1; r0 is returned unmodified to the caller.                          */
69 cmp r2, #0 /* check if len is zero */
70 bxeq lr /* return if length is zero */
72 /* We need some registers, so save volatiles on stack */
73 /* Avoid r7 (frame pointer) and r9 (thread register) */
74 stmfd sp!, {r4-r7, lr}
75 add r7, sp, #12 /* establish frame */
76 stmfd sp!, {r8, r10-r11}
78 /* copy destination base pointer r0 to r12 and leave r0 alone */
79 /* so that we return original pointer back to the caller */
82 /* Check if 'len' is long enough to bother alignment of destination */
84 cmp r2, #32 /* long enough to bother aligning? */
85 movlt r3, #4 /* move pattern length into r3 */
86 movlt r10, #4 /* pattern index */
87 movlt r11, r1 /* move pattern pointer into r11 */
90 /* move 'len' into r1, get 4-byte pattern in r2 */
91 mov r6, r2 /* temporarily move 'len' in to r6 */
92 ldr r2, [r1]/* load 4-byte pattern into r2 */
93 mov r1, r6 /* move 'len' from r6 to r1 */
95 mov r3, r2 /* copy 4-byte pattern into r3, r4 and r5 registers */
101 /* Check for 16 or 32 byte aligned destination pointer */
102 tst r12, #0x1F /* check for 32 byte aligned */
104 tst r12, #0xF /* check for 16 byte aligned */
106 b L_Unaligned /* yes */
/* NOTE(review): lines between the branch above and the compare below are    */
/* missing from this excerpt (presumably the short-length store loop).       */
117 cmp r2, #0 /* more bytes left? */
119 ldm sp!, {r8, r10-r11} /* restores registers from stack */
120 ldm sp!, {r4-r7, pc} /* restore & return from subroutine */
122 /* 'len' is long enough to justify aligning the destination pointer */
124 /* By the time we reach here, data is stored in registers as follows: */
125 /* r1 << 'len' (length of destination buffer in bytes) */
126 /* r2-r5 << pattern; either 4x4byte OR 2x8byte OR 1x16-byte */
127 /* r12 << destination pointer copy (scratch register) */
128 /* r0 << destination pointer original */
130 /* Use r11 as scratch register to store the #bytes offset to 16-byte align */
132 /* Unaligned on 32-byte boundary, store 1-15 bytes until 16-byte aligned */
133 /* As we store these bytes, we rotate the pattern stored in r2-r5 to reflect */
/* Flag-dispatch trick: destination-address bits [3:0] are shifted into the  */
/* CPSR condition flags (N=bit3, Z=bit2, C=bit1, V=bit0).  Each conditional  */
/* store below then executes only for the misalignment bit it handles, so    */
/* all 1/2/4/8-byte alignment fix-ups run branch-free.                       */
137 mov r11, r12, lsl #28
139 msr cpsr_f, r11 /* Bits[31:28] of cpsr now contain #bytes to align*/
141 L_Store15BytesAndRotatePattern:
142 strbvs r2, [r12], #1 /* v is set, unaligned in the 1s column */
143 andvs r6, r2, #0xFF /* Rotate pattern right in r2-r5 by 1-byte */
144 andvs r8, r3, #0xFF /* Consider register r2-r5 and a contiguous */
145 andvs r10, r4, #0xFF /* 16-byte register with r2 containing LSB */
146 andvs r11, r5, #0xFF /* and r5 containing MSB */
/* NOTE(review): the shifts pairing these orrs (the lsr #8 halves of the     */
/* 1-byte rotate) are not visible in this excerpt.                           */
151 orrvs r2, r2, r8, lsl #24
152 orrvs r3, r3, r10, lsl #24
153 orrvs r4, r4, r11, lsl #24
154 orrvs r5, r5, r6, lsl #24
156 strhcs r2, [r12], #2 /* c is set, unaligned in the 2s column */
157 movcs r6, r2, lsl #16 /* Rotate pattern right in r2-r5 by 2-bytes */
158 movcs r8, r3, lsl #16
159 movcs r10, r4, lsl #16
160 movcs r11, r5, lsl #16
170 streq r2, [r12], #4 /* z is set, unaligned in the 4s column */
171 moveq r6, r2 /* Rotate pattern right in r2-r5 by 4-bytes */
177 stmmi r12!, {r2-r3} /* n is set, unaligned in the 8s column */
178 movmi r6, r2 /* Rotate pattern right in r2-r5 by 8-bytes */
/* Deduct the alignment bytes just stored from 'len': NZCV (cpsr bits        */
/* [31:28]) still hold destination-address bits [3:0], i.e. exactly the      */
/* byte count consumed by the conditional stores above.                      */
185 mrs r11, cpsr /*copy cpsr in to r11 */
186 subs r1, r1, r11, lsr #28
187 ldmeq sp!, {r8, r10-r11} /* restores registers from stack */
188 ldmeq sp!, {r4-r7, pc} /* restore & return from subroutine */
190 /* By the time we reach here, we are 16-byte aligned and r2-r5 contains */
191 /* rotated pattern. Now lets make sure we are 32-byte aligned. */
197 /* By the time we reach here, data is stored in registers as follows: */
198 /* r1 << 'len' (remaining length of destination buffer in bytes) */
199 /* r2-r5 << rotated pattern; either 4x4byte OR 2x8byte OR 1x16-byte */
200 /* r12 << aligned destination pointer copy (scratch register) */
/* NOTE(review): the compare setting flags for this blt is not visible.      */
203 blt L_AlignedLessThan64
205 /* Copy pattern in four more registers so that we can do 64 byte transfers */
211 /* At this point, we are 16-byte aligned and 'len' is greater than 64 bytes */
212 /* Lets transfer 64 bytes at a time until len becomes less than 64 bytes */
213 sub r1, r1, #64 /* pre-subtract to avoid extra compare in loop */
215 stm r12!, {r2-r6, r8, r10-r11}
217 stm r12!, {r2-r6, r8, r10-r11}
220 /* return if 'len' is zero */
221 adds r1, r1, #64 /* readjust length; previously subtracted extra 64*/
222 ldmeq sp!, {r8, r10-r11} /* restores registers from stack */
223 ldmeq sp!, {r4-r7, pc} /* restore & return from subroutine */
226 /* do we have 16 or more bytes left */
230 bgt L_AlignedLessThan64
231 ldmeq sp!, {r8, r10-r11} /* restores registers from stack */
232 ldmeq sp!, {r4-r7, pc} /* restore & return from subroutine */
235 /* store last up-to 15 bytes */
236 /* move the remaining len bits [3:0] to the flags area of cpsr */
/* Same CPSR-flag dispatch as the alignment prologue, now keyed on the       */
/* remaining length: N stores 8 bytes, Z stores 4, C stores 2, V stores 1.   */
/* (The msr that loads the flags is not visible in this excerpt.)            */
240 stmmi r12!, {r2-r3} /* n is set, store 8 bytes */
241 movmi r2, r4 /* shift vector down 8 bytes */
244 streq r2, [r12], #4 /* z is set, store 4 bytes */
245 moveq r2, r3 /* shift vector down 4 bytes */
247 strhcs r2, [r12], #2 /* c is set, store 2 bytes */
248 lsrcs r2, #16 /* shift register right 2 bytes */
250 strbvs r2, [r12], #1 /* v is set, store 1 byte */
251 ldm sp!, {r8, r10-r11} /* restores registers from stack */
252 ldm sp!, {r4-r7, pc} /* restore & return from subroutine */
254 /*----------------------------------------------------------------------------*/
255 /* void memset_pattern8(void *ptr, const void *pattern8, size_t len); */
257 /* r0 << destination pointer */
258 /* r1 << pointer to 8-byte pattern */
259 /* r2 << 'len' (length of destination buffer in bytes) */
260 /*----------------------------------------------------------------------------*/
261 .globl _memset_pattern8
/* NOTE(review): truncated listing -- the entry label (_memset_pattern8:),   */
/* the second pattern-word load (into r3), and one replication mov are not   */
/* visible here.  Visible code: same zero-length fast exit and prologue as   */
/* memset_pattern4, then it loads the 8-byte pattern and branches into the   */
/* shared L_NotShort path defined in the memset_pattern4 body above.         */
263 cmp r2, #0 /* check if len is zero */
264 bxeq lr /* return if length is zero */
266 /* We need some registers, so save volatiles on stack */
267 /* Avoid r7 (frame pointer) and r9 (thread register) */
268 stmfd sp!, {r4-r7, lr}
269 add r7, sp, #12 /* establish frame */
270 stmfd sp!, {r8, r10-r11}
272 /* copy destination base pointer r0 to r12 and leave r0 alone */
273 /* so that we return original pointer back to the caller */
276 /* Check if 'len' is long enough to bother alignment of destination */
278 cmp r2, #32 /* long enough to bother aligning? */
279 movlt r3, #8 /* move pattern length into r3 */
280 movlt r10, #8 /* pattern index */
281 movlt r11, r1 /* move pattern pointer into r11 */
284 /* move 'len' into r1, get 8-byte pattern in r2-r3 */
285 mov r6, r2 /* temporarily move 'len' in to r6 */
286 ldr r2, [r1], #4 /* load 8-byte pattern into r2-r3 */
288 mov r1, r6 /* move 'len' from r6 to r1 */
290 mov r4, r2 /* copy 8-byte pattern into r4-r5 registers */
292 b L_NotShort /* yes */
295 /*----------------------------------------------------------------------------*/
296 /* void memset_pattern16(void *ptr, const void *pattern16, size_t len); */
298 /* r0 << destination pointer */
299 /* r1 << pointer to 16-byte pattern */
300 /* r2 << 'len' (length of destination buffer in bytes) */
301 /*----------------------------------------------------------------------------*/
302 .globl _memset_pattern16
/* NOTE(review): truncated listing -- the entry label (_memset_pattern16:)   */
/* and the loads of the remaining pattern words (r3-r5) are not visible      */
/* here.  Visible code: same zero-length fast exit and prologue as the       */
/* other variants, then it loads the 16-byte pattern into r2-r5 and          */
/* branches into the shared L_NotShort path in the memset_pattern4 body.     */
304 cmp r2, #0 /* check if len is zero */
305 bxeq lr /* return if length is zero */
307 /* We need some registers, so save volatiles on stack */
308 /* Avoid r7 (frame pointer) and r9 (thread register) */
309 stmfd sp!, {r4-r7, lr}
310 add r7, sp, #12 /* establish frame */
311 stmfd sp!, {r8, r10-r11}
313 /* copy destination base pointer r0 to r12 and leave r0 alone */
314 /* so that we return original pointer back to the caller */
317 /* Check if 'len' is long enough to bother alignment of destination */
319 cmp r2, #32 /* long enough to bother aligning? */
320 movlt r3, #16 /* move pattern length into r3 */
321 movlt r10, #16 /* pattern index */
322 movlt r11, r1 /* move pattern pointer into r11 */
325 /* move 'len' into r1, get 16-byte pattern in r2-r5 */
326 mov r6, r2 /* temporarily move 'len' in to r6 */
327 ldr r2, [r1], #4 /* load 16-byte pattern into r2-r5 */
331 mov r1, r6 /* move 'len' from r6 to r1 */
333 b L_NotShort /* yes */
336 #endif /* _ARM_ARCH_6 */