/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <arm/arch.h>

// Only built for armv6 and higher.
#if defined _ARM_ARCH_6

// If we're building for armv7, and not for DYLD, then we have a symbol
// resolver, so we need to rename these implementations.
#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD
#define _memset_pattern4 _memset_pattern4$VARIANT$Generic
#define _memset_pattern8 _memset_pattern8$VARIANT$Generic
#define _memset_pattern16 _memset_pattern16$VARIANT$Generic
#endif

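// Explanatory note (not in the original source): with a resolver build,
// callers of the unsuffixed _memset_pattern* symbols are bound at load time
// to one of several $VARIANT$ implementations; the "Generic" names defined
// above mark this file's routines as the portable fallback. The exact
// variant set is defined elsewhere in Libc.
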
#include <mach/machine/asm.h>

/*
 * This file contains the following functions:
 *
 * void memset_pattern4(void *b, const void *c4, size_t len)
 * void memset_pattern8(void *b, const void *c8, size_t len)
 * void memset_pattern16(void *b, const void *c16, size_t len)
 *
 * memset() itself is implemented in bzero.s.
 *
 * These are reasonably well-optimized versions of the memset_pattern*
 * routines, implemented for the ARM9 and ARM11 processors using the ARMv6
 * instruction set. They use only the ARM core registers.
 *
 * The algorithm aligns the destination pointer on a 16-byte boundary and
 * then blasts data 64 bytes at a time, in two 32-byte stores per loop.
 */
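
/*
 * For reference, a rough C sketch of the semantics these routines implement
 * (illustrative only, not the code assembled below):
 *
 *	void memset_pattern4(void *b, const void *c4, size_t len) {
 *		char *dst = (char *)b;
 *		// 'len' is in bytes and need not be a multiple of 4;
 *		// the pattern is truncated at the end of the buffer.
 *		for (size_t i = 0; i < len; ++i)
 *			dst[i] = ((const char *)c4)[i % 4];
 *	}
 *
 * memset_pattern8() and memset_pattern16() behave the same way with 8- and
 * 16-byte patterns.
 */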
	.text
	.align 2
	.syntax unified

/*----------------------------------------------------------------------------*/
/* void memset_pattern4(void *ptr, const void *pattern4, size_t len); */
/* */
/* r0 << destination pointer */
/* r1 << pointer to 4-byte pattern */
/* r2 << 'len' (length of destination buffer in bytes) */
/*----------------------------------------------------------------------------*/
	.globl _memset_pattern4
_memset_pattern4:
	cmp	r2, #0			/* check if len is zero */
	bxeq	lr			/* return if length is zero */

	/* We need some registers, so save volatiles on the stack. */
	/* Avoid r7 (frame pointer) and r9 (thread register). */
	stmfd	sp!, {r4-r7, lr}
	add	r7, sp, #12		/* establish frame */
	stmfd	sp!, {r8, r10-r11}

	/* Copy the destination base pointer r0 to r12 and leave r0 alone */
	/* so that we return the original pointer back to the caller. */
	mov	r12, r0

	/* Check if 'len' is long enough to bother aligning the destination */
	/* pointer. */
	cmp	r2, #32			/* long enough to bother aligning? */
	movlt	r3, #4			/* move pattern length into r3 */
	movlt	r10, #4			/* pattern index */
	movlt	r11, r1			/* move pattern pointer into r11 */
	blt	L_Short			/* no */

	/* Move 'len' into r1, get the 4-byte pattern in r2. */
	mov	r6, r2			/* temporarily move 'len' into r6 */
	ldr	r2, [r1]		/* load 4-byte pattern into r2 */
	mov	r1, r6			/* move 'len' from r6 to r1 */

	mov	r3, r2			/* copy 4-byte pattern into r3, r4 and r5 */
	mov	r4, r2
	mov	r5, r2
L_NotShort:

	/* Check for a 16- or 32-byte aligned destination pointer. */
	tst	r12, #0x1F		/* check for 32-byte aligned */
	beq	L_Aligned
	tst	r12, #0xF		/* check for 16-byte aligned */
	beq	L_16ByteAligned
	b	L_Unaligned		/* not aligned; go align it */

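/* Bytewise copy loop for short buffers. At this point (derived from the */
/* movlt setup above): */
/* r3  << pattern length in bytes (4, 8 or 16) */
/* r10 << bytes remaining in the current pass over the pattern */
/* r11 << read pointer into the pattern (reset when exhausted) */
/* r1  << pointer to the start of the pattern */
/* r2  << bytes remaining in the destination buffer */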
L_Bytewise:
	ldrb	r4, [r11], #1
	strb	r4, [r12], #1
	subs	r10, #1
	moveq	r10, r3
	moveq	r11, r1
	sub	r2, #1

L_Short:
	cmp	r2, #0			/* more bytes left? */
	bne	L_Bytewise
	ldm	sp!, {r8, r10-r11}	/* restore registers from stack */
	ldm	sp!, {r4-r7, pc}	/* restore & return from subroutine */

/* 'len' is long enough to justify aligning the destination pointer. */
/* */
/* By the time we reach here, data is stored in registers as follows: */
/* r1 << 'len' (length of destination buffer in bytes) */
/* r2-r5 << pattern; either 4x4-byte OR 2x8-byte OR 1x16-byte */
/* r12 << destination pointer copy (scratch register) */
/* r0 << destination pointer original */
/* */
/* Use r11 as a scratch register to hold the number of bytes needed to */
/* reach 16-byte alignment. */
/* */
/* Unaligned on a 32-byte boundary: store 1-15 bytes until 16-byte aligned. */
/* As we store these bytes, we rotate the pattern stored in r2-r5 to reflect */
/* the alignment. */
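/* Worked example (illustrative): if r12 ends in 0x3, then 13 bytes are */
/* needed to reach 16-byte alignment. 13 = 0b1101, so after the msr below */
/* N=1, Z=1, C=0, V=1: the vs, eq and mi stores fire, writing 1 + 4 + 8 = */
/* 13 bytes (the conditional stores below go 1, 2, 4, then 8 bytes). */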

L_Unaligned:
	mov	r11, r12, lsl #28
	rsb	r11, r11, #0
	msr	cpsr_f, r11		/* Bits[31:28] of cpsr now contain #bytes to align */

L_Store15BytesAndRotatePattern:
	strbvs	r2, [r12], #1		/* v is set, unaligned in the 1s column */
	andvs	r6, r2, #0xFF		/* Rotate pattern right in r2-r5 by 1 byte. */
	andvs	r8, r3, #0xFF		/* Treat r2-r5 as one contiguous */
	andvs	r10, r4, #0xFF		/* 16-byte register, with r2 holding the LSB */
	andvs	r11, r5, #0xFF		/* and r5 holding the MSB. */
	lsrvs	r2, r2, #8
	lsrvs	r3, r3, #8
	lsrvs	r4, r4, #8
	lsrvs	r5, r5, #8
	orrvs	r2, r2, r8, lsl #24
	orrvs	r3, r3, r10, lsl #24
	orrvs	r4, r4, r11, lsl #24
	orrvs	r5, r5, r6, lsl #24

	strhcs	r2, [r12], #2		/* c is set, unaligned in the 2s column */
	movcs	r6, r2, lsl #16		/* Rotate pattern right in r2-r5 by 2 bytes. */
	movcs	r8, r3, lsl #16
	movcs	r10, r4, lsl #16
	movcs	r11, r5, lsl #16
	lsrcs	r2, r2, #16
	lsrcs	r3, r3, #16
	lsrcs	r4, r4, #16
	lsrcs	r5, r5, #16
	orrcs	r2, r2, r8
	orrcs	r3, r3, r10
	orrcs	r4, r4, r11
	orrcs	r5, r5, r6

	streq	r2, [r12], #4		/* z is set, unaligned in the 4s column */
	moveq	r6, r2			/* Rotate pattern right in r2-r5 by 4 bytes. */
	moveq	r2, r3
	moveq	r3, r4
	moveq	r4, r5
	moveq	r5, r6

	stmmi	r12!, {r2-r3}		/* n is set, unaligned in the 8s column */
	movmi	r6, r2			/* Rotate pattern right in r2-r5 by 8 bytes. */
	movmi	r8, r3
	movmi	r2, r4
	movmi	r3, r5
	movmi	r4, r6
	movmi	r5, r8

	mrs	r11, cpsr		/* copy cpsr into r11 */
	subs	r1, r1, r11, lsr #28	/* subtract the #bytes just stored from 'len' */
	ldmeq	sp!, {r8, r10-r11}	/* restore registers from stack */
	ldmeq	sp!, {r4-r7, pc}	/* restore & return from subroutine */

/* By the time we reach here, we are 16-byte aligned and r2-r5 contain the */
/* rotated pattern. Now let's make sure we are 32-byte aligned. */
L_16ByteAligned:
	tst	r12, #(1 << 4)
	stmne	r12!, {r2-r5}
	subsne	r1, r1, #16

/* By the time we reach here, data is stored in registers as follows: */
/* r1 << 'len' (remaining length of destination buffer in bytes) */
/* r2-r5 << rotated pattern; either 4x4-byte OR 2x8-byte OR 1x16-byte */
/* r12 << aligned destination pointer copy (scratch register) */
L_Aligned:
	cmp	r1, #64
	blt	L_AlignedLessThan64

/* Copy the pattern into four more registers so that we can do 64-byte transfers. */
	mov	r6, r2
	mov	r8, r3
	mov	r10, r4
	mov	r11, r5

/* At this point, we are 32-byte aligned and 'len' is at least 64 bytes. */
/* Let's transfer 64 bytes at a time until fewer than 64 bytes remain. */
	sub	r1, r1, #64		/* pre-subtract to avoid an extra compare in the loop */
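/* Accounting example (illustrative): for r1 = 96, the pre-subtract leaves */
/* 32; one pass stores 64 bytes and the subs yields -32 (bge not taken), */
/* then the adds after the loop restores r1 = 32 for the tail code. */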
L_Loop64:
	stm	r12!, {r2-r6, r8, r10-r11}
	subs	r1, r1, #64
	stm	r12!, {r2-r6, r8, r10-r11}
	bge	L_Loop64

	/* return if 'len' is zero */
	adds	r1, r1, #64		/* readjust length; we previously subtracted an extra 64 */
	ldmeq	sp!, {r8, r10-r11}	/* restore registers from stack */
	ldmeq	sp!, {r4-r7, pc}	/* restore & return from subroutine */

L_AlignedLessThan64:
	/* do we have 16 or more bytes left? */
	cmp	r1, #16
	stmge	r12!, {r2-r5}
	subsge	r1, r1, #16
	bgt	L_AlignedLessThan64
	ldmeq	sp!, {r8, r10-r11}	/* restore registers from stack */
	ldmeq	sp!, {r4-r7, pc}	/* restore & return from subroutine */

L_AlignedLessThan16:
	/* Store the last up to 15 bytes. */
	/* Move the remaining len bits [3:0] to the flags area of cpsr. */
	mov	r1, r1, lsl #28
	msr	cpsr_f, r1
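	/* e.g. (illustrative) len = 13 = 0b1101 sets N, Z and V: the stores */
	/* below write 8 + 4 + 1 bytes, shifting the pattern down as they go. */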

	stmmi	r12!, {r2-r3}		/* n is set, store 8 bytes */
	movmi	r2, r4			/* shift vector down 8 bytes */
	movmi	r3, r5

	streq	r2, [r12], #4		/* z is set, store 4 bytes */
	moveq	r2, r3			/* shift vector down 4 bytes */

	strhcs	r2, [r12], #2		/* c is set, store 2 bytes */
	lsrcs	r2, #16			/* shift register right 2 bytes */

	strbvs	r2, [r12], #1		/* v is set, store 1 byte */
	ldm	sp!, {r8, r10-r11}	/* restore registers from stack */
	ldm	sp!, {r4-r7, pc}	/* restore & return from subroutine */

/*----------------------------------------------------------------------------*/
/* void memset_pattern8(void *ptr, const void *pattern8, size_t len); */
/* */
/* r0 << destination pointer */
/* r1 << pointer to 8-byte pattern */
/* r2 << 'len' (length of destination buffer in bytes) */
/*----------------------------------------------------------------------------*/
	.globl _memset_pattern8
_memset_pattern8:
	cmp	r2, #0			/* check if len is zero */
	bxeq	lr			/* return if length is zero */

	/* We need some registers, so save volatiles on the stack. */
	/* Avoid r7 (frame pointer) and r9 (thread register). */
	stmfd	sp!, {r4-r7, lr}
	add	r7, sp, #12		/* establish frame */
	stmfd	sp!, {r8, r10-r11}

	/* Copy the destination base pointer r0 to r12 and leave r0 alone */
	/* so that we return the original pointer back to the caller. */
	mov	r12, r0

	/* Check if 'len' is long enough to bother aligning the destination */
	/* pointer. */
	cmp	r2, #32			/* long enough to bother aligning? */
	movlt	r3, #8			/* move pattern length into r3 */
	movlt	r10, #8			/* pattern index */
	movlt	r11, r1			/* move pattern pointer into r11 */
	blt	L_Short			/* no */

	/* Move 'len' into r1, get the 8-byte pattern in r2-r3. */
	mov	r6, r2			/* temporarily move 'len' into r6 */
	ldr	r2, [r1], #4		/* load 8-byte pattern into r2-r3 */
	ldr	r3, [r1], #4
	mov	r1, r6			/* move 'len' from r6 to r1 */

	mov	r4, r2			/* copy 8-byte pattern into r4-r5 */
	mov	r5, r3
	b	L_NotShort		/* yes */


/*----------------------------------------------------------------------------*/
/* void memset_pattern16(void *ptr, const void *pattern16, size_t len); */
/* */
/* r0 << destination pointer */
/* r1 << pointer to 16-byte pattern */
/* r2 << 'len' (length of destination buffer in bytes) */
/*----------------------------------------------------------------------------*/
	.globl _memset_pattern16
_memset_pattern16:
	cmp	r2, #0			/* check if len is zero */
	bxeq	lr			/* return if length is zero */

	/* We need some registers, so save volatiles on the stack. */
	/* Avoid r7 (frame pointer) and r9 (thread register). */
	stmfd	sp!, {r4-r7, lr}
	add	r7, sp, #12		/* establish frame */
	stmfd	sp!, {r8, r10-r11}

	/* Copy the destination base pointer r0 to r12 and leave r0 alone */
	/* so that we return the original pointer back to the caller. */
	mov	r12, r0

	/* Check if 'len' is long enough to bother aligning the destination */
	/* pointer. */
	cmp	r2, #32			/* long enough to bother aligning? */
	movlt	r3, #16			/* move pattern length into r3 */
	movlt	r10, #16		/* pattern index */
	movlt	r11, r1			/* move pattern pointer into r11 */
	blt	L_Short			/* no */

	/* Move 'len' into r1, get the 16-byte pattern in r2-r5. */
	mov	r6, r2			/* temporarily move 'len' into r6 */
	ldr	r2, [r1], #4		/* load 16-byte pattern into r2-r5 */
	ldr	r3, [r1], #4
	ldr	r4, [r1], #4
	ldr	r5, [r1], #4
	mov	r1, r6			/* move 'len' from r6 to r1 */

	b	L_NotShort		/* yes */


#endif /* _ARM_ARCH_6 */