/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <arm/arch.h>
#include <mach/machine/asm.h>

/*
 * This file contains the following functions:
 *
 * void memset_pattern4(void *b, const void *c4, size_t len)
 * void memset_pattern8(void *b, const void *c8, size_t len)
 * void memset_pattern16(void *b, const void *c16, size_t len)
 *
 * memset() itself is implemented in bzero.s.
 *
 * This is a reasonably well optimized version of the memset_pattern* routines
 * implemented for ARM processors using the ARMv4 and later instruction sets.
 * These routines use only the ARM core registers.
 *
 * The algorithm is to align the destination pointer on a 16 byte boundary
 * and then blast data 64 bytes at a time, in two stores of 32 bytes per loop.
 *
 */
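
/*
 * Reference semantics in C (a sketch for orientation; needs
 * <stdint.h> and <string.h>; the helper name is illustrative).
 * The real code below instead aligns the destination and stores
 * 64 bytes per loop iteration from a rotated copy of the pattern
 * kept in registers r2-r5:
 *
 *	void memset_pattern4_ref(void *b, const void *c4, size_t len)
 *	{
 *		uint8_t *p = (uint8_t *)b;
 *		uint32_t pat;
 *		memcpy(&pat, c4, 4);          // pattern may be unaligned
 *		while (len >= 4) {
 *			memcpy(p, &pat, 4);
 *			p += 4;
 *			len -= 4;
 *		}
 *		memcpy(p, &pat, len);         // trailing 0-3 bytes
 *	}
 *
 * Typical call from C, e.g. filling a buffer with a 32-bit value:
 *	uint32_t color = 0xFF00FF00;
 *	memset_pattern4(buf, &color, size_in_bytes);
 */
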
	.text
	.align 2
	.syntax unified

/*----------------------------------------------------------------------------*/
/* void memset_pattern4(void *ptr, const void *pattern4, size_t len); */
/* */
/* r0 << destination pointer */
/* r1 << pointer to 4-byte pattern */
/* r2 << 'len' (length of destination buffer in bytes) */
/*----------------------------------------------------------------------------*/
	.globl _memset_pattern4
_memset_pattern4:
	cmp r2, #0 /* check if len is zero */
	bxeq lr /* return if length is zero */

	/* We need some registers, so save volatiles on stack */
	/* Avoid r7 (frame pointer) and r9 (thread register) */
	stmfd sp!, {r4-r7, lr}
	add r7, sp, #12 /* establish frame */
	stmfd sp!, {r8, r10-r11}

	/* copy destination base pointer r0 to r12 and leave r0 alone */
	/* so that we return original pointer back to the caller */
	mov r12, r0

	/* Check if 'len' is long enough to bother aligning the destination */
	/* pointer */
	cmp r2, #32 /* long enough to bother aligning? */
	movlt r3, #4 /* move pattern length into r3 */
	movlt r10, #4 /* pattern index */
	movlt r11, r1 /* move pattern pointer into r11 */
	blt L_Short /* no */

	/* move 'len' into r1, get 4-byte pattern in r2 */
	mov r6, r2 /* temporarily move 'len' into r6 */
	bl L_GetPatternWord /* get unaligned pattern word in r5 */
	mov r2, r5 /* move pattern word into r2 */
	mov r0, r12 /* r0 was clobbered - restore it */
	mov r1, r6 /* move 'len' from r6 to r1 */

	mov r3, r2 /* copy 4-byte pattern into r3, r4 and r5 registers */
	mov r4, r2
	mov r5, r2

L_NotShort:

	/* Check for 16 or 32 byte aligned destination pointer */
	tst r12, #0x1F /* check for 32 byte aligned */
	beq L_Aligned
	tst r12, #0xF /* check for 16 byte aligned */
	beq L_16ByteAligned
	b L_Unaligned /* yes */

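/*
 * The short-buffer path below, as a C sketch (variable names are
 * illustrative): store byte-by-byte, resetting the pattern cursor
 * every patlen bytes. Register use: r12 = dst, r11 = cursor,
 * r1 = pattern base, r10 = countdown, r3 = patlen, r2 = len.
 *
 *	while (len--) {
 *		*dst++ = *cursor++;
 *		if (--countdown == 0) {
 *			countdown = patlen;
 *			cursor = pattern;
 *		}
 *	}
 */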
L_Bytewise:
	ldrb r4, [r11], #1
	strb r4, [r12], #1
	subs r10, #1
	moveq r10, r3
	moveq r11, r1
	sub r2, #1

L_Short:
	cmp r2, #0 /* more bytes left? */
	bne L_Bytewise
	ldm sp!, {r8, r10-r11} /* restore registers from stack */
	ldm sp!, {r4-r7, lr} /* restore & return from subroutine */
	bx lr

/* 'len' is long enough to justify aligning the destination pointer */
/* */
/* By the time we reach here, data is stored in registers as follows: */
/* r1 << 'len' (length of destination buffer in bytes) */
/* r2-r5 << pattern; either 4x4-byte OR 2x8-byte OR 1x16-byte */
/* r12 << destination pointer copy (scratch register) */
/* r0 << destination pointer original */
/* */
/* Use r11 as a scratch register to store the #bytes offset to 16-byte align */
/* */
/* Unaligned on 32-byte boundary, store 1-15 bytes until 16-byte aligned */
/* As we store these bytes, we rotate the pattern stored in r2-r5 to reflect */
/* the alignment. */

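/* For example: memset_pattern4(p, "ABCD", len), with p needing one */
/* byte to reach alignment, stores 'A' first; the rotation below then */
/* turns the register pattern into "BCDA", so the subsequent aligned */
/* stores continue the byte stream seamlessly. */
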
L_Unaligned:
	mov r11, r12, lsl #28
	rsb r11, r11, #0
	msr cpsr_f, r11 /* Bits[31:28] of cpsr now contain #bytes to align */
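	/*
	 * For example: if r12 = 0x1003, bytes-to-align = (-0x1003) mod 16
	 * = 13 = 0b1101, so N=1 (8s), Z=1 (4s), C=0 (2s), V=1 (1s). The
	 * conditional stores below then move 8 + 4 + 1 = 13 bytes and
	 * leave r12 16-byte aligned.
	 */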

L_Store15BytesAndRotatePattern:
	strbvs r2, [r12], #1 /* v is set, unaligned in the 1s column */
	andvs r6, r2, #0xFF /* Rotate pattern right in r2-r5 by 1-byte */
	andvs r8, r3, #0xFF /* Consider registers r2-r5 as a contiguous */
	andvs r10, r4, #0xFF /* 16-byte register with r2 containing LSB */
	andvs r11, r5, #0xFF /* and r5 containing MSB */
	lsrvs r2, r2, #8
	lsrvs r3, r3, #8
	lsrvs r4, r4, #8
	lsrvs r5, r5, #8
	orrvs r2, r2, r8, lsl #24
	orrvs r3, r3, r10, lsl #24
	orrvs r4, r4, r11, lsl #24
	orrvs r5, r5, r6, lsl #24

	strhcs r2, [r12], #2 /* c is set, unaligned in the 2s column */
	movcs r6, r2, lsl #16 /* Rotate pattern right in r2-r5 by 2-bytes */
	movcs r8, r3, lsl #16
	movcs r10, r4, lsl #16
	movcs r11, r5, lsl #16
	lsrcs r2, r2, #16
	lsrcs r3, r3, #16
	lsrcs r4, r4, #16
	lsrcs r5, r5, #16
	orrcs r2, r2, r8
	orrcs r3, r3, r10
	orrcs r4, r4, r11
	orrcs r5, r5, r6

	streq r2, [r12], #4 /* z is set, unaligned in the 4s column */
	moveq r6, r2 /* Rotate pattern right in r2-r5 by 4-bytes */
	moveq r2, r3
	moveq r3, r4
	moveq r4, r5
	moveq r5, r6

	stmmi r12!, {r2-r3} /* n is set, unaligned in the 8s column */
	movmi r6, r2 /* Rotate pattern right in r2-r5 by 8-bytes */
	movmi r8, r3
	movmi r2, r4
	movmi r3, r5
	movmi r4, r6
	movmi r5, r8

	mrs r11, cpsr /* copy cpsr into r11 */
	subs r1, r1, r11, lsr #28
	ldmeq sp!, {r8, r10-r11} /* restore registers from stack */
	ldmeq sp!, {r4-r7, lr} /* restore & return from subroutine */
	bxeq lr

/* By the time we reach here, we are 16-byte aligned and r2-r5 contain the */
/* rotated pattern. Now let's make sure we are 32-byte aligned. */
L_16ByteAligned:
	tst r12, #(1 << 4)
	stmne r12!, {r2-r5}
	subsne r1, r1, #16

/* By the time we reach here, data is stored in registers as follows: */
/* r1 << 'len' (remaining length of destination buffer in bytes) */
/* r2-r5 << rotated pattern; either 4x4-byte OR 2x8-byte OR 1x16-byte */
/* r12 << aligned destination pointer copy (scratch register) */
L_Aligned:
	cmp r1, #64
	blt L_AlignedLessThan64

/* Copy pattern into four more registers so that we can do 64 byte transfers */
	mov r6, r2
	mov r8, r3
	mov r10, r4
	mov r11, r5

/* At this point, we are 32-byte aligned and 'len' is at least 64 bytes */
/* Let's transfer 64 bytes at a time until len becomes less than 64 bytes */
	sub r1, r1, #64 /* pre-subtract to avoid extra compare in loop */
L_Loop64:
	stm r12!, {r2-r6, r8, r10-r11}
	subs r1, r1, #64
	stm r12!, {r2-r6, r8, r10-r11}
	bge L_Loop64
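	/*
	 * The loop above, in C (a sketch; store_32_bytes is illustrative
	 * and len is treated as signed). The pre-subtract lets "subs" act
	 * as both decrement and loop test, so there is no separate compare:
	 *
	 *	len -= 64;
	 *	do {
	 *		store_32_bytes(p); p += 32;   // first stm
	 *		len -= 64;                    // subs sets the flags
	 *		store_32_bytes(p); p += 32;   // second stm
	 *	} while (len >= 0);                   // bge
	 *	len += 64;                            // readjusted below
	 */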

	/* return if 'len' is zero */
	adds r1, r1, #64 /* readjust length; previously subtracted extra 64 */
	ldmeq sp!, {r8, r10-r11} /* restore registers from stack */
	ldmeq sp!, {r4-r7, lr} /* restore & return from subroutine */
	bxeq lr

L_AlignedLessThan64:
	/* do we have 16 or more bytes left? */
	cmp r1, #16
	stmge r12!, {r2-r5}
	subsge r1, r1, #16
	bgt L_AlignedLessThan64
	ldmeq sp!, {r8, r10-r11} /* restore registers from stack */
	ldmeq sp!, {r4-r7, lr} /* restore & return from subroutine */
	bxeq lr

L_AlignedLessThan16:
	/* store the last up to 15 bytes */
	/* move the remaining len bits [3:0] to the flags area of cpsr */
	mov r1, r1, lsl #28
	msr cpsr_f, r1
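	/*
	 * Same flag trick as in the alignment fix-up above, but now the
	 * low four bits of the remaining length select the stores. For
	 * example, 13 bytes left = 0b1101 sets N, Z and V: store 8, then
	 * 4, then 1 byte(s), shifting the pattern vector down each time.
	 */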

	stmmi r12!, {r2-r3} /* n is set, store 8 bytes */
	movmi r2, r4 /* shift vector down 8 bytes */
	movmi r3, r5

	streq r2, [r12], #4 /* z is set, store 4 bytes */
	moveq r2, r3 /* shift vector down 4 bytes */

	strhcs r2, [r12], #2 /* c is set, store 2 bytes */
	lsrcs r2, #16 /* shift register right 2 bytes */

	strbvs r2, [r12], #1 /* v is set, store 1 byte */
	ldm sp!, {r8, r10-r11} /* restore registers from stack */
	ldm sp!, {r4-r7, lr} /* restore & return from subroutine */
	bx lr

/*----------------------------------------------------------------------------*/
/* void memset_pattern8(void *ptr, const void *pattern8, size_t len); */
/* */
/* r0 << destination pointer */
/* r1 << pointer to 8-byte pattern */
/* r2 << 'len' (length of destination buffer in bytes) */
/*----------------------------------------------------------------------------*/
	.globl _memset_pattern8
_memset_pattern8:
	cmp r2, #0 /* check if len is zero */
	bxeq lr /* return if length is zero */

	/* We need some registers, so save volatiles on stack */
	/* Avoid r7 (frame pointer) and r9 (thread register) */
	stmfd sp!, {r4-r7, lr}
	add r7, sp, #12 /* establish frame */
	stmfd sp!, {r8, r10-r11}

	/* copy destination base pointer r0 to r12 and leave r0 alone */
	/* so that we return original pointer back to the caller */
	mov r12, r0

	/* Check if 'len' is long enough to bother aligning the destination */
	/* pointer */
	cmp r2, #32 /* long enough to bother aligning? */
	movlt r3, #8 /* move pattern length into r3 */
	movlt r10, #8 /* pattern index */
	movlt r11, r1 /* move pattern pointer into r11 */
	blt L_Short /* no */

	/* move 'len' into r1, get 8-byte pattern in r2-r3 */
	mov r6, r2 /* temporarily move 'len' into r6 */
	bl L_GetPatternWord /* get unaligned pattern word in r5 */
	mov r2, r5 /* move pattern word into r2 */
	bl L_GetPatternWord
	mov r3, r5
	mov r0, r12 /* r0 was clobbered - restore it */
	mov r1, r6 /* move 'len' from r6 to r1 */

	mov r4, r2 /* copy 8-byte pattern into r4-r5 registers */
	mov r5, r3
	b L_NotShort /* yes */


/*----------------------------------------------------------------------------*/
/* void memset_pattern16(void *ptr, const void *pattern16, size_t len); */
/* */
/* r0 << destination pointer */
/* r1 << pointer to 16-byte pattern */
/* r2 << 'len' (length of destination buffer in bytes) */
/*----------------------------------------------------------------------------*/
	.globl _memset_pattern16
_memset_pattern16:
	cmp r2, #0 /* check if len is zero */
	bxeq lr /* return if length is zero */

	/* We need some registers, so save volatiles on stack */
	/* Avoid r7 (frame pointer) and r9 (thread register) */
	stmfd sp!, {r4-r7, lr}
	add r7, sp, #12 /* establish frame */
	stmfd sp!, {r8, r10-r11}

	/* copy destination base pointer r0 to r12 and leave r0 alone */
	/* so that we return original pointer back to the caller */
	mov r12, r0

	/* Check if 'len' is long enough to bother aligning the destination */
	/* pointer */
	cmp r2, #32 /* long enough to bother aligning? */
	movlt r3, #16 /* move pattern length into r3 */
	movlt r10, #16 /* pattern index */
	movlt r11, r1 /* move pattern pointer into r11 */
	blt L_Short /* no */

	/* move 'len' into r1, get 16-byte pattern in r2-r5 */
	mov r6, r2 /* temporarily move 'len' into r6 */
	bl L_GetPatternWord /* get unaligned pattern word in r5 */
	mov r2, r5 /* move pattern word into r2 */
	bl L_GetPatternWord
	mov r3, r5
	bl L_GetPatternWord
	mov r4, r5
	bl L_GetPatternWord
	mov r0, r12 /* r0 was clobbered - restore it */
	mov r1, r6 /* move 'len' from r6 to r1 */

	b L_NotShort /* yes */


/*----------------------------------------------------------------------------*/
/* Get an unaligned word at r1, returning it in r5. */
/* Increments r1 by 4, clobbers r0. */
/* This is tailored to fit the register usage by the call sites. */
/*----------------------------------------------------------------------------*/
L_GetPatternWord:
	ldrb r5, [r1], #1 /* get the 1st byte at r1 */
	ldrb r0, [r1], #1 /* get the 2nd byte at r1 */
	orr r5, r5, r0, lsl #8 /* move into bits 15:8 */
	ldrb r0, [r1], #1 /* get the 3rd byte */
	orr r5, r5, r0, lsl #16 /* bits 23:16 */
	ldrb r0, [r1], #1 /* get the 4th byte */
	orr r5, r5, r0, lsl #24 /* bits 31:24 */
	bx lr
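
/*
 * C equivalent of L_GetPatternWord (a sketch, assuming the
 * little-endian byte order these routines rely on; the helper
 * name is illustrative):
 *
 *	uint32_t get_pattern_word(const uint8_t **pp)
 *	{
 *		const uint8_t *p = *pp;
 *		uint32_t w = (uint32_t)p[0]
 *		           | ((uint32_t)p[1] << 8)
 *		           | ((uint32_t)p[2] << 16)
 *		           | ((uint32_t)p[3] << 24);
 *		*pp = p + 4;
 *		return w;
 *	}
 */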