* @APPLE_LICENSE_HEADER_END@
*/
+#include <arm/arch.h>
+
+// Only built for armv6 and higher.
+#if defined _ARM_ARCH_6
+
+// If we're building for armv7, and not for DYLD, then we have a symbol
+// resolver, so we need to rename these implementations.
+#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD
+#define _memset_pattern4 _memset_pattern4$VARIANT$Generic
+#define _memset_pattern8 _memset_pattern8$VARIANT$Generic
+#define _memset_pattern16 _memset_pattern16$VARIANT$Generic
+#endif
+
+#include <mach/machine/asm.h>
+
+/*
+ * This file contains the following functions:
+ *
+ * void memset_pattern4(void *b, const void *c4, size_t len)
+ * void memset_pattern8(void *b, const void *c8, size_t len)
+ * void memset_pattern16(void *b, const void *c16, size_t len)
+ *
+ * memset() itself is implemented in the bzero.s file.
+ *
+ * This is a reasonably well-optimized version of the memset_pattern*
+ * routines, implemented for the ARM9 and ARM11 processors using the ARMv6
+ * instruction set. These routines use only the ARM core (integer) registers.
+ *
+ * The algorithm is to align the destination pointer on a 16-byte boundary
+ * and then blast data 64 bytes at a time, in two stores of 32 bytes per loop.
+ *
+ */
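+
+/*
+ * For reference only, a minimal C sketch of the semantics these routines
+ * implement (the name memset_pattern_ref is ours, purely illustrative):
+ *
+ *	#include <string.h>
+ *	#include <stddef.h>
+ *
+ *	static void
+ *	memset_pattern_ref(void *b, const void *pat, size_t patlen, size_t len)
+ *	{
+ *		char *dst = (char *)b;
+ *		while (len >= patlen) {		// whole-pattern stores
+ *			memcpy(dst, pat, patlen);
+ *			dst += patlen;
+ *			len -= patlen;
+ *		}
+ *		memcpy(dst, pat, len);		// truncated trailing pattern
+ *	}
+ */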
+ .text
+ .align 2
+ .syntax unified
+
+/*----------------------------------------------------------------------------*/
+/* void memset_pattern4(void *ptr, const void *pattern4, size_t len); */
+/* */
+/* r0 << destination pointer */
+/* r1 << pointer to 4-byte pattern */
+/* r2 << 'len' (length of destination buffer in bytes) */
+/*----------------------------------------------------------------------------*/
+ .globl _memset_pattern4
+_memset_pattern4:
+ cmp r2, #0 /* check if len is zero */
+ bxeq lr /* return if length is zero */
+
+ /* We need some registers, so save volatiles on stack */
+ /* Avoid r7 (frame pointer) and r9 (thread register) */
+ stmfd sp!, {r4-r7, lr}
+ add r7, sp, #12 /* establish frame */
+ stmfd sp!, {r8, r10-r11}
+
+ /* copy destination base pointer r0 to r12 and leave r0 alone */
+ /* so that we return original pointer back to the caller */
+ mov r12, r0
+
+ /* Check if 'len' is long enough to bother aligning the destination */
+ /* pointer */
+ cmp r2, #32 /* long enough to bother aligning? */
+ movlt r3, #4 /* move pattern length into r3 */
+ movlt r10, #4 /* pattern index */
+ movlt r11, r1 /* move pattern pointer into r11 */
+ blt L_Short /* no */
+
+ /* move 'len' into r1, get 4-byte pattern in r2 */
+ mov r6, r2 /* temporarily move 'len' into r6 */
+ ldr r2, [r1] /* load 4-byte pattern into r2 */
+ mov r1, r6 /* move 'len' from r6 to r1 */
+
+ mov r3, r2 /* copy 4-byte pattern into r3, r4 and r5 registers */
+ mov r4, r2
+ mov r5, r2
+
+L_NotShort:
+
+ /* Check whether the destination pointer is 16- or 32-byte aligned */
+ tst r12, #0x1F /* check for 32 byte aligned */
+ beq L_Aligned
+ tst r12, #0xF /* check for 16 byte aligned */
+ beq L_16ByteAligned
+ b L_Unaligned /* not even 16-byte aligned */
+
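+/* Bytewise fallback for short buffers ('len' < 32). Live registers here: */
+/* r11 = pattern cursor, r10 = bytes left in the current pattern pass, */
+/* r3 = pattern length, r1 = pattern base (for rewinding), r2 = remaining */
+/* destination bytes, r12 = destination cursor. */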
+L_Bytewise:
+ ldrb r4, [r11], #1 /* load next pattern byte */
+ strb r4, [r12], #1 /* store it to the destination */
+ subs r10, #1 /* bytes left in this pattern pass */
+ moveq r10, r3 /* pattern exhausted: reset the count */
+ moveq r11, r1 /* and rewind the pattern cursor */
+ sub r2, #1 /* one destination byte fewer */
+
+L_Short:
+ cmp r2, #0 /* more bytes left? */
+ bne L_Bytewise
+ ldm sp!, {r8, r10-r11} /* restore registers from stack */
+ ldm sp!, {r4-r7, pc} /* restore & return from subroutine */
+
+/* 'len' is long enough to justify aligning the destination pointer */
+/* */
+/* By the time we reach here, data is stored in registers as follows: */
+/* r1 << 'len' (length of destination buffer in bytes) */
+/* r2-r5 << pattern; either 4x 4-byte OR 2x 8-byte OR 1x 16-byte */
+/* r12 << destination pointer copy (scratch register) */
+/* r0 << destination pointer original */
+/* */
+/* Use r11 as a scratch register to hold the #bytes needed to 16-byte align */
+/* */
+/* Unaligned on 32-byte boundary, store 1-15 bytes until 16-byte aligned */
+/* As we store these bytes, we rotate the pattern stored in r2-r5 to reflect */
+/* the alignment. */
+
+L_Unaligned:
+ mov r11, r12, lsl #28 /* low 4 bits of dest now in bits [31:28] */
+ rsb r11, r11, #0 /* negate: bits [31:28] = #bytes to 16-byte align */
+ msr cpsr_f, r11 /* Bits[31:28] of cpsr now contain #bytes to align */
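+
+ /* Each CPSR flag now answers one alignment question: V (bit 28) means */
+ /* store 1 byte, C (bit 29) 2 bytes, Z (bit 30) 4 bytes, N (bit 31) */
+ /* 8 bytes, so the code below is fully predicated with no branches. */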
+
+L_Store15BytesAndRotatePattern:
+ strbvs r2, [r12], #1 /* v is set, unaligned in the 1s column */
+ andvs r6, r2, #0xFF /* Rotate pattern right in r2-r5 by 1-byte */
+ andvs r8, r3, #0xFF /* Consider registers r2-r5 as one contiguous */
+ andvs r10, r4, #0xFF /* 16-byte register with r2 containing the LSB */
+ andvs r11, r5, #0xFF /* and r5 containing the MSB */
+ lsrvs r2, r2, #8
+ lsrvs r3, r3, #8
+ lsrvs r4, r4, #8
+ lsrvs r5, r5, #8
+ orrvs r2, r2, r8, lsl #24
+ orrvs r3, r3, r10, lsl #24
+ orrvs r4, r4, r11, lsl #24
+ orrvs r5, r5, r6, lsl #24
+
+ strhcs r2, [r12], #2 /* c is set, unaligned in the 2s column */
+ movcs r6, r2, lsl #16 /* Rotate pattern right in r2-r5 by 2-bytes */
+ movcs r8, r3, lsl #16
+ movcs r10, r4, lsl #16
+ movcs r11, r5, lsl #16
+ lsrcs r2, r2, #16
+ lsrcs r3, r3, #16
+ lsrcs r4, r4, #16
+ lsrcs r5, r5, #16
+ orrcs r2, r2, r8
+ orrcs r3, r3, r10
+ orrcs r4, r4, r11
+ orrcs r5, r5, r6
+
+ streq r2, [r12], #4 /* z is set, unaligned in the 4s column */
+ moveq r6, r2 /* Rotate pattern right in r2-r5 by 4-bytes */
+ moveq r2, r3
+ moveq r3, r4
+ moveq r4, r5
+ moveq r5, r6
+
+ stmmi r12!, {r2-r3} /* n is set, unaligned in the 8s column */
+ movmi r6, r2 /* Rotate pattern right in r2-r5 by 8-bytes */
+ movmi r8, r3
+ movmi r2, r4
+ movmi r3, r5
+ movmi r4, r6
+ movmi r5, r8
+
+ mrs r11, cpsr /* copy cpsr into r11 */
+ subs r1, r1, r11, lsr #28 /* subtract #alignment bytes just stored from 'len' */
+ ldmeq sp!, {r8, r10-r11} /* restore registers from stack */
+ ldmeq sp!, {r4-r7, pc} /* restore & return from subroutine */
+
+/* By the time we reach here, we are 16-byte aligned and r2-r5 contains */
+/* rotated pattern. Now let's make sure we are 32-byte aligned. */
+L_16ByteAligned:
+ tst r12, #(1 << 4) /* 16-byte but not 32-byte aligned? */
+ stmne r12!, {r2-r5} /* store 16 bytes to reach 32-byte alignment */
+ subsne r1, r1, #16
+
+/* By the time we reach here, data is stored in registers as follows: */
+/* r1 << 'len' (remaining length of destination buffer in bytes) */
+/* r2-r5 << rotated pattern; either 4x 4-byte OR 2x 8-byte OR 1x 16-byte */
+/* r12 << aligned destination pointer copy (scratch register) */
+L_Aligned:
+ cmp r1, #64
+ blt L_AlignedLessThan64
+
+/* Copy the pattern into four more registers so that we can do 64-byte transfers */
+ mov r6, r2
+ mov r8, r3
+ mov r10, r4
+ mov r11, r5
+
+/* At this point, we are 32-byte aligned and 'len' is at least 64 bytes. */
+/* Let's transfer 64 bytes at a time until fewer than 64 bytes remain. */
+ sub r1, r1, #64 /* pre-subtract to avoid extra compare in loop */
+L_Loop64:
+ stm r12!, {r2-r6, r8, r10-r11} /* store first 32 bytes */
+ subs r1, r1, #64
+ stm r12!, {r2-r6, r8, r10-r11} /* store second 32 bytes */
+ bge L_Loop64
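+ /* The subs is scheduled between the two stm's so the flags are ready */
+ /* well before the bge; a common store/subtract overlap, though the */
+ /* exact benefit depends on the core. */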
+
+ /* return if 'len' is zero */
+ adds r1, r1, #64 /* readjust length; previously subtracted extra 64*/
+ ldmeq sp!, {r8, r10-r11} /* restore registers from stack */
+ ldmeq sp!, {r4-r7, pc} /* restore & return from subroutine */
+
+L_AlignedLessThan64:
+ /* do we have 16 or more bytes left? */
+ cmp r1, #16
+ stmge r12!, {r2-r5}
+ subsge r1, r1, #16
+ bgt L_AlignedLessThan64
+ ldmeq sp!, {r8, r10-r11} /* restore registers from stack */
+ ldmeq sp!, {r4-r7, pc} /* restore & return from subroutine */
+
+L_AlignedLessThan16:
+ /* store the last (up to 15) bytes */
+ /* move the remaining len bits [3:0] to the flags area of cpsr */
+ mov r1, r1, lsl #28
+ msr cpsr_f, r1
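+
+ /* Same flag trick as the alignment prologue, now for the tail: */
+ /* N = 8 bytes remain, Z = 4, C = 2, V = 1; each predicated store */
+ /* also shifts the pattern down for the next, smaller store. */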
+
+ stmmi r12!, {r2-r3} /* n is set, store 8 bytes */
+ movmi r2, r4 /* shift vector down 8 bytes */
+ movmi r3, r5
+
+ streq r2, [r12], #4 /* z is set, store 4 bytes */
+ moveq r2, r3 /* shift vector down 4 bytes */
+
+ strhcs r2, [r12], #2 /* c is set, store 2 bytes */
+ lsrcs r2, #16 /* shift register right 2 bytes */
+
+ strbvs r2, [r12], #1 /* v is set, store 1 byte */
+ ldm sp!, {r8, r10-r11} /* restore registers from stack */
+ ldm sp!, {r4-r7, pc} /* restore & return from subroutine */
+
+/*----------------------------------------------------------------------------*/
+/* void memset_pattern8(void *ptr, const void *pattern8, size_t len); */
+/* */
+/* r0 << destination pointer */
+/* r1 << pointer to 8-byte pattern */
+/* r2 << 'len' (length of destination buffer in bytes) */
+/*----------------------------------------------------------------------------*/
+ .globl _memset_pattern8
+_memset_pattern8:
+ cmp r2, #0 /* check if len is zero */
+ bxeq lr /* return if length is zero */
+
+ /* We need some registers, so save volatiles on stack */
+ /* Avoid r7 (frame pointer) and r9 (thread register) */
+ stmfd sp!, {r4-r7, lr}
+ add r7, sp, #12 /* establish frame */
+ stmfd sp!, {r8, r10-r11}
+
+ /* copy destination base pointer r0 to r12 and leave r0 alone */
+ /* so that we return original pointer back to the caller */
+ mov r12, r0
+
+ /* Check if 'len' is long enough to bother aligning the destination */
+ /* pointer */
+ cmp r2, #32 /* long enough to bother aligning? */
+ movlt r3, #8 /* move pattern length into r3 */
+ movlt r10, #8 /* pattern index */
+ movlt r11, r1 /* move pattern pointer into r11 */
+ blt L_Short /* no */
+
+ /* move 'len' into r1, get 8-byte pattern in r2-r3 */
+ mov r6, r2 /* temporarily move 'len' into r6 */
+ ldr r2, [r1], #4 /* load 8-byte pattern into r2-r3 */
+ ldr r3, [r1], #4
+ mov r1, r6 /* move 'len' from r6 to r1 */
+
+ mov r4, r2 /* copy 8-byte pattern into r4-r5 registers */
+ mov r5, r3
+ b L_NotShort /* join the common alignment path */
+
+
+/*----------------------------------------------------------------------------*/
+/* void memset_pattern16(void *ptr, const void *pattern16, size_t len); */
+/* */
+/* r0 << destination pointer */
+/* r1 << pointer to 16-byte pattern */
+/* r2 << 'len' (length of destination buffer in bytes) */
+/*----------------------------------------------------------------------------*/
+ .globl _memset_pattern16
+_memset_pattern16:
+ cmp r2, #0 /* check if len is zero */
+ bxeq lr /* return if length is zero */
+
+ /* We need some registers, so save volatiles on stack */
+ /* Avoid r7 (frame pointer) and r9 (thread register) */
+ stmfd sp!, {r4-r7, lr}
+ add r7, sp, #12 /* establish frame */
+ stmfd sp!, {r8, r10-r11}
+
+ /* copy destination base pointer r0 to r12 and leave r0 alone */
+ /* so that we return original pointer back to the caller */
+ mov r12, r0
+
+ /* Check if 'len' is long enough to bother aligning the destination */
+ /* pointer */
+ cmp r2, #32 /* long enough to bother aligning? */
+ movlt r3, #16 /* move pattern length into r3 */
+ movlt r10, #16 /* pattern index */
+ movlt r11, r1 /* move pattern pointer into r11 */
+ blt L_Short /* no */
+
+ /* move 'len' into r1, get 16-byte pattern in r2-r5 */
+ mov r6, r2 /* temporarily move 'len' into r6 */
+ ldr r2, [r1], #4 /* load 16-byte pattern into r2-r5 */
+ ldr r3, [r1], #4
+ ldr r4, [r1], #4
+ ldr r5, [r1], #4
+ mov r1, r6 /* move 'len' from r6 to r1 */
+
+ b L_NotShort /* join the common alignment path */
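+
+/* Note: all three entry points share L_Short and L_NotShort; only the */
+/* pattern load differs, since a 4- or 8-byte pattern replicated through */
+/* r2-r5 looks the same as a 16-byte pattern from that point on. */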
+
+
+#endif /* _ARM_ARCH_6 */