* @APPLE_LICENSE_HEADER_END@
*/
#include <arm/arch.h>
#include <mach/machine/asm.h>

/*
 * This file contains the following functions:
 *
 * void memset_pattern4(void *b, const void *c4, size_t len)
 * void memset_pattern8(void *b, const void *c8, size_t len)
 * void memset_pattern16(void *b, const void *c16, size_t len)
 *
 * memset() itself is implemented in bzero.s.
 *
 * This is a reasonably well-optimized implementation of the memset_pattern*
 * routines for ARM processors, using the ARMv4 and later instruction sets.
 * These routines use only the ARM core registers.
 *
 * The algorithm is to align the destination pointer on a 16-byte boundary
 * and then blast data 64 bytes at a time, in two stores of 32 bytes per loop.
 *
 */
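
/* For reference, the net effect of memset_pattern4 matches this C sketch
 * (names are illustrative; memset_pattern8/16 behave the same way with
 * 8- and 16-byte patterns):
 *
 *	void memset_pattern4_ref(void *b, const void *c4, size_t len) {
 *	    unsigned char *dst = b;
 *	    const unsigned char *pat = c4;
 *	    for (size_t i = 0; i < len; i++)
 *	        dst[i] = pat[i % 4];
 *	}
 *
 * i.e. the pattern is tiled from the start of the buffer, and the final
 * copy is truncated when 'len' is not a multiple of the pattern size.
 */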
	.text
	.align 2
	.syntax unified

/*----------------------------------------------------------------------------*/
/* void memset_pattern4(void *ptr, const void *pattern4, size_t len);        */
/*                                                                            */
/* r0 << destination pointer                                                  */
/* r1 << pointer to 4-byte pattern                                            */
/* r2 << 'len' (length of destination buffer in bytes)                        */
/*----------------------------------------------------------------------------*/
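
/* Typical call, as a sketch (the buffer and pattern names are illustrative):
 *
 *	uint32_t pat = 0xDEADBEEF;
 *	uint8_t buf[256];
 *	memset_pattern4(buf, &pat, sizeof(buf));
 *
 * The pattern pointer need not be word aligned; the pattern is fetched
 * bytewise (see L_GetPatternWord below).
 */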
	.globl _memset_pattern4
_memset_pattern4:
	cmp r2, #0		/* check if len is zero */
	bxeq lr			/* return if length is zero */

	/* We need some registers, so save volatiles on stack */
	/* Avoid r7 (frame pointer) and r9 (thread register) */
	stmfd sp!, {r4-r7, lr}
	add r7, sp, #12		/* establish frame */
	stmfd sp!, {r8, r10-r11}

	/* copy destination base pointer r0 to r12 and leave r0 alone */
	/* so that we return the original pointer to the caller */
	mov r12, r0

	/* Check if 'len' is long enough to justify aligning the */
	/* destination pointer */
	cmp r2, #32		/* long enough to bother aligning? */
	movlt r3, #4		/* move pattern length into r3 */
	movlt r10, #4		/* pattern index */
	movlt r11, r1		/* move pattern pointer into r11 */
	blt L_Short		/* no: fill bytewise */

	/* move 'len' into r1, get 4-byte pattern in r2 */
	mov r6, r2		/* temporarily move 'len' into r6 */
	bl L_GetPatternWord	/* get unaligned pattern word in r5 */
	mov r2, r5		/* move pattern word into r2 */
	mov r0, r12		/* r0 was clobbered - restore it */
	mov r1, r6		/* move 'len' from r6 to r1 */

	mov r3, r2		/* copy 4-byte pattern into r3, r4 and r5 registers */
	mov r4, r2
	mov r5, r2

L_NotShort:

	/* Check for a 16- or 32-byte aligned destination pointer */
	tst r12, #0x1F		/* check for 32-byte aligned */
	beq L_Aligned
	tst r12, #0xF		/* check for 16-byte aligned */
	beq L_16ByteAligned
	b L_Unaligned		/* not even 16-byte aligned */

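/* Short-buffer path: copy the pattern one byte at a time, wrapping the
 * pattern index when it runs out. In effect (sketch):
 *
 *	while (len--) {			// r2
 *	    *dst++ = *src++;		// r12, r11
 *	    if (--idx == 0) {		// r10
 *	        idx = patlen;		// r3
 *	        src = pattern;		// r1
 *	    }
 *	}
 */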
L_Bytewise:
	ldrb r4, [r11], #1	/* load next pattern byte */
	strb r4, [r12], #1	/* store it to the destination */
	subs r10, #1		/* decrement pattern index */
	moveq r10, r3		/* wrapped: reset index to pattern length */
	moveq r11, r1		/* and pointer to start of pattern */
	sub r2, #1		/* one byte of 'len' consumed */

L_Short:
	cmp r2, #0		/* more bytes left? */
	bne L_Bytewise
	ldm sp!, {r8, r10-r11}	/* restores registers from stack */
	ldm sp!, {r4-r7, lr}	/* restore & return from subroutine */
	bx lr

/* 'len' is long enough to justify aligning the destination pointer	      */
/*									      */
/* By the time we reach here, data is stored in registers as follows:	      */
/* r1 << 'len' (length of destination buffer in bytes)			      */
/* r2-r5 << pattern; either 4 x 4-byte, 2 x 8-byte, or 1 x 16-byte	      */
/* r12 << destination pointer copy (scratch register)			      */
/* r0 << destination pointer original					      */
/*									      */
/* Use r11 as a scratch register to store the #bytes offset to 16-byte align */
/*									      */
/* Unaligned on a 32-byte boundary: store 1-15 bytes until 16-byte aligned.  */
/* As we store these bytes, we rotate the pattern stored in r2-r5 to reflect */
/* the alignment.							      */

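/* Treating r2:r3:r4:r5 as one contiguous 16-byte pattern (r2 holding the
 * lowest-addressed bytes), storing k leading bytes and rotating the pattern
 * by k bytes is equivalent to this sketch ('v' is illustrative):
 *
 *	unsigned char v[16];		// current pattern; v[0] is stored next
 *	for (unsigned i = 0; i < k; i++) {
 *	    *dst++ = v[0];
 *	    unsigned char t = v[0];	// rotate pattern by one byte
 *	    memmove(v, v + 1, 15);
 *	    v[15] = t;
 *	}
 *
 * The code below does the same in up to four constant-size steps (1, 2, 4
 * and 8 bytes), using the condition flags instead of a loop.
 */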
L_Unaligned:
	mov r11, r12, lsl #28	/* low 4 bits of dst into bits [31:28] */
	rsb r11, r11, #0	/* negate: #bytes needed to 16-byte align */
	msr cpsr_f, r11		/* Bits[31:28] of cpsr now contain #bytes to align */
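
/* N,Z,C,V live in cpsr bits 31:28, so the four bits of the alignment count
 * map directly onto the condition flags: bit 3 -> N (mi), bit 2 -> Z (eq),
 * bit 1 -> C (cs), bit 0 -> V (vs). The conditional stores below peel off
 * 1, 2, 4 and 8 bytes as needed; e.g. an alignment count of 11 (0b1011)
 * sets N, C and V, so the 1-, 2- and 8-byte stores all execute.
 */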

L_Store15BytesAndRotatePattern:
	strbvs r2, [r12], #1	/* v is set, unaligned in the 1s column */
	andvs r6, r2, #0xFF	/* Rotate pattern right in r2-r5 by 1 byte. */
	andvs r8, r3, #0xFF	/* Consider registers r2-r5 as a contiguous */
	andvs r10, r4, #0xFF	/* 16-byte register with r2 containing the LSB */
	andvs r11, r5, #0xFF	/* and r5 containing the MSB */
	lsrvs r2, r2, #8
	lsrvs r3, r3, #8
	lsrvs r4, r4, #8
	lsrvs r5, r5, #8
	orrvs r2, r2, r8, lsl #24
	orrvs r3, r3, r10, lsl #24
	orrvs r4, r4, r11, lsl #24
	orrvs r5, r5, r6, lsl #24

	strhcs r2, [r12], #2	/* c is set, unaligned in the 2s column */
	movcs r6, r2, lsl #16	/* Rotate pattern right in r2-r5 by 2 bytes */
	movcs r8, r3, lsl #16
	movcs r10, r4, lsl #16
	movcs r11, r5, lsl #16
	lsrcs r2, r2, #16
	lsrcs r3, r3, #16
	lsrcs r4, r4, #16
	lsrcs r5, r5, #16
	orrcs r2, r2, r8
	orrcs r3, r3, r10
	orrcs r4, r4, r11
	orrcs r5, r5, r6

	streq r2, [r12], #4	/* z is set, unaligned in the 4s column */
	moveq r6, r2		/* Rotate pattern right in r2-r5 by 4 bytes */
	moveq r2, r3
	moveq r3, r4
	moveq r4, r5
	moveq r5, r6

	stmmi r12!, {r2-r3}	/* n is set, unaligned in the 8s column */
	movmi r6, r2		/* Rotate pattern right in r2-r5 by 8 bytes */
	movmi r8, r3
	movmi r2, r4
	movmi r3, r5
	movmi r4, r6
	movmi r5, r8

	mrs r11, cpsr		/* copy cpsr into r11 */
	subs r1, r1, r11, lsr #28 /* subtract #alignment bytes from 'len' */
	ldmeq sp!, {r8, r10-r11} /* restores registers from stack */
	ldmeq sp!, {r4-r7, lr}	/* restore & return from subroutine */
	bxeq lr

/* By the time we reach here, we are 16-byte aligned and r2-r5 contain the */
/* rotated pattern. Now let's make sure we are 32-byte aligned. */
L_16ByteAligned:
	tst r12, #(1 << 4)	/* 16-byte but not 32-byte aligned? */
	stmne r12!, {r2-r5}	/* store 16 bytes to reach 32-byte alignment */
	subsne r1, r1, #16

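/* The 16-byte alignment step above is, in effect (sketch):
 *
 *	if ((uintptr_t)dst & 16) {	// 16-byte but not 32-byte aligned
 *	    store16(dst, pattern);	// stm r12!, {r2-r5}
 *	    dst += 16; len -= 16;
 *	}
 */
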
/* By the time we reach here, data is stored in registers as follows:	*/
/* r1 << 'len' (remaining length of destination buffer in bytes)	*/
/* r2-r5 << rotated pattern; either 4 x 4-byte, 2 x 8-byte, or 1 x 16-byte */
/* r12 << aligned destination pointer copy (scratch register)		*/
L_Aligned:
	cmp r1, #64
	blt L_AlignedLessThan64

/* Copy the pattern into four more registers so that we can do 64-byte transfers */
	mov r6, r2
	mov r8, r3
	mov r10, r4
	mov r11, r5

/* At this point, we are 32-byte aligned and 'len' is at least 64 bytes. */
/* Let's transfer 64 bytes at a time until fewer than 64 bytes remain.   */
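/* The pre-subtract below keeps the loop overhead to a single subs; in
 * effect (sketch):
 *
 *	len -= 64;			// borrow 64 up front
 *	do {
 *	    store64(dst); dst += 64;	// two 32-byte stm stores
 *	    len -= 64;			// subs sets the flags for bge
 *	} while (len >= 0);
 *	len += 64;			// repay the borrow; 0-63 bytes remain
 */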
	sub r1, r1, #64		/* pre-subtract to avoid extra compare in loop */
L_Loop64:
	stm r12!, {r2-r6, r8, r10-r11}
	subs r1, r1, #64
	stm r12!, {r2-r6, r8, r10-r11}
	bge L_Loop64

	/* return if 'len' is zero */
	adds r1, r1, #64	/* readjust length; previously subtracted extra 64 */
	ldmeq sp!, {r8, r10-r11} /* restores registers from stack */
	ldmeq sp!, {r4-r7, lr}	/* restore & return from subroutine */
	bxeq lr

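/* Drain the remaining 16-byte chunks; in effect (sketch):
 *
 *	while (len >= 16) {
 *	    store16(dst); dst += 16;	// stmge r12!, {r2-r5}
 *	    len -= 16;
 *	}
 *	// return if len reached exactly 0, else 1-15 bytes fall through
 */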
L_AlignedLessThan64:
	/* do we have 16 or more bytes left? */
	cmp r1, #16
	stmge r12!, {r2-r5}
	subsge r1, r1, #16
	bgt L_AlignedLessThan64
	ldmeq sp!, {r8, r10-r11} /* restores registers from stack */
	ldmeq sp!, {r4-r7, lr}	/* restore & return from subroutine */
	bxeq lr

L_AlignedLessThan16:
	/* store the last up-to-15 bytes */
	/* move the remaining 'len' bits [3:0] into the flags area of cpsr */
	mov r1, r1, lsl #28
	msr cpsr_f, r1
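
/* Same flag trick as in L_Unaligned: 'len' bits [3:0] land in N,Z,C,V,
 * and the conditional stores below write the final bytes; in effect:
 *
 *	if (len & 8) { store8(dst); dst += 8; }	// mi
 *	if (len & 4) { store4(dst); dst += 4; }	// eq
 *	if (len & 2) { store2(dst); dst += 2; }	// cs
 *	if (len & 1) { store1(dst); dst += 1; }	// vs
 */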

	stmmi r12!, {r2-r3}	/* n is set, store 8 bytes */
	movmi r2, r4		/* shift vector down 8 bytes */
	movmi r3, r5

	streq r2, [r12], #4	/* z is set, store 4 bytes */
	moveq r2, r3		/* shift vector down 4 bytes */

	strhcs r2, [r12], #2	/* c is set, store 2 bytes */
	lsrcs r2, #16		/* shift register right 2 bytes */

	strbvs r2, [r12], #1	/* v is set, store 1 byte */
	ldm sp!, {r8, r10-r11}	/* restores registers from stack */
	ldm sp!, {r4-r7, lr}	/* restore & return from subroutine */
	bx lr

/*----------------------------------------------------------------------------*/
/* void memset_pattern8(void *ptr, const void *pattern8, size_t len);        */
/*                                                                            */
/* r0 << destination pointer                                                  */
/* r1 << pointer to 8-byte pattern                                            */
/* r2 << 'len' (length of destination buffer in bytes)                        */
/*----------------------------------------------------------------------------*/
	.globl _memset_pattern8
_memset_pattern8:
	cmp r2, #0		/* check if len is zero */
	bxeq lr			/* return if length is zero */

	/* We need some registers, so save volatiles on stack */
	/* Avoid r7 (frame pointer) and r9 (thread register) */
	stmfd sp!, {r4-r7, lr}
	add r7, sp, #12		/* establish frame */
	stmfd sp!, {r8, r10-r11}

	/* copy destination base pointer r0 to r12 and leave r0 alone */
	/* so that we return the original pointer to the caller */
	mov r12, r0

	/* Check if 'len' is long enough to justify aligning the */
	/* destination pointer */
	cmp r2, #32		/* long enough to bother aligning? */
	movlt r3, #8		/* move pattern length into r3 */
	movlt r10, #8		/* pattern index */
	movlt r11, r1		/* move pattern pointer into r11 */
	blt L_Short		/* no: fill bytewise */

	/* move 'len' into r1, get 8-byte pattern in r2-r3 */
	mov r6, r2		/* temporarily move 'len' into r6 */
	bl L_GetPatternWord	/* get unaligned pattern word in r5 */
	mov r2, r5		/* move pattern word into r2 */
	bl L_GetPatternWord
	mov r3, r5
	mov r0, r12		/* r0 was clobbered - restore it */
	mov r1, r6		/* move 'len' from r6 to r1 */

	mov r4, r2		/* copy 8-byte pattern into r4-r5 registers */
	mov r5, r3
	b L_NotShort		/* long enough: join the common path */


/*----------------------------------------------------------------------------*/
/* void memset_pattern16(void *ptr, const void *pattern16, size_t len);      */
/*                                                                            */
/* r0 << destination pointer                                                  */
/* r1 << pointer to 16-byte pattern                                           */
/* r2 << 'len' (length of destination buffer in bytes)                        */
/*----------------------------------------------------------------------------*/
	.globl _memset_pattern16
_memset_pattern16:
	cmp r2, #0		/* check if len is zero */
	bxeq lr			/* return if length is zero */

	/* We need some registers, so save volatiles on stack */
	/* Avoid r7 (frame pointer) and r9 (thread register) */
	stmfd sp!, {r4-r7, lr}
	add r7, sp, #12		/* establish frame */
	stmfd sp!, {r8, r10-r11}

	/* copy destination base pointer r0 to r12 and leave r0 alone */
	/* so that we return the original pointer to the caller */
	mov r12, r0

	/* Check if 'len' is long enough to justify aligning the */
	/* destination pointer */
	cmp r2, #32		/* long enough to bother aligning? */
	movlt r3, #16		/* move pattern length into r3 */
	movlt r10, #16		/* pattern index */
	movlt r11, r1		/* move pattern pointer into r11 */
	blt L_Short		/* no: fill bytewise */

	/* move 'len' into r1, get 16-byte pattern in r2-r5 */
	mov r6, r2		/* temporarily move 'len' into r6 */
	bl L_GetPatternWord	/* get unaligned pattern word in r5 */
	mov r2, r5		/* move pattern word into r2 */
	bl L_GetPatternWord
	mov r3, r5
	bl L_GetPatternWord
	mov r4, r5
	bl L_GetPatternWord	/* fourth word stays in r5 */
	mov r0, r12		/* r0 was clobbered - restore it */
	mov r1, r6		/* move 'len' from r6 to r1 */

	b L_NotShort		/* long enough: join the common path */


/*----------------------------------------------------------------------------*/
/* Get an unaligned word at r1, returning it in r5.                          */
/* Increments r1 by 4, clobbers r0.                                           */
/* This is tailored to fit the register usage at the call sites.              */
/*----------------------------------------------------------------------------*/
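/* C equivalent, as a sketch (assumes a little-endian layout, which is how
 * the word is consumed here; the bytewise loads make no alignment demands
 * on the pattern pointer):
 *
 *	uint32_t get_pattern_word(const uint8_t **p) {
 *	    const uint8_t *s = *p;
 *	    uint32_t w = s[0] | (s[1] << 8) | (s[2] << 16) | ((uint32_t)s[3] << 24);
 *	    *p = s + 4;
 *	    return w;
 *	}
 */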
L_GetPatternWord:
	ldrb r5, [r1], #1	/* get the 1st byte at r1 */
	ldrb r0, [r1], #1	/* get the 2nd byte at r1 */
	orr r5, r5, r0, lsl #8	/* move into bits 15:8 */
	ldrb r0, [r1], #1	/* get the 3rd byte */
	orr r5, r5, r0, lsl #16	/* bits 23:16 */
	ldrb r0, [r1], #1	/* get the 4th byte */
	orr r5, r5, r0, lsl #24	/* bits 31:24 */
	bx lr