/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include
#include

/*
 * This file contains the following functions:
 *
 *	void memset_pattern4(void *b, const void *c4, size_t len)
 *	void memset_pattern8(void *b, const void *c8, size_t len)
 *	void memset_pattern16(void *b, const void *c16, size_t len)
 *
 * The memset() is implemented in the bzero.s file.
 *
 * This is a reasonably well optimized version of memset_pattern* routines
 * implemented for ARM processors using the ARMv4 and later instruction sets.
 * These routines use the ARM's core registers.
 *
 * The algorithm is to align the destination pointer on a 16 byte boundary
 * and then blast data 64 bytes at a time, in two stores of 32 bytes per loop.
 *
 */
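/*
 * Illustrative reference only (kept in a comment, not assembled): a minimal
 * C sketch of the behaviour these routines implement. The helper name
 * memset_pattern_ref and the parameter plen (4, 8 or 16) are assumptions for
 * this sketch, not part of the exported interface. The pattern repeats from
 * the start of the buffer; when len is not a multiple of plen, a truncated
 * copy of the pattern is written at the end.
 *
 *     #include <stddef.h>
 *
 *     static void
 *     memset_pattern_ref(void *b, const void *pat, size_t plen, size_t len)
 *     {
 *         unsigned char *dst = (unsigned char *)b;
 *         const unsigned char *p = (const unsigned char *)pat;
 *         for (size_t i = 0; i < len; i++)
 *             dst[i] = p[i % plen];
 *     }
 */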
        .text
        .align 2
        .syntax unified

/*----------------------------------------------------------------------------*/
/* void     memset_pattern4(void *ptr, const void *pattern4, size_t len);      */
/*                                                                              */
/* r0 << destination pointer                                                    */
/* r1 << pointer to 4-byte pattern                                              */
/* r2 << 'len' (length of destination buffer in bytes)                          */
/*----------------------------------------------------------------------------*/
        .globl _memset_pattern4
_memset_pattern4:
        cmp     r2, #0                  /* check if len is zero */
        bxeq    lr                      /* return if length is zero */

        /* We need some registers, so save volatiles on stack */
        /* Avoid r7 (frame pointer) and r9 (thread register) */
        stmfd   sp!, {r4-r7, lr}
        add     r7, sp, #12             /* establish frame */
        stmfd   sp!, {r8, r10-r11}

        /* copy destination base pointer r0 to r12 and leave r0 alone */
        /* so that we return original pointer back to the caller */
        mov     r12, r0

        /* Check if 'len' is long enough to bother alignment of destination */
        /* pointer */
        cmp     r2, #32                 /* long enough to bother aligning? */
        movlt   r3, #4                  /* move pattern length into r3 */
        movlt   r10, #4                 /* pattern index */
        movlt   r11, r1                 /* move pattern pointer into r11 */
        blt     L_Short                 /* no */

        /* move 'len' into r1, get 4-byte pattern in r2 */
        mov     r6, r2                  /* temporarily move 'len' into r6 */
        bl      L_GetPatternWord        /* get unaligned pattern word in r5 */
        mov     r2, r5                  /* move pattern word into r2 */
        mov     r0, r12                 /* r0 was clobbered - restore it */
        mov     r1, r6                  /* move 'len' from r6 to r1 */
        mov     r3, r2                  /* copy 4-byte pattern into r3, r4 and r5 registers */
        mov     r4, r2
        mov     r5, r2

L_NotShort:
        /* Check for 16 or 32 byte aligned destination pointer */
        tst     r12, #0x1F              /* check for 32 byte aligned */
        beq     L_Aligned
        tst     r12, #0xF               /* check for 16 byte aligned */
        beq     L_16ByteAligned
        b       L_Unaligned             /* yes */

L_Bytewise:
        ldrb    r4, [r11], #1
        strb    r4, [r12], #1
        subs    r10, #1
        moveq   r10, r3
        moveq   r11, r1
        sub     r2, #1

L_Short:
        cmp     r2, #0                  /* more bytes left? */
        bne     L_Bytewise
        ldm     sp!, {r8, r10-r11}      /* restores registers from stack */
        ldm     sp!, {r4-r7, lr}        /* restore & return from subroutine */
        bx      lr

/* 'len' is long enough to justify aligning the destination pointer           */
/*                                                                             */
/* By the time we reach here, data is stored in registers as follows:         */
/*   r1    << 'len' (length of destination buffer in bytes)                   */
/*   r2-r5 << pattern; either 4x4byte OR 2x8byte OR 1x16-byte                  */
/*   r12   << destination pointer copy (scratch register)                      */
/*   r0    << destination pointer original                                     */
/*                                                                             */
/* Use r11 as scratch register to store the #bytes offset to 16-byte align    */
/*                                                                             */
/* Unaligned on 32-byte boundary, store 1-15 bytes until 16-byte aligned      */
/* As we store these bytes, we rotate the pattern stored in r2-r5 to reflect  */
/* the alignment.                                                              */

L_Unaligned:
        mov     r11, r12, lsl #28
        rsb     r11, r11, #0
        msr     cpsr_f, r11             /* Bits[31:28] of cpsr now contain #bytes to align */

L_Store15BytesAndRotatePattern:
        strbvs  r2, [r12], #1           /* v is set, unaligned in the 1s column */
        andvs   r6, r2, #0xFF           /* Rotate pattern right in r2-r5 by 1 byte */
        andvs   r8, r3, #0xFF           /* Consider registers r2-r5 as a contiguous */
        andvs   r10, r4, #0xFF          /* 16-byte register with r2 containing LSB */
        andvs   r11, r5, #0xFF          /* and r5 containing MSB */
        lsrvs   r2, r2, #8
        lsrvs   r3, r3, #8
        lsrvs   r4, r4, #8
        lsrvs   r5, r5, #8
        orrvs   r2, r2, r8, lsl #24
        orrvs   r3, r3, r10, lsl #24
        orrvs   r4, r4, r11, lsl #24
        orrvs   r5, r5, r6, lsl #24

        strhcs  r2, [r12], #2           /* c is set, unaligned in the 2s column */
        movcs   r6, r2, lsl #16         /* Rotate pattern right in r2-r5 by 2 bytes */
        movcs   r8, r3, lsl #16
        movcs   r10, r4, lsl #16
        movcs   r11, r5, lsl #16
        lsrcs   r2, r2, #16
        lsrcs   r3, r3, #16
        lsrcs   r4, r4, #16
        lsrcs   r5, r5, #16
        orrcs   r2, r2, r8
        orrcs   r3, r3, r10
        orrcs   r4, r4, r11
        orrcs   r5, r5, r6

        streq   r2, [r12], #4           /* z is set, unaligned in the 4s column */
        moveq   r6, r2                  /* Rotate pattern right in r2-r5 by 4 bytes */
        moveq   r2, r3
        moveq   r3, r4
        moveq   r4, r5
        moveq   r5, r6

        stmmi   r12!, {r2-r3}           /* n is set, unaligned in the 8s column */
        movmi   r6, r2                  /* Rotate pattern right in r2-r5 by 8 bytes */
        movmi   r8, r3
        movmi   r2, r4
        movmi   r3, r5
        movmi   r4, r6
        movmi   r5, r8

        mrs     r11, cpsr               /* copy cpsr into r11 */
        subs    r1, r1, r11, lsr #28
        ldmeq   sp!, {r8, r10-r11}      /* restores registers from stack */
        ldmeq   sp!, {r4-r7, lr}        /* restore & return from subroutine */
        bxeq    lr
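/*
 * Worked example of the alignment trick above (illustrative; the address is
 * an assumption for the example): suppose the destination ends in 0x9, so 7
 * bytes are needed to reach a 16-byte boundary. 'lsl #28' places that low
 * nibble in bits [31:28], 'rsb' turns it into 16 - 9 = 7 (0b0111), and
 * 'msr cpsr_f' loads it into the flags: bit 31 -> N (8s), bit 30 -> Z (4s),
 * bit 29 -> C (2s), bit 28 -> V (1s). With 0b0111 the V, C and Z conditional
 * stores above execute, writing 1 + 2 + 4 = 7 bytes and rotating the 16-byte
 * pattern in r2-r5 by the same total, so r2 again holds the bytes that belong
 * at the newly aligned address. The 'subs' then deducts those 7 bytes from
 * 'len'.
 */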
/* By the time we reach here, we are 16-byte aligned and r2-r5 contains the   */
/* rotated pattern. Now let's make sure we are 32-byte aligned.               */

L_16ByteAligned:
        tst     r12, #(1 << 4)
        stmne   r12!, {r2-r5}
        subsne  r1, r1, #16

/* By the time we reach here, data is stored in registers as follows:         */
/*   r1    << 'len' (remaining length of destination buffer in bytes)         */
/*   r2-r5 << rotated pattern; either 4x4byte OR 2x8byte OR 1x16-byte          */
/*   r12   << aligned destination pointer copy (scratch register)              */

L_Aligned:
        cmp     r1, #64
        blt     L_AlignedLessThan64

        /* Copy pattern in four more registers so that we can do 64 byte transfers */
        mov     r6, r2
        mov     r8, r3
        mov     r10, r4
        mov     r11, r5

        /* At this point, we are 16-byte aligned and 'len' is at least 64 bytes */
        /* Let's transfer 64 bytes at a time until len becomes less than 64 bytes */
        sub     r1, r1, #64             /* pre-subtract to avoid extra compare in loop */
L_Loop64:
        stm     r12!, {r2-r6, r8, r10-r11}
        subs    r1, r1, #64
        stm     r12!, {r2-r6, r8, r10-r11}
        bge     L_Loop64

        /* return if 'len' is zero */
        adds    r1, r1, #64             /* readjust length; previously subtracted extra 64 */
        ldmeq   sp!, {r8, r10-r11}      /* restores registers from stack */
        ldmeq   sp!, {r4-r7, lr}        /* restore & return from subroutine */
        bxeq    lr

L_AlignedLessThan64:
        /* do we have 16 or more bytes left? */
        cmp     r1, #16
        stmge   r12!, {r2-r5}
        subsge  r1, r1, #16
        bgt     L_AlignedLessThan64
        ldmeq   sp!, {r8, r10-r11}      /* restores registers from stack */
        ldmeq   sp!, {r4-r7, lr}        /* restore & return from subroutine */
        bxeq    lr

L_AlignedLessThan16:
        /* store last up to 15 bytes */
        /* move the remaining len bits [3:0] to the flags area of cpsr */
        mov     r1, r1, lsl #28
        msr     cpsr_f, r1

        stmmi   r12!, {r2-r3}           /* n is set, store 8 bytes */
        movmi   r2, r4                  /* shift vector down 8 bytes */
        movmi   r3, r5
        streq   r2, [r12], #4           /* z is set, store 4 bytes */
        moveq   r2, r3                  /* shift vector down 4 bytes */
        strhcs  r2, [r12], #2           /* c is set, store 2 bytes */
        lsrcs   r2, #16                 /* shift register right 2 bytes */
        strbvs  r2, [r12], #1           /* v is set, store 1 byte */

        ldm     sp!, {r8, r10-r11}      /* restores registers from stack */
        ldm     sp!, {r4-r7, lr}        /* restore & return from subroutine */
        bx      lr
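/*
 * Illustrative reference only (kept in a comment, not assembled): a rough C
 * sketch of the aligned store path above. The helper name aligned_tail and
 * its parameters are assumptions for this sketch: dst is 16-byte aligned,
 * pat holds 16 bytes of (rotated) pattern, and len is the remaining count.
 *
 *     #include <string.h>
 *
 *     static void
 *     aligned_tail(unsigned char *dst, const unsigned char pat[16], size_t len)
 *     {
 *         while (len >= 64) {               // L_Loop64: 64 bytes per pass
 *             for (int i = 0; i < 4; i++)
 *                 memcpy(dst + 16 * i, pat, 16);
 *             dst += 64; len -= 64;
 *         }
 *         while (len >= 16) {               // L_AlignedLessThan64
 *             memcpy(dst, pat, 16); dst += 16; len -= 16;
 *         }
 *         memcpy(dst, pat, len);            // L_AlignedLessThan16: last 0-15 bytes
 *     }
 */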
/*----------------------------------------------------------------------------*/
/* void     memset_pattern8(void *ptr, const void *pattern8, size_t len);      */
/*                                                                              */
/* r0 << destination pointer                                                    */
/* r1 << pointer to 8-byte pattern                                              */
/* r2 << 'len' (length of destination buffer in bytes)                          */
/*----------------------------------------------------------------------------*/
        .globl _memset_pattern8
_memset_pattern8:
        cmp     r2, #0                  /* check if len is zero */
        bxeq    lr                      /* return if length is zero */

        /* We need some registers, so save volatiles on stack */
        /* Avoid r7 (frame pointer) and r9 (thread register) */
        stmfd   sp!, {r4-r7, lr}
        add     r7, sp, #12             /* establish frame */
        stmfd   sp!, {r8, r10-r11}

        /* copy destination base pointer r0 to r12 and leave r0 alone */
        /* so that we return original pointer back to the caller */
        mov     r12, r0

        /* Check if 'len' is long enough to bother alignment of destination */
        /* pointer */
        cmp     r2, #32                 /* long enough to bother aligning? */
        movlt   r3, #8                  /* move pattern length into r3 */
        movlt   r10, #8                 /* pattern index */
        movlt   r11, r1                 /* move pattern pointer into r11 */
        blt     L_Short                 /* no */

        /* move 'len' into r1, get 8-byte pattern in r2-r3 */
        mov     r6, r2                  /* temporarily move 'len' into r6 */
        bl      L_GetPatternWord        /* get unaligned pattern word in r5 */
        mov     r2, r5                  /* move pattern word into r2 */
        bl      L_GetPatternWord
        mov     r3, r5
        mov     r0, r12                 /* r0 was clobbered - restore it */
        mov     r1, r6                  /* move 'len' from r6 to r1 */
        mov     r4, r2                  /* copy 8-byte pattern into r4-r5 registers */
        mov     r5, r3
        b       L_NotShort              /* yes */

/*----------------------------------------------------------------------------*/
/* void     memset_pattern16(void *ptr, const void *pattern16, size_t len);    */
/*                                                                              */
/* r0 << destination pointer                                                    */
/* r1 << pointer to 16-byte pattern                                             */
/* r2 << 'len' (length of destination buffer in bytes)                          */
/*----------------------------------------------------------------------------*/
        .globl _memset_pattern16
_memset_pattern16:
        cmp     r2, #0                  /* check if len is zero */
        bxeq    lr                      /* return if length is zero */

        /* We need some registers, so save volatiles on stack */
        /* Avoid r7 (frame pointer) and r9 (thread register) */
        stmfd   sp!, {r4-r7, lr}
        add     r7, sp, #12             /* establish frame */
        stmfd   sp!, {r8, r10-r11}

        /* copy destination base pointer r0 to r12 and leave r0 alone */
        /* so that we return original pointer back to the caller */
        mov     r12, r0

        /* Check if 'len' is long enough to bother alignment of destination */
        /* pointer */
        cmp     r2, #32                 /* long enough to bother aligning? */
        movlt   r3, #16                 /* move pattern length into r3 */
        movlt   r10, #16                /* pattern index */
        movlt   r11, r1                 /* move pattern pointer into r11 */
        blt     L_Short                 /* no */

        /* move 'len' into r1, get 16-byte pattern in r2-r5 */
        mov     r6, r2                  /* temporarily move 'len' into r6 */
        bl      L_GetPatternWord        /* get unaligned pattern word in r5 */
        mov     r2, r5                  /* move pattern word into r2 */
        bl      L_GetPatternWord
        mov     r3, r5
        bl      L_GetPatternWord
        mov     r4, r5
        bl      L_GetPatternWord        /* 4th pattern word stays in r5 */
        mov     r0, r12                 /* r0 was clobbered - restore it */
        mov     r1, r6                  /* move 'len' from r6 to r1 */
        b       L_NotShort              /* yes */

/*----------------------------------------------------------------------------*/
/* Get an unaligned word at r1, returning it in r5.                            */
/* Increments r1 by 4, clobbers r0.                                            */
/* This is tailored to fit the register usage by the call sites.               */
/*----------------------------------------------------------------------------*/
L_GetPatternWord:
        ldrb    r5, [r1], #1            /* get the 1st byte at r1 */
        ldrb    r0, [r1], #1            /* get the 2nd byte at r1 */
        orr     r5, r5, r0, lsl #8      /* move into bits 15:8 */
        ldrb    r0, [r1], #1            /* get the 3rd byte */
        orr     r5, r5, r0, lsl #16     /* bits 23:16 */
        ldrb    r0, [r1], #1            /* get the 4th byte */
        orr     r5, r5, r0, lsl #24     /* bits 31:24 */
        bx      lr
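/*
 * Typical use from C (illustrative; the function name, buffer and fill value
 * are assumptions for the example). On Darwin the memset_pattern* routines
 * are declared in <string.h>:
 *
 *     #include <stdint.h>
 *     #include <string.h>
 *
 *     void fill_with_color(uint8_t *pixels, size_t nbytes)
 *     {
 *         uint32_t rgba = 0x11223344;
 *         memset_pattern4(pixels, &rgba, nbytes);
 *     }
 */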