]>
Commit | Line | Data |
---|---|---|
7b00c0c4 A |
1 | /* |
2 | * Copyright (c) 2009 Apple Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. Please obtain a copy of the License at | |
10 | * http://www.opensource.apple.com/apsl/ and read it before using this | |
11 | * file. | |
12 | * | |
13 | * The Original Code and all software distributed under the License are | |
14 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
15 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
16 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
17 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
18 | * Please see the License for the specific language governing rights and | |
19 | * limitations under the License. | |
20 | * | |
21 | * @APPLE_LICENSE_HEADER_END@ | |
22 | */ | |
ad3c9f2a A |
23 | |
24 | #include <arm/arch.h> | |
25 | #if defined _ARM_ARCH_7 && !defined VARIANT_DYLD | |
26 | ||
27 | /********************************************************************** | |
28 | * Cortex-A8 implementation * | |
29 | **********************************************************************/ | |
30 | ||
31 | // Cortex-A8 implementations of memset( ) and bzero( ). Main loop is 64-byte | |
32 | // NEON stores, unless the buffer length is > 1k. Beyond that point, there is | |
33 | // little to no speed advantage with NEON (and a slight regression in some | |
34 | // measured cases), so we switch to the GPRs. | |
35 | // | |
36 | // The crossover point should be reevaluated for future architectures. | |
37 | // | |
38 | // -- Stephen Canon, August 2009 | |
39 | ||
40 | .text | |
41 | .syntax unified | |
42 | .code 16 | |
43 | ||
44 | // void bzero(void * destination, | |
45 | // size_t length); | |
46 | // | |
47 | // zeros out a buffer length bytes long, beginning at the address destination. | |
// ___bzero / _bzero: zero `length` bytes starting at `destination`.
// Implemented by rewriting the arguments into memset(dest, 0, length)
// form and falling through — there is deliberately NO branch or return
// here; _memset$VARIANT$CortexA8 must immediately follow this stub.
48 | .thumb_func ___bzero$VARIANT$CortexA8 | |
49 | .globl ___bzero$VARIANT$CortexA8 | |
50 | .thumb_func _bzero$VARIANT$CortexA8 | |
51 | .globl _bzero$VARIANT$CortexA8 | |
52 | .align 2 | |
53 | ___bzero$VARIANT$CortexA8: | |
54 | _bzero$VARIANT$CortexA8: | |
55 | mov r2, r1 // match the API to memset(dest, 0, length) | |
// eor r1, r1 zeroes the fill value (memset's second argument).
56 | eor r1, r1 // and fall through into memset | |
57 | ||
58 | // void *memset(void * destination, | |
59 | // int value, size_t n); | |
60 | // | |
61 | // writes value converted to an unsigned char to n successive bytes, beginning | |
62 | // at destination. | |
63 | ||
64 | // Notes on register usage: | |
65 | // | |
66 | // Throughout this function, registers have nearly constant usage; the pattern | |
67 | // is: | |
68 | // | |
69 | // r0 holds the original destination pointer, unmodified. This value | |
70 | // must be returned by the routine, so it is easiest to just leave it | |
71 | // in place. | |
72 | // r1 holds the value that is being copied into the buffer, in some stage | |
73 | // of splattedness. The low byte is guaranteed to always have the value | |
74 | // but the higher bytes may or may not contain copies of it. | |
75 | // r2 holds the length minus some offset, where the offset is always the | |
76 | // number of bytes that the current loop stores per iteration. | |
77 | // r3-r6,r8,r10,r11 are used with stmia, and will only ever contain splatted | |
78 | // copies of the value to be stored. | |
79 | // ip holds a pointer to the lowest byte in the array that has not yet been | |
80 | // set to hold value. | |
81 | // q0 and q1 hold splatted copies of the value in the vector path, and are | |
82 | // otherwise unused. | |
83 | ||
// void *memset(void *destination, int value, size_t n)
// In:  r0 = destination, r1 = value, r2 = n.  Out: r0 = destination.
// r2 is kept *biased* throughout: it holds length minus the store size
// of the current loop (8, or 64), and the sign flag from subs/adds
// steers the exits.  ip walks the buffer so r0 survives for the return.
84 | .thumb_func _memset$VARIANT$CortexA8 | |
85 | .globl _memset$VARIANT$CortexA8 | |
86 | .align 2 | |
87 | _memset$VARIANT$CortexA8: | |
88 | mov ip, r0 // copy destination pointer. | |
89 | subs r2, #0x8 // if length - 8 is negative (i.e. length | |
90 | and r1, #0xff // is less than 8), jump to cleanup path. | |
91 | blt L_scalarCleanup // | |
92 | ||
93 | tst ip, #0x7 // if the destination is doubleword | |
94 | beq L_vectorCopy // aligned, jump to fast path. | |
95 | ||
// length >= 8 is guaranteed here, so at most 7 single-byte stores
// are needed and cannot exhaust the buffer.
96 | 0: strb r1, [ip], #1 // store one byte at a time until | |
97 | sub r2, #1 // destination pointer is 8 byte aligned. | |
98 | tst ip, #7 // | |
99 | bne 0b // | |
100 | ||
101 | cmp r2, #0x0 // if length - 8 is negative, | |
102 | blt L_scalarCleanup // jump to the cleanup code | |
103 | ||
104 | L_vectorCopy: | |
// subs rebiases r2 from (length - 8) to (length - 64) for the
// 64-byte main loop; vdup/vmov splat the byte across q0 and q1.
105 | vdup.8 q0, r1 // splat the byte to be stored across | |
106 | subs r2, #0x38 // q0 and q1, and check if length - 64 | |
107 | vmov q1, q0 // is negative; if so, jump to the | |
108 | blt L_vectorCleanup // cleanup code. | |
109 | ||
110 | tst ip, #0x38 // if the destination is cacheline | |
111 | beq L_cachelineAligned // aligned, jump to the fast path. | |
112 | ||
113 | 0: vst1.64 {d0}, [ip, :64]! // store one double word at a time until | |
114 | sub r2, #8 // the destination is 64-byte aligned | |
115 | tst ip, #0x38 // | |
116 | bne 0b | |
117 | ||
118 | cmp r2, #0x0 // if length - 64 is negative, | |
119 | blt L_vectorCleanup // jump to the cleanup code | |
120 | ||
121 | L_cachelineAligned: | |
122 | cmp r2, #0x3c0 // r2 = length - 64, so this is length >= 1024: | |
123 | bge L_useSTMIA // we use stmia instead | |
124 | ||
// Main NEON loop: two 32-byte stores (q0+q1 each) = 64 bytes per
// iteration; the subs between them sets the flags tested by bge.
125 | .align 4 // main loop | |
126 | 0: vst1.64 {q0,q1}, [ip, :256]! // store 32 bytes | |
127 | subs r2, #0x40 // decrement length by 64 | |
128 | vst1.64 {q0,q1}, [ip, :256]! // store 32 bytes | |
129 | bge 0b // if length - 64 >= 0, continue | |
130 | ||
131 | L_vectorCleanup: | |
// Rebias r2 from (length - 64) back to (length - 8) for the
// doubleword tail loop.
132 | adds r2, #0x38 // if (length - 8) < 0, goto scalar cleanup | |
133 | blt L_scalarCleanup // | |
134 | ||
135 | 0: subs r2, #8 // store one double word at a time until | |
136 | vst1.64 {d0}, [ip, :64]! // (length - 8) < 0. | |
137 | bge 0b | |
138 | ||
139 | L_scalarCleanup: | |
140 | adds r2, #8 // restore length | |
141 | beq 1f // early out if zero. | |
142 | ||
143 | 0: strb r1, [ip], #1 // store one byte at a time until length | |
144 | subs r2, #1 // is zero. | |
145 | bne 0b // | |
// r0 still holds the original destination, which is the return value.
146 | 1: bx lr // return. | |
147 | ||
148 | // STMIA loop for large buffers | |
149 | // | |
150 | // For stores larger than 1024 bytes, we use STMIA because we can't get enough | |
151 | // of a speedup from NEON to offset the higher power draw of the NEON unit. | |
152 | // | |
153 | // This crossover should be reevaluated on future architectures. | |
154 | // | |
155 | // We avoid using r7 and r9 even though it's not strictly necessary. | |
156 | ||
157 | L_useSTMIA: | |
// r4-r6, r8, r10, r11 are callee-saved, so preserve them; the two
// orrs splat the low byte of r1 across all four of its bytes, then
// the splatted word is copied into the other seven store registers.
158 | push {r4,r5,r6,r8,r10,r11} | |
159 | orr r1, r1, r1, lsl #8 | |
160 | orr r1, r1, r1, lsl #16 | |
161 | mov r3, r1 | |
162 | mov r4, r1 | |
163 | mov r5, r1 | |
164 | mov r6, r1 | |
165 | mov r8, r1 | |
166 | mov r10, r1 | |
167 | mov r11, r1 | |
// Each stmia writes 8 registers = 32 bytes; two per iteration = 64,
// matching the subs #0x40 and the (length - 64) bias already in r2.
168 | .align 4 | |
169 | 0: stmia ip!, {r1,r3,r4,r5,r6,r8,r10,r11} | |
170 | subs r2, #0x40 | |
171 | stmia ip!, {r1,r3,r4,r5,r6,r8,r10,r11} | |
172 | bge 0b | |
173 | pop {r4,r5,r6,r8,r10,r11} | |
// On exit r2 = length - 64 (negative); L_vectorCleanup rebiases it
// and finishes the tail with NEON doubleword and byte stores.
174 | b L_vectorCleanup | |
175 | ||
176 | #endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD | |
177 |