/*
 * Copyright (c) 2011 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 *
 * This file implements the following functions for the Swift micro-arch:
 *
 * void memset_pattern4(void *b, const void *pattern4, size_t len);
 * void memset_pattern8(void *b, const void *pattern8, size_t len);
 * void memset_pattern16(void *b, const void *pattern16, size_t len);
 *
 * The implementation of all three functions is fundamentally the same.
 * memset_pattern4 is extensively commented to explain the algorithm; refer
 * to it if you have any questions about the other two.
 */
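
/*
 * For reference, the behavior of these routines can be summarized by the
 * following C sketch (illustrative only; this is not the code assembled
 * below, and memset_pattern4_ref is a made-up name):
 *
 *     void memset_pattern4_ref(void *b, const void *pattern4, size_t len) {
 *         unsigned char *dst = b;
 *         const unsigned char *pat = pattern4;
 *         // The pattern is laid down starting at the first byte of the
 *         // buffer; if len is not a multiple of the pattern size, the
 *         // final copy of the pattern is truncated.
 *         for (size_t i = 0; i < len; ++i)
 *             dst[i] = pat[i % 4];
 *     }
 *
 * memset_pattern8 and memset_pattern16 behave the same way, with "% 8" and
 * "% 16" in place of "% 4".
 */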

#include <arm/arch.h>
#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD

.syntax unified
.code 32
.text
.globl _memset_pattern4$VARIANT$Swift
.globl _memset_pattern8$VARIANT$Swift
.globl _memset_pattern16$VARIANT$Swift

/******************************************************************************/

.align 4
_memset_pattern4$VARIANT$Swift:
    push    {r7,lr}
    mov     r7, sp

// Load the pattern and splat it to q0, then check if the buffer is at least
// 64 bytes long. If not, branch to a short-buffer implementation.
    ldr     r1, [r1]
    vdup.32 q0, r1
    subs    r3, r2, #64
    blo     L_short4

// We want to use aligned vector stores to fill the bulk of the buffer. In
// order to make that work, we need to rotate the pattern as necessary to
// match up with aligned locations, and we also need to extract the alignment
// of the destination pointer mod 16.
    lsl     ip, r0, #3
    and     lr, r0, #0xf        // alignment of destination pointer mod 16
    rsb     ip, ip, #32         // low five bits contain 32 - 8*(address%4).
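// (The full value of 32 - 8*address does not matter here: a rotate by n and
// a rotate by n mod 32 are the same rotate, so only the low five bits of ip
// are significant, and those are 32 - 8*(address%4).)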

// Before we start the aligned stores, we do a single unaligned store of
// 16 bytes of the pattern to the start of the buffer. Since the buffer is
// at least 64 bytes long, this store is known to lie entirely inside the
// buffer:
//
//                             first aligned address in buffer
//                                                             v
//  ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---
// ... | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f | 0 | 1 | 2 | ...
//  ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---
//       ^
//       unaligned store starts here:
//     [ 0   1   2   3   0   1   2   3   0   1   2   3   0   1   2   3 ]
    vst1.8  {q0}, [r0]!

// Subsequent stores will be aligned, and will start at the first aligned
// address in the buffer. We apply the rotation that we calculated before
// the vector store (in the low five bits of ip) to get the pattern that
// is to be stored starting at the aligned location. For example, in the
// picture above, the buffer had alignment of 3 mod 4, so the rotation to
// be applied is 32 - 8*3 = 8. Rotating the pattern right by 8 bits gives
// us [ 1 2 3 0 ] (remember, we're little-endian), which we see is what
// needs to be stored starting at the first aligned location.
//
// Besides rotating the pattern, we also need to adjust the length (by
// subtracting 16 - alignment mod 16), and to advance the pointer to the
// first aligned location.
    ror     r1, ip              // Pattern to use for aligned memory
    add     r3, lr
    bic     r0, #0xf            // destination for first aligned store
    subs    r3, #16             // updated length
    blo     1f

// Splat the rotated value across q1 and q2
    vdup.32 q1, r1
    vmov    q2, q1

// Main store loop. We write the splatted aligned pattern across 64 bytes
// per iteration, terminating the loop when the remaining length of the
// buffer is 64 bytes or less.
0:  subs    r3, #64
    vst1.32 {q1,q2}, [r0,:128]!
    vst1.32 {q1,q2}, [r0,:128]!
    bhi     0b

// The remaining length of the buffer is 64 bytes or less (but the total
// length of the buffer is at least 64 bytes; otherwise we would have
// branched to the "short" path). Thus, we can handle the entirety of the
// remaining buffer with two 32-byte unaligned stores.
//
// Again, we need to rotate the pattern to match the alignment, this time
// by 8*(length%4), and we also need to back up the destination pointer
// so that it points to precisely 64 bytes before the end of the buffer.
// We accomplish this by adding r3, which contains the remaining length of
// the buffer minus 64.
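//
// (Worked example: if 37 bytes remain, r3 is 37 - 64 = -27. Adding r3 to r0
// backs the pointer up to exactly 64 bytes before the end of the buffer, and
// 8*(-27) is congruent to 8 mod 32, the same rotation as 8*(37%4), so the
// ror realigns the pattern for the new starting address.)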
1:  lsl     ip, r3, #3
    ror     r1, ip
    vdup.32 q8, r1
    vmov    q9, q8
    add     r0, r3
    vst1.32 {q8,q9}, [r0]!
    vst1.32 {q8,q9}, [r0]
    pop     {r7,pc}

L_short4:
// If we branch here, the buffer is less than 64 bytes long. At this point,
// register contents are as follows:
//
//    r0    pointer to the buffer
//    r1    pattern
//    r2    buffer length
//    q0    splatted pattern
//
// To begin, we store eight bytes at a time until the remaining length is
// less than eight bytes.
    subs    r3, r2, #8
    blo     1f
0:  subs    r3, #8
    vst1.32 {d0}, [r0]!
    bhs     0b

// Then we store one byte at a time, rotating the pattern to get the next
// byte, until we reach the end of the buffer.
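//
// (The pattern was loaded little-endian, so the low byte of r1 is the byte
// that belongs at the current address; strb stores it, and ror #8 brings the
// next pattern byte into the low position. With pattern bytes p0 p1 p2 p3,
// r1 holds p3:p2:p1:p0, so successive iterations store p0, p1, p2, ...)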
    add     r2, r3, #8
1:  subs    r2, #1
    strbhs  r1, [r0],#1
    ror     r1, #8
    bhi     1b
    pop     {r7,pc}

/******************************************************************************/

.align 4
_memset_pattern8$VARIANT$Swift:
// The implementation of this function is substantially identical to that of
// memset_pattern4. The only differences are in how we rotate the pattern for
// the purposes of extracting the bytes to store. For clarity, only those
// differences are commented here; consult memset_pattern4 (above) for
// a detailed description of the algorithm used.
    push    {r7,lr}
    mov     r7, sp
    vld1.8  {d0}, [r1]
    vmov    d1, d0
    subs    r3, r2, #64
    blo     L_short8

    bic     sp, #0xf            // Align stack to 16 bytes and write 32 bytes
    sub     sp, #16             // of pattern to the stack. We will use
    vst1.8  {q0}, [sp,:128]     // unaligned loads from this scratch buffer
    sub     sp, #16             // to get rotated forms of the pattern.
    vst1.8  {q0}, [sp,:128]
    and     ip, r0, #0x7        // Now generate an unaligned pointer to the
    rsb     ip, ip, #8          // rotated pattern that we need to use for
    add     ip, sp              // aligned stores in the main loop.
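//
// (The two stores above leave four back-to-back copies of the 8-byte pattern
// in the 32 bytes at sp. The byte that belongs at any 8-byte aligned address
// is pattern byte (8 - r0%8) mod 8, where r0 is still the incoming
// destination pointer, so a 16-byte unaligned load from sp + (8 - r0%8)
// picks up the pattern already rotated for the aligned stores. For example,
// if the destination is 3 mod 8, ip is sp+5 and the load begins with pattern
// byte 5.)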
    and     lr, r0, #0xf
    vst1.8  {q0}, [r0]!
    add     r3, lr
    bic     r0, #0xf
    subs    r3, #16
    blo     1f
    vld1.8  {q1}, [ip]
    vmov    q2, q1
0:  subs    r3, #64
    vst1.32 {q1,q2}, [r0,:128]!
    vst1.32 {q1,q2}, [r0,:128]!
    bhi     0b
1:  and     lr, r3, #0x7        // Generate an unaligned pointer to the
    add     ip, lr              // rotated pattern to use for cleanup.
    vld1.8  {q8}, [ip]
    vmov    q9, q8
    add     r0, r3
    vst1.32 {q8,q9}, [r0]!
    vst1.32 {q8,q9}, [r0]
    mov     sp, r7              // Restore stack pointer
    pop     {r7,pc}

L_short8:
    subs    r2, #8
    blo     1f
0:  subs    r2, #8
    vst1.32 {d0}, [r0]!
    bhs     0b
1:  adds    r2, #8
    beq     3f
2:  vst1.8  {d0[0]}, [r0]!      // Store one byte from NEON
    vext.8  d0, d0, d0, #1      // Use VEXT to rotate pattern
    subs    r2, #1
    bhi     2b
3:  pop     {r7,pc}

/******************************************************************************/

.align 4
_memset_pattern16$VARIANT$Swift:
// The implementation of this function is substantially identical to that of
// memset_pattern4. The only differences are in how we rotate the pattern for
// the purposes of extracting the bytes to store. For clarity, only those
// differences are commented here; consult memset_pattern4 (above) for
// a detailed description of the algorithm used.
    push    {r7,lr}
    mov     r7, sp
    vld1.8  {q0}, [r1]
    subs    r3, r2, #64
    blo     L_short16

    bic     sp, #0xf            // Align stack to 16 bytes and write 48 bytes
    sub     sp, #16             // of pattern to the stack. We will use
    vst1.8  {q0}, [sp,:128]     // unaligned loads from this scratch buffer
    sub     sp, #16             // to get rotated forms of the pattern.
    vst1.8  {q0}, [sp,:128]
    sub     sp, #16
    vst1.8  {q0}, [sp,:128]
    and     lr, r0, #0xf        // Now generate an unaligned pointer to the
    rsb     ip, lr, #16         // rotated pattern that we need to use for
    add     ip, sp              // aligned stores in the main loop.
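//
// (Same idea as in memset_pattern8, but with a 16-byte pattern we keep three
// copies, 48 bytes, of scratch: the main-loop load comes from
// sp + (16 - r0%16), the cleanup load below comes from up to 15 bytes beyond
// that, and both stay within the 48-byte scratch area. The main-loop load
// starts with pattern byte (16 - r0%16) mod 16, the byte that belongs at the
// first aligned store address.)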
    vst1.8  {q0}, [r0]!
    add     r3, lr
    bic     r0, #0xf
    subs    r3, #16
    blo     1f
    vld1.8  {q1}, [ip]
    vmov    q2, q1
0:  subs    r3, #64
    vst1.32 {q1,q2}, [r0,:128]!
    vst1.32 {q1,q2}, [r0,:128]!
    bhi     0b
1:  and     lr, r3, #0xf        // Generate an unaligned pointer to the
    add     ip, lr              // rotated pattern to use for cleanup.
    vld1.8  {q8}, [ip]
    vmov    q9, q8
    add     r0, r3
    vst1.32 {q8,q9}, [r0]!
    vst1.32 {q8,q9}, [r0]
    mov     sp, r7              // Restore stack pointer
    pop     {r7,pc}

L_short16:
    subs    r2, #16
    blo     1f
0:  subs    r2, #16
    vst1.32 {q0}, [r0]!
    bhs     0b
1:  adds    r2, #16
    beq     3f
2:  vst1.8  {d0[0]}, [r0]!      // Store one byte from NEON
    vext.8  q0, q0, q0, #1      // Use VEXT to rotate pattern
    subs    r2, #1
    bhi     2b
3:  pop     {r7,pc}

#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD