--- /dev/null
+/*
+ * Copyright (c) 2011 Apple Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ *
+ * This file implements the following functions for the Swift micro-arch:
+ *
+ * void memset_pattern4(void *b, const void *pattern4, size_t len);
+ * void memset_pattern8(void *b, const void *pattern8, size_t len);
+ * void memset_pattern16(void *b, const void *pattern16, size_t len);
+ *
+ * The implementation of all three functions is fundamentally the same.
+ * memset_pattern4 is extensively commented to explain the algorithm;
+ * refer to it if you have any questions about the other two.
+ */
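+
+// For reference, these functions fill a buffer with a repeating 4-, 8-, or
+// 16-byte pattern. A rough C sketch of the intended semantics of
+// memset_pattern4 (illustration only; the _ref name is ours and nothing in
+// this comment is assembled):
+//
+//     #include <stddef.h>
+//
+//     void memset_pattern4_ref(void *b, const void *pattern4, size_t len) {
+//         unsigned char *dst = b;
+//         const unsigned char *pat = pattern4;
+//         for (size_t i = 0; i < len; ++i)
+//             dst[i] = pat[i % 4];
+//     }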
+
+#include <arm/arch.h>
+#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD
+
+.syntax unified
+.code 32
+.text
+.globl _memset_pattern4$VARIANT$Swift
+.globl _memset_pattern8$VARIANT$Swift
+.globl _memset_pattern16$VARIANT$Swift
+
+/******************************************************************************/
+
+.align 4
+_memset_pattern4$VARIANT$Swift:
+ push {r7,lr}
+ mov r7, sp
+
+// Load the pattern and splat it to q0, then check if the buffer is at least
+// 64 bytes long. If not, branch to a short-buffer implementation.
+ ldr r1, [r1]
+ vdup.32 q0, r1
+ subs r3, r2, #64
+ blo L_short4
+
+// We want to use aligned vector stores to fill the bulk of the buffer. In
+// order to make that work, we need to rotate the pattern as necessary to
+// match up with aligned locations, and we also need to extract the alignment
+// of the destination pointer mod 16.
+ lsl ip, r0, #3
+ and lr, r0, #0xf // alignment of destination pointer mod 16
+ rsb ip, ip, #32 // low five bits contain 32 - 8*(address%4).
+
+// Before we start the aligned stores, we do a single unaligned store of
+// 16 bytes of the pattern to the start of the buffer. Since the buffer is
+// at least 64 bytes long, this store is known to lie entirely inside the
+// buffer:
+// first aligned address in buffer
+// v
+// ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---
+// ... | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f | 0 | 1 | 2 | ...
+// ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---
+// ^
+// unaligned store starts here:
+// [ 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 ]
+ vst1.8 {q0}, [r0]!
+
+// Subsequent stores will be aligned, and will start at the first aligned
+// address in the buffer. We apply the rotation that we calculated before
+// the vector store (in the low five bits of ip) to get the pattern that
+// is to be stored starting at the aligned location. For example, in the
+// picture above, the buffer had alignment of 3 mod 4, so the rotation to
+// be applied is 32 - 8*3 = 8. Rotating the pattern right by 8 bits gives
+// us [ 1 2 3 0 ] (remember, we're little-endian), which we see is what
+// needs to be stored starting at the first aligned location.
+//
+// Besides rotating the pattern, we also need to adjust the length (by
+// subtracting 16 - alignment mod 16), and to advance the pointer to the
+// first aligned location.
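+//
+// As a rough C model of the rotation (illustration only: pat is the 32-bit
+// pattern loaded into r1, b is the original destination pointer, and ror32
+// is a helper we define here, not part of this file):
+//
+//     static inline uint32_t ror32(uint32_t x, unsigned r) {
+//         r &= 31;
+//         return r ? (x >> r) | (x << (32 - r)) : x;
+//     }
+//
+//     // Pattern to store at any 4-byte-aligned address inside the buffer:
+//     uint32_t aligned_pat = ror32(pat, (32 - 8 * ((uintptr_t)b & 3)) & 31);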
+ ror r1, ip // Pattern to use for aligned memory
+ add r3, lr
+ bic r0, #0xf // destination for first aligned store
+ subs r3, #16 // updated length
+ blo 1f
+
+// Splat the rotated value across q1 and q2
+ vdup.32 q1, r1
+ vmov q2, q1
+
+// Main store loop. We write the splatted aligned pattern across 64 bytes
+// per iteration, terminating the loop when the remaining length of the
+// buffer is 64 bytes or less.
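+//
+// Roughly, in C (illustration only: store64_aligned stands in for the two
+// vst1 pairs below, and "remaining" is measured from the current dst):
+//
+//     do {                                    // remaining >= 64 on entry
+//         store64_aligned(dst, aligned_pat);  // four 16-byte aligned stores
+//         dst += 64;
+//         remaining -= 64;
+//     } while (remaining > 64);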
+0: subs r3, #64
+ vst1.32 {q1,q2}, [r0,:128]!
+ vst1.32 {q1,q2}, [r0,:128]!
+ bhi 0b
+
+// The remaining length of the buffer is 64 bytes or less (but the total
+// length of the buffer is at least 64 bytes; otherwise we would have
+// branched to the "short" path). Thus, we can handle the entirety of the
+// remaining buffer with two 32-byte unaligned stores.
+//
+// Again, we need a rotated form of the pattern for these final stores. The
+// net rotation relative to the original pattern is 8*(length%4); rotating
+// the already-aligned pattern in r1 by the low five bits of 8*r3 gives
+// exactly that, because r1 was pre-rotated to compensate for the
+// destination's alignment. We also need to back up the destination pointer
+// so that it points to precisely 64 bytes before the end of the buffer,
+// which we accomplish by adding r3, which contains the remaining length of
+// the buffer minus 64.
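+//
+// Roughly (illustration only; ror32 and "remaining" as above, and
+// store64_unaligned stands in for the two vst1 pairs below):
+//
+//     uint32_t tail_pat = ror32(aligned_pat, 8 * (remaining & 3));
+//     unsigned char *tail = dst + remaining - 64;   // == buffer end - 64
+//     store64_unaligned(tail, tail_pat);            // two 32-byte stores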
+1: lsl ip, r3, #3
+ ror r1, ip
+ vdup.32 q8, r1
+ vmov q9, q8
+ add r0, r3
+ vst1.32 {q8,q9}, [r0]!
+ vst1.32 {q8,q9}, [r0]
+ pop {r7,pc}
+
+L_short4:
+// If we branch here, the buffer is less than 64 bytes long. At this point,
+// register contents are as follows:
+//
+// r0 pointer to the buffer
+// r1 pattern
+// r2 buffer length
+// q0 splatted pattern
+//
+// To begin, we store eight bytes at a time until the remaining length is
+// less than eight bytes.
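+//
+// Roughly, in C (illustration only; store8 stands in for the vst1 of d0
+// below, which writes two copies of the 4-byte pattern, and ror32 is the
+// helper sketched earlier):
+//
+//     while (len >= 8) {               // store two pattern copies at a time
+//         store8(dst, pat);
+//         dst += 8; len -= 8;
+//     }
+//     while (len--) {                  // then finish one byte at a time
+//         *dst++ = (unsigned char)pat; // low byte of pat is the next byte
+//         pat = ror32(pat, 8);
+//     }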
+ subs r3, r2, #8
+ blo 1f
+0: subs r3, #8
+ vst1.32 {d0}, [r0]!
+ bhs 0b
+
+// Then we store one byte at a time, rotating the pattern to get the next
+// byte, until we reach the end of the buffer.
+ add r2, r3, #8
+1: subs r2, #1
+ strbhs r1, [r0],#1
+ ror r1, #8
+ bhi 1b
+ pop {r7,pc}
+
+/******************************************************************************/
+
+.align 4
+_memset_pattern8$VARIANT$Swift:
+// The implementation of this function is substantially identical to that of
+// memset_pattern4. The only differences are in how we rotate the pattern for
+// the purposes of extracting the bytes to store. For clarity, only those
+// differences are commented here; consult memset_pattern4 (above) for
+// a detailed description of the algorithm used.
+ push {r7,lr}
+ mov r7, sp
+ vld1.8 {d0}, [r1]
+ vmov d1, d0
+ subs r3, r2, #64
+ blo L_short8
+
+ bic sp, #0xf // Align stack to 16 bytes and write 32 bytes
+ sub sp, #16 // of pattern to the stack. We will use
+ vst1.8 {q0}, [sp,:128] // unaligned loads from this scratch buffer
+ sub sp, #16 // to get rotated forms of the pattern.
+ vst1.8 {q0}, [sp,:128]
+ and ip, r0, #0x7 // Now generate an unaligned pointer to the
+ rsb ip, ip, #8 // rotated pattern that we need to use for
+ add ip, sp // aligned stores in the main loop.
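+
+// Conceptually (rough C, illustration only): the scratch buffer holds the
+// pattern repeated end to end, so any byte rotation of it is available via
+// a single unaligned load:
+//
+//     unsigned char scratch[32];
+//     for (int j = 0; j < 32; ++j) scratch[j] = pat[j % 8];
+//     // The 16 bytes starting at &scratch[8 - ((uintptr_t)b & 7)] are the
+//     // pattern rotated as needed for stores at 16-byte-aligned addresses.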
+ and lr, r0, #0xf
+ vst1.8 {q0}, [r0]!
+ add r3, lr
+ bic r0, #0xf
+ subs r3, #16
+ blo 1f
+ vld1.8 {q1}, [ip]
+ vmov q2, q1
+0: subs r3, #64
+ vst1.32 {q1,q2}, [r0,:128]!
+ vst1.32 {q1,q2}, [r0,:128]!
+ bhi 0b
+1: and lr, r3, #0x7 // Generate an unaligned pointer to the
+ add ip, lr // rotated pattern to use for cleanup.
+ vld1.8 {q8}, [ip]
+ vmov q9, q8
+ add r0, r3
+ vst1.32 {q8,q9}, [r0]!
+ vst1.32 {q8,q9}, [r0]
+ mov sp, r7 // Restore stack pointer
+ pop {r7,pc}
+
+L_short8:
+ subs r2, #8
+ blo 1f
+0: subs r2, #8
+ vst1.32 {d0}, [r0]!
+ bhs 0b
+1: adds r2, #8
+ beq 3f
+2: vst1.8 {d0[0]}, [r0]! // Store one byte from NEON
+ vext.8 d0, d0, d0, #1 // Use VEXT to rotate pattern
+ subs r2, #1
+ bhi 2b
+3: pop {r7,pc}
+
+/******************************************************************************/
+
+.align 4
+_memset_pattern16$VARIANT$Swift:
+// The implementation of this function is substantially identical to that of
+// memset_pattern4. The only differences are in how we rotate the pattern for
+// the purposes of extracting the bytes to store. For clarity, only those
+// differences are commented here; consult memset_pattern4 (above) for
+// a detailed description of the algorithm used.
+ push {r7,lr}
+ mov r7, sp
+ vld1.8 {q0}, [r1]
+ subs r3, r2, #64
+ blo L_short16
+
+ bic sp, #0xf // Align stack to 16 bytes and write 48 bytes
+ sub sp, #16 // of pattern to the stack. We will use
+ vst1.8 {q0}, [sp,:128] // unaligned loads from this scratch buffer
+ sub sp, #16 // to get rotated forms of the pattern.
+ vst1.8 {q0}, [sp,:128]
+ sub sp, #16
+ vst1.8 {q0}, [sp,:128]
+ and lr, r0, #0xf // Now generate an unaligned pointer to the
+ rsb ip, lr, #16 // rotated pattern that we need to use for
+ add ip, sp // aligned stores in the main loop.
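+// (We write 48 bytes of pattern rather than 32 because the cleanup load
+// below can start as far as 31 bytes into the scratch buffer and reads
+// 16 bytes from that point.)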
+ vst1.8 {q0}, [r0]!
+ add r3, lr
+ bic r0, #0xf
+ subs r3, #16
+ blo 1f
+ vld1.8 {q1}, [ip]
+ vmov q2, q1
+0: subs r3, #64
+ vst1.32 {q1,q2}, [r0,:128]!
+ vst1.32 {q1,q2}, [r0,:128]!
+ bhi 0b
+1: and lr, r3, #0xf // Generate an unaligned pointer to the
+ add ip, lr // rotated pattern to use for cleanup.
+ vld1.8 {q8}, [ip]
+ vmov q9, q8
+ add r0, r3
+ vst1.32 {q8,q9}, [r0]!
+ vst1.32 {q8,q9}, [r0]
+ mov sp, r7 // Restore stack pointer
+ pop {r7,pc}
+
+L_short16:
+ subs r2, #16
+ blo 1f
+0: subs r2, #16
+ vst1.32 {q0}, [r0]!
+ bhs 0b
+1: adds r2, #16
+ beq 3f
+2: vst1.8 {d0[0]}, [r0]! // Store one byte from NEON
+ vext.8 q0, q0, q0, #1 // Use VEXT to rotate pattern
+ subs r2, #1
+ bhi 2b
+3: pop {r7,pc}
+
+#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD