X-Git-Url: https://git.saurik.com/apple/libc.git/blobdiff_plain/7b00c0c43f52e9d27168e67a26aac19065cdb40c..ad3c9f2af814c84582fdd1649e49ec4f68572c5a:/arm/string/memset_pattern_Swift.s

diff --git a/arm/string/memset_pattern_Swift.s b/arm/string/memset_pattern_Swift.s
new file mode 100644
index 0000000..2f843a0
--- /dev/null
+++ b/arm/string/memset_pattern_Swift.s
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2011 Apple Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ *
+ * This file implements the following functions for the Swift micro-arch:
+ *
+ *  void memset_pattern4(void *b, const void *pattern4, size_t len);
+ *  void memset_pattern8(void *b, const void *pattern8, size_t len);
+ *  void memset_pattern16(void *b, const void *pattern16, size_t len);
+ *
+ * The implementation of all three functions is fundamentally the same.
+ * memset_pattern4 is extensively commented to explain the algorithm;
+ * refer to it if you have any questions about the other two.
+ */
+
+#include <arm/arch.h>
+#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD
+
+.syntax unified
+.code 32
+.text
+.globl _memset_pattern4$VARIANT$Swift
+.globl _memset_pattern8$VARIANT$Swift
+.globl _memset_pattern16$VARIANT$Swift
+
+/******************************************************************************/
+
+.align 4
+_memset_pattern4$VARIANT$Swift:
+    push    {r7,lr}
+    mov     r7, sp
+
+// Load the pattern and splat it to q0, then check if the buffer is at least
+// 64 bytes long.  If not, branch to a short-buffer implementation.
+    ldr     r1, [r1]
+    vdup.32 q0, r1
+    subs    r3, r2, #64
+    blo     L_short4
+
+// We want to use aligned vector stores to fill the bulk of the buffer.  In
+// order to make that work, we need to rotate the pattern as necessary to
+// match up with aligned locations, and we also need to extract the alignment
+// of the destination pointer mod 16.
+    lsl     ip, r0, #3
+    and     lr, r0, #0xf        // alignment of destination pointer mod 16
+    rsb     ip, ip, #32         // low five bits contain 32 - 8*(address%4).
+
+// Before we start the aligned stores, we do a single unaligned store of
+// 16 bytes of the pattern to the start of the buffer.  Since the buffer is
+// at least 64 bytes long, this store is known to lie entirely inside the
+// buffer:
+//                                                           first aligned address in buffer
+//                                                           v
+//  ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---
+// ... | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f | 0 | 1 | 2 | ...
+//  ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---
+//       ^
+//       unaligned store starts here:
+//       [ 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 ]
+    vst1.8  {q0}, [r0]!
+
+// Subsequent stores will be aligned, and will start at the first aligned
+// address in the buffer.  We apply the rotation that we calculated before
+// the vector store (in the low five bits of ip) to get the pattern that
+// is to be stored starting at the aligned location.  For example, in the
+// picture above, the buffer had alignment of 3 mod 4, so the rotation to
+// be applied is 32 - 8*3 = 8.  Rotating the pattern right by 8 bits gives
+// us [ 1 2 3 0 ] (remember, we're little-endian), which we see is what
+// needs to be stored starting at the first aligned location.
+//
+// Besides rotating the pattern, we also need to adjust the length (by
+// subtracting 16 - alignment mod 16), and to advance the pointer to the
+// first aligned location.
+    ror     r1, ip              // Pattern to use for aligned memory
+    add     r3, lr
+    bic     r0, #0xf            // destination for first aligned store
+    subs    r3, #16             // updated length
+    blo     1f
+
+// Splat the rotated value across q1 and q2
+    vdup.32 q1, r1
+    vmov    q2, q1
+
+// Main store loop.  We write the splatted aligned pattern across 64 bytes
+// per iteration, terminating the loop when the remaining length of the
+// buffer is 64 bytes or less.
+0:  subs    r3, #64
+    vst1.32 {q1,q2}, [r0,:128]!
+    vst1.32 {q1,q2}, [r0,:128]!
+    bhi     0b
+
+// The remaining length of the buffer is 64 bytes or less (but the total
+// length of the buffer is at least 64 bytes; otherwise we would have
+// branched to the "short" path).  Thus, we can handle the entirety of the
+// remaining buffer with two 32-byte unaligned stores.
+//
+// Again, we need to rotate the pattern to match the alignment, this time
+// by 8*(length%4), and we also need to back up the destination pointer
+// so that it points to precisely 64 bytes before the end of the buffer.
+// We accomplish this by adding r3, which contains the remaining length of
+// the buffer minus 64.
+1:  lsl     ip, r3, #3
+    ror     r1, ip
+    vdup.32 q8, r1
+    vmov    q9, q8
+    add     r0, r3
+    vst1.32 {q8,q9}, [r0]!
+    vst1.32 {q8,q9}, [r0]
+    pop     {r7,pc}
+
+L_short4:
+// If we branch here, the buffer is less than 64 bytes long.  At this point,
+// register contents are as follows:
+//
+//      r0      pointer to the buffer
+//      r1      pattern
+//      r2      buffer length
+//      q0      splatted pattern
+//
+// To begin, we store eight bytes at a time until the remaining length is
+// less than eight bytes.
+    subs    r3, r2, #8
+    blo     1f
+0:  subs    r3, #8
+    vst1.32 {d0}, [r0]!
+    bhs     0b
+
+// Then we store one byte at a time, rotating the pattern to get the next
+// byte, until we reach the end of the buffer.
+    add     r2, r3, #8
+1:  subs    r2, #1
+    strbhs  r1, [r0], #1
+    ror     r1, #8
+    bhi     1b
+    pop     {r7,pc}
+
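+// For illustration only: a rough C sketch of the rotation-based strategy used
+// above.  This sketch is not part of the implementation; the helper names
+// (pattern_at, store16, memset_pattern4_sketch) are invented, it uses 16-byte
+// bulk stores instead of the 64-byte NEON stores above, and it assumes a
+// little-endian machine (as the ARM targets here are).
+//
+//      #include <stdint.h>
+//      #include <stddef.h>
+//      #include <string.h>
+//
+//      // 32-bit pattern that belongs at byte offset 'off' from the start of
+//      // the buffer: the original pattern rotated right by 8*(off mod 4).
+//      static uint32_t pattern_at(uint32_t pat, size_t off)
+//      {
+//          unsigned n = 8 * (off & 3);
+//          return n ? (pat >> n) | (pat << (32 - n)) : pat;
+//      }
+//
+//      // Store the 4-byte word 'w' four times (16 bytes) at p.
+//      static void store16(unsigned char *p, uint32_t w)
+//      {
+//          for (int i = 0; i < 4; ++i)
+//              memcpy(p + 4*i, &w, sizeof w);
+//      }
+//
+//      void memset_pattern4_sketch(void *b, const void *pattern4, size_t len)
+//      {
+//          unsigned char *dst = b, *end = dst + len;
+//          uint32_t pat;
+//          memcpy(&pat, pattern4, sizeof pat);
+//          if (len < 16) {                     // real code: separate short path
+//              for (size_t i = 0; i < len; ++i)
+//                  dst[i] = ((const unsigned char *)pattern4)[i & 3];
+//              return;
+//          }
+//          store16(dst, pat);                                  // unaligned head
+//          unsigned char *p =
+//              (unsigned char *)(((uintptr_t)dst + 16) & ~(uintptr_t)15);
+//          uint32_t rot = pattern_at(pat, (size_t)(p - dst));  // aligned pattern
+//          for (; (size_t)(end - p) > 16; p += 16)             // aligned bulk
+//              store16(p, rot);
+//          store16(end - 16, pattern_at(pat, (size_t)(end - 16 - dst)));  // tail
+//      }
+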
+/******************************************************************************/
+
+.align 4
+_memset_pattern8$VARIANT$Swift:
+// The implementation of this function is substantially identical to that of
+// memset_pattern4.  The only differences are in how we rotate the pattern for
+// the purposes of extracting the bytes to store.  For clarity, only those
+// differences are commented here; consult memset_pattern4 (above) for
+// a detailed description of the algorithm used.
+    push    {r7,lr}
+    mov     r7, sp
+    vld1.8  {d0}, [r1]
+    vmov    d1, d0
+    subs    r3, r2, #64
+    blo     L_short8
+
+    bic     sp, #0xf            // Align stack to 16 bytes and write 32 bytes
+    sub     sp, #16             // of pattern to the stack.  We will use
+    vst1.8  {q0}, [sp,:128]     // unaligned loads from this scratch buffer
+    sub     sp, #16             // to get rotated forms of the pattern.
+    vst1.8  {q0}, [sp,:128]
+    and     ip, r0, #0x7        // Now generate an unaligned pointer to the
+    rsb     ip, ip, #8          // rotated pattern that we need to use for
+    add     ip, sp              // aligned stores in the main loop.
+    and     lr, r0, #0xf
+    vst1.8  {q0}, [r0]!
+    add     r3, lr
+    bic     r0, #0xf
+    subs    r3, #16
+    blo     1f
+    vld1.8  {q1}, [ip]
+    vmov    q2, q1
+0:  subs    r3, #64
+    vst1.32 {q1,q2}, [r0,:128]!
+    vst1.32 {q1,q2}, [r0,:128]!
+    bhi     0b
+1:  and     lr, r3, #0x7        // Generate an unaligned pointer to the
+    add     ip, lr              // rotated pattern to use for cleanup.
+    vld1.8  {q8}, [ip]
+    vmov    q9, q8
+    add     r0, r3
+    vst1.32 {q8,q9}, [r0]!
+    vst1.32 {q8,q9}, [r0]
+    mov     sp, r7              // Restore stack pointer
+    pop     {r7,pc}
+
+L_short8:
+    subs    r2, #8
+    blo     1f
+0:  subs    r2, #8
+    vst1.32 {d0}, [r0]!
+    bhs     0b
+1:  adds    r2, #8
+    beq     3f
+2:  vst1.8  {d0[0]}, [r0]!      // Store one byte from NEON
+    vext.8  d0, d0, d0, #1      // Use VEXT to rotate pattern
+    subs    r2, #1
+    bhi     2b
+3:  pop     {r7,pc}
+
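+// For illustration only: the scratch-buffer trick used by memset_pattern8
+// (and by memset_pattern16 below), expressed as a rough C sketch.  It is not
+// part of the implementation and the names (rotated_pattern, scratch) are
+// invented.  The idea: replicate the pattern into a small aligned buffer,
+// then a single unaligned 16-byte read at offset (period - address%period)
+// yields the pattern already rotated for an aligned destination address.
+// Writing 32 bytes (48 for the 16-byte pattern) guarantees that this read,
+// and the further-offset read used for the cleanup stores, stay in bounds.
+//
+//      #include <stdint.h>
+//      #include <stddef.h>
+//      #include <string.h>
+//
+//      // Compute the 16 bytes that belong at a 16-byte-aligned address in a
+//      // buffer that started at 'base', for a pattern of 'period' bytes
+//      // (8 or 16 here; 'period' must divide 16).
+//      static void rotated_pattern(unsigned char out16[16],
+//                                  const unsigned char *pattern,
+//                                  size_t period, uintptr_t base)
+//      {
+//          unsigned char scratch[48];                  // >= 16 + period bytes
+//          for (size_t i = 0; i < sizeof scratch; ++i)
+//              scratch[i] = pattern[i % period];       // replicate the pattern
+//          size_t off = period - (base % period);      // 1 .. period
+//          memcpy(out16, scratch + off, 16);           // the "unaligned load"
+//      }
+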
+/******************************************************************************/
+
+.align 4
+_memset_pattern16$VARIANT$Swift:
+// The implementation of this function is substantially identical to that of
+// memset_pattern4.  The only differences are in how we rotate the pattern for
+// the purposes of extracting the bytes to store.  For clarity, only those
+// differences are commented here; consult memset_pattern4 (above) for
+// a detailed description of the algorithm used.
+    push    {r7,lr}
+    mov     r7, sp
+    vld1.8  {q0}, [r1]
+    subs    r3, r2, #64
+    blo     L_short16
+
+    bic     sp, #0xf            // Align stack to 16 bytes and write 48 bytes
+    sub     sp, #16             // of pattern to the stack.  We will use
+    vst1.8  {q0}, [sp,:128]     // unaligned loads from this scratch buffer
+    sub     sp, #16             // to get rotated forms of the pattern.
+    vst1.8  {q0}, [sp,:128]
+    sub     sp, #16
+    vst1.8  {q0}, [sp,:128]
+    and     lr, r0, #0xf        // Now generate an unaligned pointer to the
+    rsb     ip, lr, #16         // rotated pattern that we need to use for
+    add     ip, sp              // aligned stores in the main loop.
+    vst1.8  {q0}, [r0]!
+    add     r3, lr
+    bic     r0, #0xf
+    subs    r3, #16
+    blo     1f
+    vld1.8  {q1}, [ip]
+    vmov    q2, q1
+0:  subs    r3, #64
+    vst1.32 {q1,q2}, [r0,:128]!
+    vst1.32 {q1,q2}, [r0,:128]!
+    bhi     0b
+1:  and     lr, r3, #0xf        // Generate an unaligned pointer to the
+    add     ip, lr              // rotated pattern to use for cleanup.
+    vld1.8  {q8}, [ip]
+    vmov    q9, q8
+    add     r0, r3
+    vst1.32 {q8,q9}, [r0]!
+    vst1.32 {q8,q9}, [r0]
+    mov     sp, r7              // Restore stack pointer
+    pop     {r7,pc}
+
+L_short16:
+    subs    r2, #16
+    blo     1f
+0:  subs    r2, #16
+    vst1.32 {q0}, [r0]!
+    bhs     0b
+1:  adds    r2, #16
+    beq     3f
+2:  vst1.8  {d0[0]}, [r0]!      // Store one byte from NEON
+    vext.8  q0, q0, q0, #1      // Use VEXT to rotate pattern
+    subs    r2, #1
+    bhi     2b
+3:  pop     {r7,pc}
+
+#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD
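+
+// For illustration only: these routines back the public memset_pattern4/8/16
+// functions whose prototypes appear in the header comment at the top of this
+// file (declared in <string.h> on Darwin).  A minimal, hypothetical caller:
+//
+//      #include <string.h>
+//
+//      void fill_with_abab(char *buf, size_t len)
+//      {
+//          static const char pat[4] = { 'a', 'b', 'a', 'b' };
+//          memset_pattern4(buf, pat, len);     // buf[i] == pat[i % 4]
+//      }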