X-Git-Url: https://git.saurik.com/apple/libc.git/blobdiff_plain/fbd86d4cc20b02a10edcca92fb7ae0a143e63cc4..1f2f436a38f7ae2d39a943ad2898d8fed4ed2e58:/x86_64/string/memset.s?ds=sidebyside

diff --git a/x86_64/string/memset.s b/x86_64/string/memset.s
index 423db2f..e79c214 100644
--- a/x86_64/string/memset.s
+++ b/x86_64/string/memset.s
@@ -21,7 +21,7 @@
  */
 
 #include <machine/cpu_capabilities.h>
- 
+
 
 /* This file contains the following functions:
  *
@@ -31,7 +31,7 @@
  *      void    memset_pattern16(void *b, const void *c16, size_t len);
  *
  * Calls of memset() with c==0 are routed to the bzero() routine.  Most of the
- * others go to _COMM_PAGE_MEMSET_PATTERN, which is entered as follows:
+ * others go to _memset_pattern, which is entered as follows:
  *      %rdi = ptr to memory to set (aligned)
  *      %edx = length (which can be short, though we bias in favor of long operands)
  *      %xmm0 = the pattern to store
@@ -41,24 +41,23 @@
  * NB: we avoid "stos" family of instructions (stosl, stosb), as they are very slow
  * on P4s and probably other processors.
  */
- 
- #define kShort 255              // for nonzero memset(), too short for commpage
- 
- 
+
+#define kShort  255             // for nonzero memset(), too short for commpage
+
+
 
         .text
         .globl _memset
         .align 2
 _memset:                                // void *memset(void *b, int c, size_t len);
         andl    $0xFF,%esi              // (c==0) ?
         jnz     LNonzero                // not a bzero
- 
-        movq    $(_COMM_PAGE_BZERO),%rax// map memset(p,0,n) into bzero(p,n)
+
         movq    %rdx,%rsi               // put count where bzero() expects it
-        jmp     *%rax                   // enter commpage
+        jmp     _bzero                  // enter _bzero
+
+// Handle memset of a nonzero value.
- 
-// Handle memset of a nonzero value.
- 
 LNonzero:
         movq    %rdi,%r8                // preserve the original pointer so we can return it
         movl    %esi,%eax               // replicate byte in %esi into all four bytes
@@ -69,12 +68,12 @@ LNonzero:
         orl     %esi,%eax               // now %eax has "c" in all 4 bytes
         cmpq    $(kShort),%rdx          // is operand too short for SSE?
         ja      LCallCommpage           // no
- 
-// Nonzero memset() too short to call commpage.
-//      %eax = replicated 4-byte pattern
-//      %rdi = ptr
-//      %edx = length (<= kShort)
- 
+
+        // Nonzero memset() too short to call commpage.
+        //      %eax = replicated 4-byte pattern
+        //      %rdi = ptr
+        //      %edx = length (<= kShort)
+
         cmpl    $16,%edx                // long enough to word align?
         jge     3f                      // yes
         test    %edx,%edx               // length==0?
@@ -109,12 +108,12 @@ LNonzero:
 6:
         movq    %r8,%rax                // get return value (ie, original ptr)
         ret
- 
-// Nonzero memset() is long enough to call commpage.
-//      %eax = replicated 4-byte pattern
-//      %rdi = ptr
-//      %rdx = length (> kShort)
- 
+
+        // Nonzero memset() is long enough to call commpage.
+        //      %eax = replicated 4-byte pattern
+        //      %rdi = ptr
+        //      %rdx = length (> kShort)
+
 LCallCommpage:
         movd    %eax,%xmm0              // move %eax to low 4 bytes of %xmm0
         pshufd  $(0x00),%xmm0,%xmm0     // replicate across the vector
@@ -129,14 +128,13 @@ LCallCommpage:
         subl    $1,%ecx
         jnz     1b
 2:                                      // ptr aligned, length long enough to justify
-        movq    $(_COMM_PAGE_MEMSET_PATTERN),%rax
-        call    *%rax                   // call commpage to do the heavy lifting
+        call    Lmemset_pattern         // call commpage to do the heavy lifting
         movq    %r8,%rax                // get return value (ie, original ptr)
         ret
 
 
-// Handle memset of a 16-byte pattern.
- 
+        // Handle memset of a 16-byte pattern.
+
         .globl _memset_pattern16
         .align 2, 0x90
 _memset_pattern16:                      // void memset_pattern16(void *b, const void *c16, size_t len);
@@ -144,8 +142,8 @@ _memset_pattern16:  // void memset_pattern16(void *b, const void *c16, size_t l
         jmp     LAlignPtr
 
 
-// Handle memset of an 8-byte pattern.
- 
+        // Handle memset of an 8-byte pattern.
+
         .globl _memset_pattern8
         .align 2, 0x90
 _memset_pattern8:                       // void memset_pattern8(void *b, const void *c8, size_t len)
@@ -153,8 +151,8 @@ _memset_pattern8:   // void memset_pattern8(void *b, const void *c8, size_t len)
         punpcklqdq %xmm0,%xmm0          // replicate into all 16
         jmp     LAlignPtr
 
-// Handle memset of a 4-byte pattern.
- 
+        // Handle memset of a 4-byte pattern.
+
         .globl _memset_pattern4
         .align 2, 0x90
 _memset_pattern4:                       // void memset_pattern4(void *b, const void *c4, size_t len)
@@ -162,13 +160,13 @@ _memset_pattern4:  // void memset_pattern4(void *b, const void *c4, size_t len)
         pshufd  $(0x00),%xmm0,%xmm0     // replicate the 4 bytes across the vector
 
 
-// Align ptr if necessary. We must rotate the pattern right for each byte we
-// store while aligning the ptr. Since there is no rotate instruction in SSE3,
-// we have to synthesize the rotates.
-//      %rdi = ptr
-//      %rdx = length
-//      %xmm0 = pattern
- 
+        // Align ptr if necessary. We must rotate the pattern right for each byte we
+        // store while aligning the ptr. Since there is no rotate instruction in SSE3,
+        // we have to synthesize the rotates.
+        //      %rdi = ptr
+        //      %rdx = length
+        //      %xmm0 = pattern
+
 LAlignPtr:                              // NB: can drop down to here!
         cmpq    $100,%rdx               // long enough to bother aligning ptr?
         movq    %rdi,%rcx               // copy ptr
@@ -177,7 +175,7 @@ LAlignPtr:          // NB: can drop down to here!
         andl    $15,%ecx                // get #bytes to align ptr
         jz      LReady                  // already aligned
         subq    %rcx,%rdx               // adjust length
- 
+
         test    $1,%cl                  // 1-byte store required?
         movd    %xmm0,%eax              // get 4 low bytes in %eax
         jz      2f                      // no
@@ -207,10 +205,138 @@ LAlignPtr:         // NB: can drop down to here!
         movq    %xmm0,(%rdi)            // store low 8 bytes of %xmm0
         pshufd  $(0x4e),%xmm0,%xmm0     // rotate %xmm0 right 8 bytes (mask == 01 00 11 10)
         addq    $8,%rdi                 // adjust ptr
- 
-// Ptr is aligned if practical, we're ready to call commpage to do the heavy lifting.
+
+        // Ptr is aligned if practical, we're ready to call commpage to do the heavy lifting.
 LReady:
-        movq    $(_COMM_PAGE_MEMSET_PATTERN),%rax
-        call    *%rax                   // call commpage to do the heavy lifting
+        call    Lmemset_pattern         // call commpage to do the heavy lifting
         ret
+
+
+#define kLShort         63
+#define kVeryLong       (1024*1024)
+
+Lmemset_pattern:
+        cmpq    $(kLShort),%rdx         // long enough to bother aligning?
+        ja      LNotShort               // yes
+        jmp     LShort                  // no
+
+        // Here for short operands or the end of long ones.
+        //      %rdx = length (<= kLShort)
+        //      %rdi = ptr (may not be not aligned)
+        //      %xmm0 = pattern
+
+LUnalignedStore16:
+        movdqu  %xmm0,(%rdi)            // stuff in another 16 bytes
+        subl    $16,%edx
+        addq    $16,%rdi
+LShort:
+        cmpl    $16,%edx                // room for another vector?
+        jge     LUnalignedStore16       // yes
+LLessThan16:                            // here at end of copy with < 16 bytes remaining
+        test    $8,%dl                  // 8-byte store required?
+        jz      2f                      // no
+        movq    %xmm0,(%rdi)            // pack in 8 low bytes
+        psrldq  $8,%xmm0                // then shift vector down 8 bytes
+        addq    $8,%rdi
+2:
+        test    $4,%dl                  // 4-byte store required?
+        jz      3f                      // no
+        movd    %xmm0,(%rdi)            // pack in 4 low bytes
+        psrldq  $4,%xmm0                // then shift vector down 4 bytes
+        addq    $4,%rdi
+3:
+        andl    $3,%edx                 // more to go?
+        jz      5f                      // no
+        movd    %xmm0,%eax              // move remainders out into %eax
+4:                                      // loop on up to three bytes
+        movb    %al,(%rdi)              // pack in next byte
+        shrl    $8,%eax                 // shift next byte into position
+        incq    %rdi
+        dec     %edx
+        jnz     4b
+5:      ret
+
+// Long enough to justify aligning ptr. Note that we have to rotate the
+// pattern to account for any alignment. We do this by doing two unaligned
+// stores, and then an aligned load from the middle of the two stores.
+// This will stall on store forwarding alignment mismatch, and the unaligned
+// stores can be pretty slow too, but the alternatives aren't any better.
+// Fortunately, in most cases our caller has already aligned the ptr.
+//      %rdx = length (> kLShort)
+//      %rdi = ptr (may not be aligned)
+//      %xmm0 = pattern
+
+LNotShort:
+        movl    %edi,%ecx               // copy low bits of dest ptr
+        negl    %ecx
+        andl    $15,%ecx                // mask down to #bytes to 16-byte align
+        jz      LAligned                // skip if already aligned
+        movdqu  %xmm0,(%rdi)            // store 16 unaligned bytes
+        movdqu  %xmm0,16(%rdi)          // and 16 more, to be sure we have an aligned chunk
+        addq    %rcx,%rdi               // now point to the aligned chunk
+        subq    %rcx,%rdx               // adjust remaining count
+        movdqa  (%rdi),%xmm0            // get the rotated pattern (probably stalling)
+        addq    $16,%rdi                // skip past the aligned chunk
+        subq    $16,%rdx
+
+// Set up for 64-byte loops.
+//      %rdx = length remaining
+//      %rdi = ptr (aligned)
+//      %xmm0 = rotated pattern
+
+LAligned:
+        movq    %rdx,%rcx               // copy length remaining
+        andl    $63,%edx                // mask down to residual length (0..63)
+        andq    $-64,%rcx               // %ecx <- #bytes we will zero in by-64 loop
+        jz      LNoMoreChunks           // no 64-byte chunks
+        addq    %rcx,%rdi               // increment ptr by length to move
+        cmpq    $(kVeryLong),%rcx       // long enough to justify non-temporal stores?
+        jge     LVeryLong               // yes
+        negq    %rcx                    // negate length to move
+        jmp     1f
+
+// Loop over 64-byte chunks, storing into cache.
+
+        .align  4,0x90                  // keep inner loops 16-byte aligned
+1:
+        movdqa  %xmm0,(%rdi,%rcx)
+        movdqa  %xmm0,16(%rdi,%rcx)
+        movdqa  %xmm0,32(%rdi,%rcx)
+        movdqa  %xmm0,48(%rdi,%rcx)
+        addq    $64,%rcx
+        jne     1b
+
+        jmp     LNoMoreChunks
+
+// Very long operands: use non-temporal stores to bypass cache.
+
+LVeryLong:
+        negq    %rcx                    // negate length to move
+        jmp     1f
+
+        .align  4,0x90                  // keep inner loops 16-byte aligned
+1:
+        movntdq %xmm0,(%rdi,%rcx)
+        movntdq %xmm0,16(%rdi,%rcx)
+        movntdq %xmm0,32(%rdi,%rcx)
+        movntdq %xmm0,48(%rdi,%rcx)
+        addq    $64,%rcx
+        jne     1b
+
+        sfence                          // required by non-temporal stores
+        jmp     LNoMoreChunks
+
+// Handle leftovers: loop by 16.
+//      %edx = length remaining (<64)
+//      %edi = ptr (aligned)
+//      %xmm0 = rotated pattern
+
+LLoopBy16:
+        movdqa  %xmm0,(%rdi)            // pack in 16 more bytes
+        subl    $16,%edx                // decrement count
+        addq    $16,%rdi                // increment ptr
+LNoMoreChunks:
+        cmpl    $16,%edx                // more to go?
+        jge     LLoopBy16               // yes
+        jmp     LLessThan16             // handle up to 15 remaining bytes
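
The comments in the added Lmemset_pattern code describe the key trick: when the destination is not 16-byte aligned, the pattern itself is rotated (two unaligned stores, then one aligned load that straddles them) so that the later aligned stores still lay bytes down in phase with the start of the buffer. Below is a minimal C sketch of that idea and of the memset_pattern16() semantics it preserves. It is an illustration only, not the shipped implementation; the helper names rotate_pattern and memset_pattern16_ref are invented for this example.

/* Sketch of the pattern-rotation trick and the memset_pattern16() contract. */
#include <stdio.h>
#include <string.h>
#include <stddef.h>

/* Reference behavior of memset_pattern16(): byte i of the destination gets
 * byte (i mod 16) of the pattern, regardless of destination alignment. */
static void memset_pattern16_ref(void *b, const void *c16, size_t len)
{
    unsigned char *dst = b;
    const unsigned char *pat = c16;
    for (size_t i = 0; i < len; i++)
        dst[i] = pat[i & 15];
}

/* Model of the two-unaligned-stores / one-aligned-load rotation at LNotShort.
 * "align" is the number of bytes needed to 16-byte align the destination,
 * i.e. the value computed into %rcx (0..15). */
static void rotate_pattern(unsigned char out[16],
                           const unsigned char pat[16], size_t align)
{
    unsigned char scratch[32];
    memcpy(scratch, pat, 16);           /* movdqu %xmm0,(%rdi)              */
    memcpy(scratch + 16, pat, 16);      /* movdqu %xmm0,16(%rdi)            */
    memcpy(out, scratch + align, 16);   /* movdqa (%rdi),%xmm0 after the
                                           addq %rcx,%rdi adjustment        */
}

int main(void)
{
    const unsigned char pat[16] = "ABCDEFGHIJKLMNOP";
    unsigned char rot[16];
    unsigned char buf[40];

    rotate_pattern(rot, pat, 5);              /* dst needed 5 bytes to align */
    printf("%.16s\n", (const char *)rot);     /* prints FGHIJKLMNOPABCDE     */

    memset_pattern16_ref(buf, pat, sizeof buf);
    printf("%.40s\n", (const char *)buf);     /* pattern repeated 2.5 times  */
    return 0;
}

The store-forwarding stall the comments mention comes from the aligned movdqa reload reading across the two overlapping movdqu stores; the code accepts that one-time cost because it lets the 64-byte inner loops run with plain aligned stores and no per-iteration rotate, and the _memset_pattern* entry points already try to hand over an aligned pointer via LAlignPtr.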