*/
#include <machine/cpu_capabilities.h>
-
+
/* This file contains the following functions:
*
* void *memset(void *b, int c, size_t len);
* void memset_pattern4(void *b, const void *c4, size_t len);
* void memset_pattern8(void *b, const void *c8, size_t len);
* void memset_pattern16(void *b, const void *c16, size_t len);
*
* Calls of memset() with c==0 are routed to the bzero() routine. Most of the
- * others go to _COMM_PAGE_MEMSET_PATTERN, which is entered as follows:
+ * others go to Lmemset_pattern, which is entered as follows:
* %rdi = ptr to memory to set (aligned)
* %edx = length (which can be short, though we bias in favor of long operands)
* %xmm0 = the pattern to store
* NB: we avoid "stos" family of instructions (stosl, stosb), as they are very slow
* on P4s and probably other processors.
*/
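+
+// Illustrative use of the pattern routines from C (a sketch; `buf' and
+// `pat' are hypothetical names, not part of this file):
+//
+//	#include <string.h>
+//	#include <stdint.h>
+//	uint32_t pat = 0xDEADBEEF;
+//	char buf[64];
+//	memset_pattern4(buf, &pat, sizeof(buf)); // tile the 4 pattern bytes across buf
+//
+// If len is not a multiple of the pattern size, the final copy of the
+// pattern is truncated.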
-
- #define kShort 255 // for nonzero memset(), too short for commpage
-
-
+
+#define kShort 255 // for nonzero memset(), too short for Lmemset_pattern
+
+
.text
.globl _memset
.align 2
_memset: // void *memset(void *b, int c, size_t len);
andl $0xFF,%esi // (c==0) ?
jnz LNonzero // not a bzero
-
- movq $(_COMM_PAGE_BZERO),%rax // map memset(p,0,n) into bzero(p,n)
+
movq %rdx,%rsi // put count where bzero() expects it
- jmp *%rax // enter commpage
+ jmp _bzero // map memset(p,0,n) into bzero(p,n)
+
+// Handle memset of a nonzero value.
- // Handle memset of a nonzero value.
-
LNonzero:
movq %rdi,%r8 // preserve the original pointer so we can return it
movl %esi,%eax // replicate byte in %esi into all four bytes
shll $8,%esi
orl %esi,%eax // now %ax has "c" in both bytes
movl %eax,%esi
shll $16,%esi
orl %esi,%eax // now %eax has "c" in all 4 bytes
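+// (Illustrative: c == 0x5A leaves %eax == 0x5A5A5A5A.)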
cmpq $(kShort),%rdx // is operand too short for SSE?
ja LCallCommpage // no
-
-// Nonzero memset() too short to call commpage.
-// %eax = replicated 4-byte pattern
-// %rdi = ptr
-// %edx = length (<= kShort)
-
+
+ // Nonzero memset() too short to call Lmemset_pattern.
+ // %eax = replicated 4-byte pattern
+ // %rdi = ptr
+ // %edx = length (<= kShort)
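+ // (For example, len == 10 stores ten single bytes; len == 40 stores
+ // bytes until the ptr is doubleword aligned, then doublewords, then
+ // mops up the leftover bytes. See the code below.)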
+
cmpl $16,%edx // long enough to word align?
jge 3f // yes
test %edx,%edx // length==0?
jz 6f // yes, done
1: // loop storing single bytes
movb %al,(%rdi) // pack in a byte
addq $1,%rdi
subl $1,%edx
jnz 1b
jmp 6f
3: // store bytes until ptr is doubleword aligned
movb %al,(%rdi) // pack in a byte
addq $1,%rdi
subl $1,%edx
test $3,%edi // is ptr doubleword aligned?
jnz 3b // no
movl %edx,%ecx // copy length
shrl $2,%edx // #doublewords to store
4:
movl %eax,(%rdi) // store aligned doubleword
addq $4,%rdi
subl $1,%edx
jnz 4b
andl $3,%ecx // any leftover bytes?
jz 6f // no
5:
movb %al,(%rdi) // pack in a byte
addq $1,%rdi
subl $1,%ecx
jnz 5b
6:
movq %r8,%rax // get return value (ie, original ptr)
ret
-
-// Nonzero memset() is long enough to call commpage.
-// %eax = replicated 4-byte pattern
-// %rdi = ptr
-// %rdx = length (> kShort)
-
+
+ // Nonzero memset() is long enough to call Lmemset_pattern.
+ // %eax = replicated 4-byte pattern
+ // %rdi = ptr
+ // %rdx = length (> kShort)
+
LCallCommpage:
movd %eax,%xmm0 // move %eax to low 4 bytes of %xmm0
pshufd $(0x00),%xmm0,%xmm0 // replicate across the vector
movq %rdi,%rcx // copy dest ptr
negl %ecx
andl $15,%ecx // get #bytes to align ptr
jz 2f // skip if already aligned
subq %rcx,%rdx // decrement length
1: // loop storing bytes until ptr is aligned
movb %al,(%rdi) // pack in a byte
addq $1,%rdi
subl $1,%ecx
jnz 1b
2: // ptr aligned, length long enough to justify calling Lmemset_pattern
- movq $(_COMM_PAGE_MEMSET_PATTERN),%rax
- call *%rax // call commpage to do the heavy lifting
+ call Lmemset_pattern // do the heavy lifting
movq %r8,%rax // get return value (ie, original ptr)
ret
-// Handle memset of a 16-byte pattern.
-
+ // Handle memset of a 16-byte pattern.
+
.globl _memset_pattern16
.align 2, 0x90
_memset_pattern16: // void memset_pattern16(void *b, const void *c16, size_t len);
movdqu (%rsi),%xmm0 // load the pattern
jmp LAlignPtr
-// Handle memset of an 8-byte pattern.
-
+ // Handle memset of an 8-byte pattern.
+
.globl _memset_pattern8
.align 2, 0x90
_memset_pattern8: // void memset_pattern8(void *b, const void *c8, size_t len);
movq (%rsi),%xmm0 // load pattern into low 8 bytes
punpcklqdq %xmm0,%xmm0 // replicate into all 16
jmp LAlignPtr
-// Handle memset of a 4-byte pattern.
-
+ // Handle memset of a 4-byte pattern.
+
.globl _memset_pattern4
.align 2, 0x90
_memset_pattern4: // void memset_pattern4(void *b, const void *c4, size_t len);
movd (%rsi),%xmm0 // load pattern into low 4 bytes
pshufd $(0x00),%xmm0,%xmm0 // replicate the 4 bytes across the vector
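+// (Illustrative: c4 == "ABCD" leaves %xmm0 == "ABCDABCDABCDABCD".)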
-// Align ptr if necessary. We must rotate the pattern right for each byte we
-// store while aligning the ptr. Since there is no rotate instruction in SSE3,
-// we have to synthesize the rotates.
-// %rdi = ptr
-// %rdx = length
-// %xmm0 = pattern
-
+ // Align ptr if necessary. We must rotate the pattern right for each byte we
+ // store while aligning the ptr. Since there is no rotate instruction in SSE3,
+ // we have to synthesize the rotates.
+ // %rdi = ptr
+ // %rdx = length
+ // %xmm0 = pattern
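+ // For example, with pattern "ABCD..." and a ptr one byte shy of
+ // alignment, we store "A", rotate the vector right one byte so its low
+ // byte is "B", and the subsequent aligned stores continue the pattern
+ // seamlessly: "A" + "BCDA BCDA ..." == "ABCD ABCD ...".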
+
LAlignPtr: // NB: can drop down to here!
cmpq $100,%rdx // long enough to bother aligning ptr?
movq %rdi,%rcx // copy ptr
jb LReady // not long enough
negl %ecx // start to compute #bytes to align ptr
andl $15,%ecx // get #bytes to align ptr
jz LReady // already aligned
subq %rcx,%rdx // adjust length
-
+
test $1,%cl // 1-byte store required?
movd %xmm0,%eax // get 4 low bytes in %eax
jz 2f // no
movdqa %xmm0,%xmm1 // copy pattern so we can shift in both directions
movb %al,(%rdi) // pack in the low-order byte
psrldq $1,%xmm1 // shift pattern right 1 byte
addq $1,%rdi
pslldq $15,%xmm0 // shift pattern left 15 bytes
shrl $8,%eax // in case 2-byte store is required
por %xmm1,%xmm0 // merge into the rotated pattern
2:
test $2,%cl // 2-byte store required?
jz 4f // no
psrldq $2,%xmm0 // shift pattern down 2 bytes
movw %ax,(%rdi) // pack in next two bytes
pinsrw $7,%eax,%xmm0 // insert low word of %eax into high word of %xmm0
shrl $16,%eax // in case 4-byte store is required
addq $2,%rdi // adjust ptr
4:
test $4,%cl // 4-byte store required?
jz 8f // no
movd %xmm0,(%rdi) // store low 4 bytes of %xmm0
pshufd $(0x39),%xmm0,%xmm0 // rotate %xmm0 right 4 bytes (mask == 00 11 10 01)
addq $4,%rdi // adjust ptr
8:
test $8,%cl // 8-byte store required?
jz LReady // no
movq %xmm0,(%rdi) // store low 8 bytes of %xmm0
pshufd $(0x4e),%xmm0,%xmm0 // rotate %xmm0 right 8 bytes (mask == 01 00 11 10)
addq $8,%rdi // adjust ptr
-
-// Ptr is aligned if practical, we're ready to call commpage to do the heavy lifting.
+
+ // Ptr is aligned if practical; we're ready to call Lmemset_pattern to do the heavy lifting.
LReady:
- movq $(_COMM_PAGE_MEMSET_PATTERN),%rax
- call *%rax // call commpage to do the heavy lifting
+ call Lmemset_pattern // do the heavy lifting
ret
+
+
+#define kLShort 63
+#define kVeryLong (1024*1024)
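+
+// kLShort: at 63 bytes or less, aligning the ptr isn't worth the setup.
+// kVeryLong: past 1MB (roughly cache-sized; the exact cutoff is a tuning
+// choice) we switch to non-temporal stores so a huge operand doesn't
+// flush the caches.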
+
+Lmemset_pattern:
+ cmpq $(kLShort),%rdx // long enough to bother aligning?
+ ja LNotShort // yes
+ jmp LShort // no
+
+ // Here for short operands or the end of long ones.
+ // %rdx = length (<= kLShort)
+ // %rdi = ptr (may not be aligned)
+ // %xmm0 = pattern
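+ // (Worked example: len == 29 stores one unaligned 16-byte vector at
+ // LShort, then falls into LLessThan16, which stores 8 + 4 + 1 bytes,
+ // shifting the pattern down after each store so consecutive pattern
+ // bytes are emitted: 16+8+4+1 == 29.)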
+
+LUnalignedStore16:
+ movdqu %xmm0,(%rdi) // stuff in another 16 bytes
+ subl $16,%edx
+ addq $16,%rdi
+LShort:
+ cmpl $16,%edx // room for another vector?
+ jge LUnalignedStore16 // yes
+LLessThan16: // here at end of copy with < 16 bytes remaining
+ test $8,%dl // 8-byte store required?
+ jz 2f // no
+ movq %xmm0,(%rdi) // pack in 8 low bytes
+ psrldq $8,%xmm0 // then shift vector down 8 bytes
+ addq $8,%rdi
+2:
+ test $4,%dl // 4-byte store required?
+ jz 3f // no
+ movd %xmm0,(%rdi) // pack in 4 low bytes
+ psrldq $4,%xmm0 // then shift vector down 4 bytes
+ addq $4,%rdi
+3:
+ andl $3,%edx // more to go?
+ jz 5f // no
+ movd %xmm0,%eax // move remainders out into %eax
+4: // loop on up to three bytes
+ movb %al,(%rdi) // pack in next byte
+ shrl $8,%eax // shift next byte into position
+ incq %rdi
+ dec %edx
+ jnz 4b
+5: ret
+
+// Long enough to justify aligning ptr. Note that we have to rotate the
+// pattern to account for any alignment. We do this by doing two unaligned
+// stores, and then an aligned load from the middle of the two stores.
+// This will stall on store forwarding alignment mismatch, and the unaligned
+// stores can be pretty slow too, but the alternatives aren't any better.
+// Fortunately, in most cases our caller has already aligned the ptr.
+// %rdx = length (> kLShort)
+// %rdi = ptr (may not be aligned)
+// %xmm0 = pattern
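+//
+// (Worked example: if %rdi % 16 == 12, then %rcx == 4. The two movdqu
+// stores cover %rdi..%rdi+31, and the movdqa load from %rdi+4, which is
+// now 16-byte aligned, picks up pattern bytes 4..15 followed by 0..3,
+// i.e. the pattern pre-rotated to match the aligned ptr.)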
+
+LNotShort:
+ movl %edi,%ecx // copy low bits of dest ptr
+ negl %ecx
+ andl $15,%ecx // mask down to #bytes to 16-byte align
+ jz LAligned // skip if already aligned
+ movdqu %xmm0,(%rdi) // store 16 unaligned bytes
+ movdqu %xmm0,16(%rdi) // and 16 more, to be sure we have an aligned chunk
+ addq %rcx,%rdi // now point to the aligned chunk
+ subq %rcx,%rdx // adjust remaining count
+ movdqa (%rdi),%xmm0 // get the rotated pattern (probably stalling)
+ addq $16,%rdi // skip past the aligned chunk
+ subq $16,%rdx
+
+// Set up for 64-byte loops.
+// %rdx = length remaining
+// %rdi = ptr (aligned)
+// %xmm0 = rotated pattern
+
+LAligned:
+ movq %rdx,%rcx // copy length remaining
+ andl $63,%edx // mask down to residual length (0..63)
+ andq $-64,%rcx // %rcx <- #bytes we will store in the by-64 loop
+ jz LNoMoreChunks // no 64-byte chunks
+ addq %rcx,%rdi // increment ptr by length to move
+ cmpq $(kVeryLong),%rcx // long enough to justify non-temporal stores?
+ jge LVeryLong // yes
+ negq %rcx // negate length to move
+ jmp 1f
+
+// Loop over 64-byte chunks, storing into cache.
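+// (%rcx runs from -(#bytes in the chunks) up to zero, so the stores index
+// forward from the pre-advanced %rdi and the add's zero flag terminates
+// the loop without a separate compare.)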
+
+ .align 4,0x90 // keep inner loops 16-byte aligned
+1:
+ movdqa %xmm0,(%rdi,%rcx)
+ movdqa %xmm0,16(%rdi,%rcx)
+ movdqa %xmm0,32(%rdi,%rcx)
+ movdqa %xmm0,48(%rdi,%rcx)
+ addq $64,%rcx
+ jne 1b
+
+ jmp LNoMoreChunks
+
+// Very long operands: use non-temporal stores to bypass cache.
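+// (movntdq stores write-combine and go straight to memory rather than
+// pulling lines into the cache, so a huge memset doesn't evict the
+// caller's working set; the sfence below orders these weakly-ordered
+// stores before we return.)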
+
+LVeryLong:
+ negq %rcx // negate length to move
+ jmp 1f
+
+ .align 4,0x90 // keep inner loops 16-byte aligned
+1:
+ movntdq %xmm0,(%rdi,%rcx)
+ movntdq %xmm0,16(%rdi,%rcx)
+ movntdq %xmm0,32(%rdi,%rcx)
+ movntdq %xmm0,48(%rdi,%rcx)
+ addq $64,%rcx
+ jne 1b
+
+ sfence // required by non-temporal stores
+ jmp LNoMoreChunks
+
+// Handle leftovers: loop by 16.
+// %edx = length remaining (<64)
+// %rdi = ptr (aligned)
+// %xmm0 = rotated pattern
+
+LLoopBy16:
+ movdqa %xmm0,(%rdi) // pack in 16 more bytes
+ subl $16,%edx // decrement count
+ addq $16,%rdi // increment ptr
+LNoMoreChunks:
+ cmpl $16,%edx // more to go?
+ jge LLoopBy16 // yes
+ jmp LLessThan16 // handle up to 15 remaining bytes