+
+
+#define kLShort 63 // operands this short or shorter are stored without aligning
+#define kVeryLong (1024*1024) // at this many chunked bytes or more, use non-temporal stores
+
+Lmemset_pattern:
+ cmpq $(kLShort),%rdx // long enough to bother aligning?
+ ja LNotShort // yes
+ jmp LShort // no
+
+ // Here for short operands or the end of long ones.
+ // %rdx = length (<= kLShort)
+ // %rdi = ptr (may not be aligned)
+ // %xmm0 = pattern
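+ // For example, with 13 bytes remaining the code below stores the 8 low
+ // pattern bytes, shifts the vector down 8, stores 4 more bytes, shifts
+ // down 4, and finally stores the last byte from %eax, so the pattern
+ // continues unbroken across the 8/4/1 pieces.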
+
+LUnalignedStore16:
+ movdqu %xmm0,(%rdi) // stuff in another 16 bytes
+ subl $16,%edx // decrement count
+ addq $16,%rdi // increment ptr
+LShort:
+ cmpl $16,%edx // room for another vector?
+ jge LUnalignedStore16 // yes
+LLessThan16: // here at end of copy with < 16 bytes remaining
+ test $8,%dl // 8-byte store required?
+ jz 2f // no
+ movq %xmm0,(%rdi) // pack in 8 low bytes
+ psrldq $8,%xmm0 // then shift vector down 8 bytes
+ addq $8,%rdi
+2:
+ test $4,%dl // 4-byte store required?
+ jz 3f // no
+ movd %xmm0,(%rdi) // pack in 4 low bytes
+ psrldq $4,%xmm0 // then shift vector down 4 bytes
+ addq $4,%rdi
+3:
+ andl $3,%edx // more to go?
+ jz 5f // no
+ movd %xmm0,%eax // move remaining pattern bytes into %eax
+4: // loop on up to three bytes
+ movb %al,(%rdi) // pack in next byte
+ shrl $8,%eax // shift next byte into position
+ incq %rdi
+ dec %edx
+ jnz 4b
+5: ret
+
+// Long enough to justify aligning ptr. Note that we have to rotate the
+// pattern to account for any alignment. We do this by doing two unaligned
+// stores, and then an aligned load from the middle of the two stores.
+// This will stall (the aligned load straddles both stores, so store-to-load
+// forwarding cannot satisfy it), and the unaligned stores can be pretty slow
+// too, but the alternatives aren't any better.
+// Fortunately, in most cases our caller has already aligned the ptr.
+// %rdx = length (> kLShort)
+// %rdi = ptr (may not be aligned)
+// %xmm0 = pattern
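+// For example, if the ptr's low four bits are 5, then %rcx below becomes
+// ((-5) & 15) == 11. The two movdqu stores lay down pattern bytes 0..15 at
+// ptr and again at ptr+16, so the movdqa load from the aligned address
+// ptr+11 picks up bytes 11..15 followed by bytes 0..10: the pattern rotated
+// left by 11, which is exactly what must be stored at aligned addresses to
+// continue the byte sequence begun at ptr.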
+
+LNotShort:
+ movl %edi,%ecx // copy low bits of dest ptr
+ negl %ecx
+ andl $15,%ecx // mask down to #bytes to 16-byte align
+ jz LAligned // skip if already aligned
+ movdqu %xmm0,(%rdi) // store 16 unaligned bytes
+ movdqu %xmm0,16(%rdi) // and 16 more, to be sure we have an aligned chunk
+ addq %rcx,%rdi // now point to the aligned chunk
+ subq %rcx,%rdx // adjust remaining count
+ movdqa (%rdi),%xmm0 // get the rotated pattern (probably stalling)
+ addq $16,%rdi // skip past the aligned chunk
+ subq $16,%rdx
+
+// Set up for 64-byte loops.
+// %rdx = length remaining
+// %rdi = ptr (aligned)
+// %xmm0 = rotated pattern
+
+LAligned:
+ movq %rdx,%rcx // copy length remaining
+ andl $63,%edx // mask down to residual length (0..63)
+ andq $-64,%rcx // %rcx <- #bytes we will store in by-64 loop
+ jz LNoMoreChunks // no 64-byte chunks
+ addq %rcx,%rdi // increment ptr by length to move
+ cmpq $(kVeryLong),%rcx // long enough to justify non-temporal stores?
+ jge LVeryLong // yes
+ negq %rcx // negate length to move
+ jmp 1f
+
+// Loop over 64-byte chunks, storing into cache.
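+// %rdi was advanced past the end of the chunked region and %rcx holds the
+// negated byte count, so the stores at (%rdi,%rcx) walk forward through the
+// chunks and the addq sets the zero flag when %rcx reaches zero, ending the
+// loop without a separate compare.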
+
+ .align 4,0x90 // keep inner loops 16-byte aligned
+1:
+ movdqa %xmm0,(%rdi,%rcx)
+ movdqa %xmm0,16(%rdi,%rcx)
+ movdqa %xmm0,32(%rdi,%rcx)
+ movdqa %xmm0,48(%rdi,%rcx)
+ addq $64,%rcx
+ jne 1b
+
+ jmp LNoMoreChunks
+
+// Very long operands: use non-temporal stores to bypass cache.
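+// Streaming this much data through the cache would displace its entire
+// contents for little benefit, so movntdq writes around it; the sfence below
+// then orders these stores ahead of anything that follows.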
+
+LVeryLong:
+ negq %rcx // negate length to move
+ jmp 1f
+
+ .align 4,0x90 // keep inner loops 16-byte aligned
+1:
+ movntdq %xmm0,(%rdi,%rcx)
+ movntdq %xmm0,16(%rdi,%rcx)
+ movntdq %xmm0,32(%rdi,%rcx)
+ movntdq %xmm0,48(%rdi,%rcx)
+ addq $64,%rcx
+ jne 1b
+
+ sfence // required by non-temporal stores
+ jmp LNoMoreChunks
+
+// Handle leftovers: loop by 16.
+// %edx = length remaining (<64)
+// %rdi = ptr (aligned)
+// %xmm0 = rotated pattern
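+// Both the cached and the non-temporal loops rejoin at LNoMoreChunks with at
+// most 63 bytes left; once fewer than 16 remain we finish via LLessThan16 above.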
+
+LLoopBy16:
+ movdqa %xmm0,(%rdi) // pack in 16 more bytes
+ subl $16,%edx // decrement count
+ addq $16,%rdi // increment ptr
+LNoMoreChunks:
+ cmpl $16,%edx // more to go?
+ jge LLoopBy16 // yes
+ jmp LLessThan16 // handle up to 15 remaining bytes