X-Git-Url: https://git.saurik.com/apple/libc.git/blobdiff_plain/fbd86d4cc20b02a10edcca92fb7ae0a143e63cc4..1f2f436a38f7ae2d39a943ad2898d8fed4ed2e58:/x86_64/string/memset.s?ds=sidebyside

diff --git a/x86_64/string/memset.s b/x86_64/string/memset.s
index 423db2f..e79c214 100644
--- a/x86_64/string/memset.s
+++ b/x86_64/string/memset.s
@@ -21,7 +21,7 @@
  */
 
 #include <machine/cpu_capabilities.h>
- 
+
 
 /* This file contains the following functions:
  *
@@ -31,7 +31,7 @@
  *      void    memset_pattern16(void *b, const void *c16, size_t len);
  *
  * Calls of memset() with c==0 are routed to the bzero() routine.  Most of the
- * others go to _COMM_PAGE_MEMSET_PATTERN, which is entered as follows:
+ * others go to _memset_pattern, which is entered as follows:
  *      %rdi = ptr to memory to set (aligned)
  *      %edx = length (which can be short, though we bias in favor of long operands)
  *      %xmm0 = the pattern to store
@@ -41,24 +41,23 @@
  * NB: we avoid "stos" family of instructions (stosl, stosb), as they are very slow
  * on P4s and probably other processors.
  */
- 
- #define kShort 255              // for nonzero memset(), too short for commpage
- 
- 
+
+#define kShort  255             // for nonzero memset(), too short for commpage
+
+
 
         .text
         .globl _memset
         .align 2
 _memset:                                // void *memset(void *b, int c, size_t len);
         andl    $0xFF,%esi              // (c==0) ?
         jnz     LNonzero                // not a bzero
- 
-        movq    $(_COMM_PAGE_BZERO),%rax// map memset(p,0,n) into bzero(p,n)
+
         movq    %rdx,%rsi               // put count where bzero() expects it
-        jmp     *%rax                   // enter commpage
+        jmp     _bzero                  // enter _bzero
+
+// Handle memset of a nonzero value.
- 
-// Handle memset of a nonzero value.
- 
 LNonzero:
         movq    %rdi,%r8                // preserve the original pointer so we can return it
         movl    %esi,%eax               // replicate byte in %esi into all four bytes
@@ -69,12 +68,12 @@ LNonzero:
         orl     %esi,%eax               // now %eax has "c" in all 4 bytes
         cmpq    $(kShort),%rdx          // is operand too short for SSE?
         ja      LCallCommpage           // no
- 
-// Nonzero memset() too short to call commpage.
-//      %eax = replicated 4-byte pattern
-//      %rdi = ptr
-//      %edx = length (<= kShort)
- 
+
+        // Nonzero memset() too short to call commpage.
+        //      %eax = replicated 4-byte pattern
+        //      %rdi = ptr
+        //      %edx = length (<= kShort)
+
         cmpl    $16,%edx                // long enough to word align?
         jge     3f                      // yes
         test    %edx,%edx               // length==0?
@@ -109,12 +108,12 @@ LNonzero:
 6:
         movq    %r8,%rax                // get return value (ie, original ptr)
         ret
- 
-// Nonzero memset() is long enough to call commpage.
-//      %eax = replicated 4-byte pattern
-//      %rdi = ptr
-//      %rdx = length (> kShort)
- 
+
+        // Nonzero memset() is long enough to call commpage.
+        //      %eax = replicated 4-byte pattern
+        //      %rdi = ptr
+        //      %rdx = length (> kShort)
+
 LCallCommpage:
         movd    %eax,%xmm0              // move %eax to low 4 bytes of %xmm0
         pshufd  $(0x00),%xmm0,%xmm0     // replicate across the vector
@@ -129,14 +128,13 @@ LCallCommpage:
         subl    $1,%ecx
         jnz     1b
 2:                                      // ptr aligned, length long enough to justify
-        movq    $(_COMM_PAGE_MEMSET_PATTERN),%rax
-        call    *%rax                   // call commpage to do the heavy lifting
+        call    Lmemset_pattern         // call commpage to do the heavy lifting
         movq    %r8,%rax                // get return value (ie, original ptr)
         ret
 
 
-// Handle memset of a 16-byte pattern.
- 
+        // Handle memset of a 16-byte pattern.
+
         .globl _memset_pattern16
         .align 2, 0x90
 _memset_pattern16:                      // void memset_pattern16(void *b, const void *c16, size_t len);
@@ -144,8 +142,8 @@ _memset_pattern16:  // void memset_pattern16(void *b, const void *c16, size_t l
         jmp     LAlignPtr
 
 
-// Handle memset of an 8-byte pattern.
- 
+        // Handle memset of an 8-byte pattern.
+
         .globl _memset_pattern8
         .align 2, 0x90
 _memset_pattern8:                       // void memset_pattern8(void *b, const void *c8, size_t len)
@@ -153,8 +151,8 @@ _memset_pattern8:   // void memset_pattern8(void *b, const void *c8, size_t len)
         punpcklqdq %xmm0,%xmm0          // replicate into all 16
         jmp     LAlignPtr
 
-// Handle memset of a 4-byte pattern.
- 
+        // Handle memset of a 4-byte pattern.
+
         .globl _memset_pattern4
         .align 2, 0x90
 _memset_pattern4:                       // void memset_pattern4(void *b, const void *c4, size_t len)
@@ -162,13 +160,13 @@ _memset_pattern4:  // void memset_pattern4(void *b, const void *c4, size_t len)
         pshufd  $(0x00),%xmm0,%xmm0     // replicate the 4 bytes across the vector
 
 
-// Align ptr if necessary. We must rotate the pattern right for each byte we
-// store while aligning the ptr. Since there is no rotate instruction in SSE3,
-// we have to synthesize the rotates.
-//      %rdi = ptr
-//      %rdx = length
-//      %xmm0 = pattern
- 
+        // Align ptr if necessary. We must rotate the pattern right for each byte we
+        // store while aligning the ptr. Since there is no rotate instruction in SSE3,
+        // we have to synthesize the rotates.
+        //      %rdi = ptr
+        //      %rdx = length
+        //      %xmm0 = pattern
+
 LAlignPtr:                              // NB: can drop down to here!
         cmpq    $100,%rdx               // long enough to bother aligning ptr?
         movq    %rdi,%rcx               // copy ptr
@@ -177,7 +175,7 @@ LAlignPtr:          // NB: can drop down to here!
         andl    $15,%ecx                // get #bytes to align ptr
         jz      LReady                  // already aligned
         subq    %rcx,%rdx               // adjust length
- 
+
         test    $1,%cl                  // 1-byte store required?
         movd    %xmm0,%eax              // get 4 low bytes in %eax
         jz      2f                      // no
@@ -207,10 +205,138 @@ LAlignPtr:         // NB: can drop down to here!
         movq    %xmm0,(%rdi)            // store low 8 bytes of %xmm0
         pshufd  $(0x4e),%xmm0,%xmm0     // rotate %xmm0 right 8 bytes (mask == 01 00 11 10)
         addq    $8,%rdi                 // adjust ptr
- 
-// Ptr is aligned if practical, we're ready to call commpage to do the heavy lifting.
+
+        // Ptr is aligned if practical, we're ready to call commpage to do the heavy lifting.
 LReady:
-        movq    $(_COMM_PAGE_MEMSET_PATTERN),%rax
-        call    *%rax                   // call commpage to do the heavy lifting
+        call    Lmemset_pattern         // call commpage to do the heavy lifting
         ret
+
+
+#define kLShort         63
+#define kVeryLong       (1024*1024)
+
+Lmemset_pattern:
+        cmpq    $(kLShort),%rdx         // long enough to bother aligning?
+        ja      LNotShort               // yes
+        jmp     LShort                  // no
+
+        // Here for short operands or the end of long ones.
+        //      %rdx = length (<= kLShort)
+        //      %rdi = ptr (may not be not aligned)
+        //      %xmm0 = pattern
+
+LUnalignedStore16:
+        movdqu  %xmm0,(%rdi)            // stuff in another 16 bytes
+        subl    $16,%edx
+        addq    $16,%rdi
+LShort:
+        cmpl    $16,%edx                // room for another vector?
+        jge     LUnalignedStore16       // yes
+LLessThan16:                            // here at end of copy with < 16 bytes remaining
+        test    $8,%dl                  // 8-byte store required?
+        jz      2f                      // no
+        movq    %xmm0,(%rdi)            // pack in 8 low bytes
+        psrldq  $8,%xmm0                // then shift vector down 8 bytes
+        addq    $8,%rdi
+2:
+        test    $4,%dl                  // 4-byte store required?
+        jz      3f                      // no
+        movd    %xmm0,(%rdi)            // pack in 4 low bytes
+        psrldq  $4,%xmm0                // then shift vector down 4 bytes
+        addq    $4,%rdi
+3:
+        andl    $3,%edx                 // more to go?
+        jz      5f                      // no
+        movd    %xmm0,%eax              // move remainders out into %eax
+4:                                      // loop on up to three bytes
+        movb    %al,(%rdi)              // pack in next byte
+        shrl    $8,%eax                 // shift next byte into position
+        incq    %rdi
+        dec     %edx
+        jnz     4b
+5:      ret
+
+// Long enough to justify aligning ptr. Note that we have to rotate the
+// pattern to account for any alignment. We do this by doing two unaligned
+// stores, and then an aligned load from the middle of the two stores.
+// This will stall on store forwarding alignment mismatch, and the unaligned
+// stores can be pretty slow too, but the alternatives aren't any better.
+// Fortunately, in most cases our caller has already aligned the ptr.
+//      %rdx = length (> kLShort)
+//      %rdi = ptr (may not be aligned)
+//      %xmm0 = pattern
+
+LNotShort:
+        movl    %edi,%ecx               // copy low bits of dest ptr
+        negl    %ecx
+        andl    $15,%ecx                // mask down to #bytes to 16-byte align
+        jz      LAligned                // skip if already aligned
+        movdqu  %xmm0,(%rdi)            // store 16 unaligned bytes
+        movdqu  %xmm0,16(%rdi)          // and 16 more, to be sure we have an aligned chunk
+        addq    %rcx,%rdi               // now point to the aligned chunk
+        subq    %rcx,%rdx               // adjust remaining count
+        movdqa  (%rdi),%xmm0            // get the rotated pattern (probably stalling)
+        addq    $16,%rdi                // skip past the aligned chunk
+        subq    $16,%rdx
+
+// Set up for 64-byte loops.
+//      %rdx = length remaining
+//      %rdi = ptr (aligned)
+//      %xmm0 = rotated pattern
+
+LAligned:
+        movq    %rdx,%rcx               // copy length remaining
+        andl    $63,%edx                // mask down to residual length (0..63)
+        andq    $-64,%rcx               // %ecx <- #bytes we will zero in by-64 loop
+        jz      LNoMoreChunks           // no 64-byte chunks
+        addq    %rcx,%rdi               // increment ptr by length to move
+        cmpq    $(kVeryLong),%rcx       // long enough to justify non-temporal stores?
+        jge     LVeryLong               // yes
+        negq    %rcx                    // negate length to move
+        jmp     1f
+
+// Loop over 64-byte chunks, storing into cache.
+
+        .align  4,0x90                  // keep inner loops 16-byte aligned
+1:
+        movdqa  %xmm0,(%rdi,%rcx)
+        movdqa  %xmm0,16(%rdi,%rcx)
+        movdqa  %xmm0,32(%rdi,%rcx)
+        movdqa  %xmm0,48(%rdi,%rcx)
+        addq    $64,%rcx
+        jne     1b
+
+        jmp     LNoMoreChunks
+
+// Very long operands: use non-temporal stores to bypass cache.
+
+LVeryLong:
+        negq    %rcx                    // negate length to move
+        jmp     1f
+
+        .align  4,0x90                  // keep inner loops 16-byte aligned
+1:
+        movntdq %xmm0,(%rdi,%rcx)
+        movntdq %xmm0,16(%rdi,%rcx)
+        movntdq %xmm0,32(%rdi,%rcx)
+        movntdq %xmm0,48(%rdi,%rcx)
+        addq    $64,%rcx
+        jne     1b
+
+        sfence                          // required by non-temporal stores
+        jmp     LNoMoreChunks
+
+// Handle leftovers: loop by 16.
+//      %edx = length remaining (<64)
+//      %edi = ptr (aligned)
+//      %xmm0 = rotated pattern
+
+LLoopBy16:
+        movdqa  %xmm0,(%rdi)            // pack in 16 more bytes
+        subl    $16,%edx                // decrement count
+        addq    $16,%rdi                // increment ptr
+LNoMoreChunks:
+        cmpl    $16,%edx                // more to go?
+        jge     LLoopBy16               // yes
+        jmp     LLessThan16             // handle up to 15 remaining bytes
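
The comments in the added Lmemset_pattern code describe the key trick: when the destination is not 16-byte aligned, the pattern itself is rotated (two unaligned stores, then one aligned load that straddles them) so that the later aligned stores still lay bytes down in phase with the start of the buffer. Below is a minimal C sketch of that idea and of the memset_pattern16() semantics it preserves. It is an illustration only, not the shipped implementation; the helper names rotate_pattern and memset_pattern16_ref are invented for this example.

/* Sketch of the pattern-rotation trick and the memset_pattern16() contract. */
#include <stdio.h>
#include <string.h>
#include <stddef.h>

/* Reference behavior of memset_pattern16(): byte i of the destination gets
 * byte (i mod 16) of the pattern, regardless of destination alignment. */
static void memset_pattern16_ref(void *b, const void *c16, size_t len)
{
    unsigned char *dst = b;
    const unsigned char *pat = c16;
    for (size_t i = 0; i < len; i++)
        dst[i] = pat[i & 15];
}

/* Model of the two-unaligned-stores / one-aligned-load rotation at LNotShort.
 * "align" is the number of bytes needed to 16-byte align the destination,
 * i.e. the value computed into %rcx (0..15). */
static void rotate_pattern(unsigned char out[16],
                           const unsigned char pat[16], size_t align)
{
    unsigned char scratch[32];
    memcpy(scratch, pat, 16);           /* movdqu %xmm0,(%rdi)              */
    memcpy(scratch + 16, pat, 16);      /* movdqu %xmm0,16(%rdi)            */
    memcpy(out, scratch + align, 16);   /* movdqa (%rdi),%xmm0 after the
                                           addq %rcx,%rdi adjustment        */
}

int main(void)
{
    const unsigned char pat[16] = "ABCDEFGHIJKLMNOP";
    unsigned char rot[16];
    unsigned char buf[40];

    rotate_pattern(rot, pat, 5);              /* dst needed 5 bytes to align */
    printf("%.16s\n", (const char *)rot);     /* prints FGHIJKLMNOPABCDE     */

    memset_pattern16_ref(buf, pat, sizeof buf);
    printf("%.40s\n", (const char *)buf);     /* pattern repeated 2.5 times  */
    return 0;
}

The store-forwarding stall the comments mention comes from the aligned movdqa reload reading across the two overlapping movdqu stores; the code accepts that one-time cost because it lets the 64-byte inner loops run with plain aligned stores and no per-iteration rotate, and the _memset_pattern* entry points already try to hand over an aligned pointer via LAlignPtr.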