git.saurik.com Git - apple/libc.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_LICENSE_HEADER_START@
	5	*
	6	* The contents of this file constitute Original Code as defined in and
	7	* are subject to the Apple Public Source License Version 1.1 (the
	8	* "License"). You may not use this file except in compliance with the
	9	* License. Please obtain a copy of the License at
	10	* http://www.apple.com/publicsource and read it before using this file.
	11	*
	12	* This Original Code and all software distributed under the License are
	13	* distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	14	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	15	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	16	* FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
	17	* License for the specific language governing rights and limitations
	18	* under the License.
	19	*
	20	* @APPLE_LICENSE_HEADER_END@
	21	*/
	22
	23	#include <machine/cpu_capabilities.h>
	24
	25
	26	/* This file contains the following functions:
	27	*
	28	* void *memset(void *b, int c, size_t len);
	29	* void memset_pattern4(void *b, const void *c4, size_t len);
	30	* void memset_pattern8(void *b, const void *c8, size_t len);
	31	* void memset_pattern16(void *b, const void *c16, size_t len);
	32	*
	33	* Calls of memset() with c==0 are routed to the bzero() routine. Most of the
	34	* others go to _COMM_PAGE_MEMSET_PATTERN, which is entered as follows:
	35	* %rdi = ptr to memory to set (aligned)
	36	* %rdx = length (which can be short, though we bias in favor of long operands)
	37	* %xmm0 = the pattern to store
	38	* Return conditions:
	39	* %eax, %edi, %esi, %ecx, and %edx all trashed
	40	*
	41	* NB: we avoid "stos" family of instructions (stosl, stosb), as they are very slow
	42	* on P4s and probably other processors.
	43	*/
	44
	45	#define kShort 255 // nonzero memset() of at most this many bytes is done inline; longer ones call the commpage
	46
	47
	48	.text
	49	.globl _memset
	50	.align 2
		// void *memset(void *b, int c, size_t len)
		//
		// Stores the low byte of c into len bytes starting at b.
		// In:  %rdi = b, %esi = c (only the low 8 bits are used), %rdx = len
		// Out: %rax = b on the nonzero paths (b is saved in %r8 at LNonzero).
		//      When c==0 control is transferred to _COMM_PAGE_BZERO with a jmp, so
		//      %rax is whatever that routine leaves there.
		//      NOTE(review): confirm the commpage bzero returns the original pointer.
		// Three paths:
		//   c == 0                -> jump to _COMM_PAGE_BZERO
		//   c != 0, len <= kShort -> inline byte/doubleword store loops
		//   c != 0, len >  kShort -> align ptr to 16 bytes, then call _COMM_PAGE_MEMSET_PATTERN
	51	_memset: // void *memset(void *b, int c, size_t len);
	52	andl $0xFF,%esi // keep only the low byte of c; ZF is set iff that byte is 0
	53	jnz LNonzero // not a bzero
	54
	55	movq $(_COMM_PAGE_BZERO),%rax// map memset(p,0,n) into bzero(p,n)
	56	movq %rdx,%rsi // put count where bzero() expects it
	57	jmp *%rax // enter commpage with a jump, so its ret returns to our caller
	58
	59
	60	// Handle memset of a nonzero value.
	61
	62	LNonzero:
	63	movq %rdi,%r8 // preserve the original pointer so we can return it
	64	movl %esi,%eax // replicate byte in %esi into all four bytes
	65	shll $8,%esi // %esi = c << 8
	66	orl %esi,%eax // %eax now has c in its two low bytes
	67	movl %eax,%esi
	68	shll $16,%esi // move that byte pair into the upper half
	69	orl %esi,%eax // now %eax has "c" in all 4 bytes
	70	cmpq $(kShort),%rdx // is operand too short for SSE?
	71	ja LCallCommpage // no
	72
	73	// Nonzero memset() too short to call commpage.
	74	// %eax = replicated 4-byte pattern
	75	// %rdi = ptr
	76	// %edx = length (<= kShort)
		// The 32-bit and signed operations on %edx below are safe because the
		// length is known to be at most kShort here.
	77
	78	cmpl $16,%edx // long enough to word align?
	79	jge 3f // yes
	80	test %edx,%edx // length==0?
	81	jz 6f
	82	1: // byte-at-a-time loop for lengths 1..15
	83	movb %al,(%rdi) // pack in a byte
	84	addq $1,%rdi
	85	subl $1,%edx
	86	jnz 1b
	87	jmp 6f
	88	2: // reached only from the alignment test at 3: -- store one byte and retest
	89	movb %al,(%rdi) // pack in a byte
	90	addq $1,%rdi
	91	subl $1,%edx
	92	3:
	93	test $3,%edi // is ptr doubleword aligned?
	94	jnz 2b // no
		// At most 3 bytes were stored to align, so at least 13 remain and the
		// doubleword count computed below is never zero on entry to loop 4.
	95	movl %edx,%ecx // copy length
	96	shrl $2,%edx // #doublewords to store
	97	4:
	98	movl %eax,(%rdi) // store aligned doubleword
	99	addq $4,%rdi
	100	subl $1,%edx
	101	jnz 4b
	102	andl $3,%ecx // any leftover bytes?
	103	jz 6f // no
	104	5: // store the 1..3 trailing bytes
	105	movb %al,(%rdi) // pack in a byte
	106	addq $1,%rdi
	107	subl $1,%ecx
	108	jnz 5b
	109	6:
	110	movq %r8,%rax // get return value (ie, original ptr)
	111	ret
	112
	113	// Nonzero memset() is long enough to call commpage.
	114	// %eax = replicated 4-byte pattern
	115	// %rdi = ptr
	116	// %rdx = length (> kShort)
	117
	118	LCallCommpage:
	119	movd %eax,%xmm0 // move %eax to low 4 bytes of %xmm0
	120	pshufd $(0x00),%xmm0,%xmm0 // replicate across the vector
	121	movq %rdi,%rcx // copy dest ptr
	122	negl %ecx // low bits of -ptr: distance up to the next boundary
	123	andl $15,%ecx // get #bytes to align ptr
	124	jz 2f // skip if already aligned
	125	subq %rcx,%rdx // decrement length
	126	1: // store %ecx (1..15) single bytes; %al still holds c
	127	movb %al,(%rdi) // pack in a byte
	128	addq $1,%rdi
	129	subl $1,%ecx
	130	jnz 1b
	131	2: // ptr aligned, length long enough to justify
	132	movq $(_COMM_PAGE_MEMSET_PATTERN),%rax
	133	call *%rax // call commpage to do the heavy lifting
		// The return value below relies on the commpage routine leaving %r8 intact
		// (the header comment's list of trashed registers does not include it).
	134	movq %r8,%rax // get return value (ie, original ptr)
	135	ret
	136
	137
	138	// Handle memset of a 16-byte pattern.
		// void memset_pattern16(void *b, const void *c16, size_t len)
		// In: %rdi = b, %rsi = c16 (address of the 16 pattern bytes), %rdx = len
		// Loads the whole pattern into %xmm0 and joins the shared code at LAlignPtr;
		// %rdi and %rdx are passed through untouched.
	139
	140	.globl _memset_pattern16
	141	.align 2, 0x90
	142	_memset_pattern16: // void memset_pattern16(void *b, const void *c16, size_t len);
	143	movdqu (%rsi),%xmm0 // load the pattern (unaligned load, so c16 needs no alignment)
	144	jmp LAlignPtr
	145
	146
	147	// Handle memset of an 8-byte pattern.
		// void memset_pattern8(void *b, const void *c8, size_t len)
		// In: %rdi = b, %rsi = c8 (address of the 8 pattern bytes), %rdx = len
		// Builds a 16-byte pattern in %xmm0 (the 8 bytes twice) and joins the shared
		// code at LAlignPtr; %rdi and %rdx are passed through untouched.
	148
	149	.globl _memset_pattern8
	150	.align 2, 0x90
	151	_memset_pattern8: // void memset_pattern8(void *b, const void *c8, size_t len);
	152	movq (%rsi),%xmm0 // load pattern into low 8 bytes
	153	punpcklqdq %xmm0,%xmm0 // replicate into all 16
	154	jmp LAlignPtr
	155
	156	// Handle memset of a 4-byte pattern.
		// void memset_pattern4(void *b, const void *c4, size_t len)
		// In: %rdi = b, %rsi = c4 (address of the 4 pattern bytes), %rdx = len
		// Builds a 16-byte pattern in %xmm0 (the 4 bytes four times) and falls
		// through into LAlignPtr, which _memset_pattern8/16 reach with a jmp.
	157
	158	.globl _memset_pattern4
	159	.align 2, 0x90
	160	_memset_pattern4: // void memset_pattern4(void *b, const void *c4, size_t len);
	161	movd (%rsi),%xmm0 // load pattern into low 4 bytes
	162	pshufd $(0x00),%xmm0,%xmm0 // replicate the 4 bytes across the vector
	163
	164
	165	// Align ptr if necessary. We must rotate the pattern right for each byte we
	166	// store while aligning the ptr. Since there is no rotate instruction in SSE3,
	167	// we have to synthesize the rotates.
	168	// %rdi = ptr
	169	// %rdx = length
	170	// %xmm0 = pattern
		//
		// The number of bytes needed to reach a 16-byte boundary (1..15) is handled
		// one bit at a time: a 1-, 2-, 4- and/or 8-byte store of the low pattern
		// bytes, each followed by a right rotate of %xmm0 by the same byte count, so
		// the pattern handed to the commpage continues where the stores left off.
		// Operands shorter than 100 bytes skip the alignment and go to LReady as is.
	171
	172	LAlignPtr: // NB: can drop down to here!
	173	cmpq $100,%rdx // long enough to bother aligning ptr?
	174	movq %rdi,%rcx // copy ptr (mov leaves the flags from cmpq for the jb)
	175	jb LReady // not long enough
	176	negl %ecx // low bits of -ptr: distance up to the next boundary
	177	andl $15,%ecx // get #bytes to align ptr
	178	jz LReady // already aligned
	179	subq %rcx,%rdx // adjust length
	180
	181	test $1,%cl // 1-byte store required?
	182	movd %xmm0,%eax // get 4 low bytes in %eax (the jz below still tests the result of "test")
	183	jz 2f // no
	184	movdqa %xmm0,%xmm1 // copy pattern so we can shift in both directions
	185	movb %al,(%rdi) // pack in the low-order byte
	186	psrldq $1,%xmm0 // shift pattern right 1 byte
	187	addq $1,%rdi
	188	pslldq $15,%xmm1 // shift pattern left 15 bytes
	189	shrl $8,%eax // in case 2-byte store is required (%ax = low word of the rotated pattern)
	190	por %xmm1,%xmm0 // complete right rotate of pattern by 1 byte
	191	2:
	192	test $2,%cl // 2-byte store required?
	193	jz 4f // no
		// %ax holds the low word of the current pattern whether or not the
		// 1-byte step above was taken.
	194	psrldq $2,%xmm0 // shift pattern down 2 bytes
	195	movw %ax,(%rdi) // pack in next two bytes
	196	pinsrw $7,%eax,%xmm0 // insert low word of %eax into high word of %xmm0
	197	addq $2,%rdi // adjust ptr
	198	4:
	199	test $4,%cl // 4-byte store required?
	200	jz 8f // no
	201	movd %xmm0,(%rdi) // store low 4 bytes of %xmm0
	202	pshufd $(0x39),%xmm0,%xmm0 // rotate %xmm0 right 4 bytes (mask == 00 11 10 01)
	203	addq $4,%rdi // adjust ptr
	204	8:
	205	test $8,%cl // 8-byte store required?
	206	jz LReady // no
	207	movq %xmm0,(%rdi) // store low 8 bytes of %xmm0
	208	pshufd $(0x4e),%xmm0,%xmm0 // rotate %xmm0 right 8 bytes (mask == 01 00 11 10)
	209	addq $8,%rdi // adjust ptr
	210
	211	// Ptr is aligned if practical, we're ready to call commpage to do the heavy lifting.
		// At this point: %rdi = next byte to store, %rdx = bytes remaining,
		// %xmm0 = pattern rotated to match %rdi.
	212
	213	LReady:
	214	movq $(_COMM_PAGE_MEMSET_PATTERN),%rax
	215	call *%rax // call commpage to do the heavy lifting
	216	ret