-/*
- * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_LICENSE_HEADER_START@
- *
- * The contents of this file constitute Original Code as defined in and
- * are subject to the Apple Public Source License Version 1.1 (the
- * "License"). You may not use this file except in compliance with the
- * License. Please obtain a copy of the License at
- * http://www.apple.com/publicsource and read it before using this file.
- *
- * This Original Code and all software distributed under the License are
- * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
- * License for the specific language governing rights and limitations
- * under the License.
- *
- * @APPLE_LICENSE_HEADER_END@
- */
-
-#include <machine/cpu_capabilities.h>
-
-
-/* This file contains the following functions:
- *
- * void *memset(void *b, int c, size_t len);
- * void memset_pattern4(void *b, const void *c4, size_t len);
- * void memset_pattern8(void *b, const void *c8, size_t len);
- * void memset_pattern16(void *b, const void *c16, size_t len);
- *
- * Calls of memset() with c==0 are routed to the bzero() routine.  Nonzero
- * memset() calls longer than kShort bytes, and all of the memset_patternN()
- * calls, go to _COMM_PAGE_MEMSET_PATTERN, which is entered as follows:
- * %edi = ptr to memory to set (16-byte aligned, when the operand is long enough
- *        to make aligning it worthwhile)
- * %edx = length (which can be short, though we bias in favor of long operands)
- * %xmm0 = the pattern to store
- * Return conditions:
- * %eax, %edi, %esi, %ecx, and %edx all trashed
- *
- * NB: we avoid the "stos" family of instructions (stosl, stosb), as they are
- * very slow on P4s and probably on other processors as well.
- */
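
For reference, a minimal C usage sketch of the four routines implemented here (the buffer and pattern values are made up for illustration; the memset_patternN prototypes come from Darwin's <string.h>). The length passed to the pattern routines need not be a multiple of the pattern size; the final repetition is simply truncated.

    #include <stdint.h>
    #include <string.h>   /* memset() and, on Darwin, memset_pattern4/8/16() */

    int main(void) {
        uint8_t buf[256];                          /* made-up buffer for the sketch */

        memset(buf, 0, sizeof(buf));               /* c==0: routed to bzero() */
        memset(buf, 0xAB, sizeof(buf));            /* nonzero: inline or commpage */

        uint32_t p4 = 0xDEADBEEF;
        memset_pattern4(buf, &p4, sizeof(buf));    /* repeat a 4-byte pattern */

        uint8_t p16[16] = { 0, 1, 2, 3, 4, 5, 6, 7,
                            8, 9,10,11,12,13,14,15 };
        memset_pattern16(buf, p16, sizeof(buf));   /* repeat a 16-byte pattern */
        return 0;
    }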
-
- #define kShort 255 // nonzero memset() operands this short are handled inline, without the commpage
-
-
- .text
- .globl _memset
- .align 2
-_memset: // void *memset(void *b, int c, size_t len);
- movl 8(%esp),%eax // get 1-byte pattern
- movl 12(%esp),%edx // get length
- andl $0xFF,%eax // (c==0) ?
- jnz LNonzero // not a bzero
-
- movl $(_COMM_PAGE_BZERO),%eax // map memset(p,0,n) into bzero(p,n)
- movl %edx,8(%esp) // put count where bzero() expects it
- jmp *%eax // enter commpage (indirect jump through %eax)
-
-
- // Handle memset of a nonzero value.
-
-LNonzero:
- pushl %edi // save a few nonvolatiles
- pushl %esi
- movl %eax,%esi // replicate byte in %al into all four bytes
- movl 12(%esp),%edi // point to operand
- shll $8,%esi
- orl %esi,%eax
- movl %eax,%esi
- shll $16,%esi
- orl %esi,%eax // now %eax has "c" in all 4 bytes
- cmpl $(kShort),%edx // is operand too short for SSE?
- ja LCallCommpage // no
-
-// Nonzero memset() too short to call commpage.
-// %eax = replicated 4-byte pattern
-// %edi = ptr
-// %edx = length (<= kShort)
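
The short path that follows can be read as roughly the C below; a sketch only, with a made-up helper name, mirroring the flow of the assembly rather than matching it instruction for instruction. At this point %eax already holds the fill byte replicated into all four byte lanes, built above with the shift/or pairs (in C terms, v |= v << 8; v |= v << 16).

    #include <stddef.h>
    #include <stdint.h>

    /* sketch of the inline path: nonzero memset() with len <= kShort */
    static void *memset_short_sketch(void *b, uint32_t v /* byte replicated 4x */,
                                     size_t len) {
        uint8_t *p = b;
        if (len >= 16) {
            while ((uintptr_t)p & 3) {        /* byte stores until 4-byte aligned */
                *p++ = (uint8_t)v;
                len--;
            }
            for (; len >= 4; len -= 4, p += 4)
                *(uint32_t *)p = v;           /* aligned doubleword stores */
        }
        while (len--)                         /* whole operand if short, else the tail */
            *p++ = (uint8_t)v;
        return b;
    }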
-
- cmpl $16,%edx // long enough to bother aligning to a 4-byte boundary?
- jge 3f // yes
- test %edx,%edx // length==0?
- jz 6f
-1:
- movb %al,(%edi) // pack in a byte
- inc %edi
- dec %edx
- jnz 1b
- jmp 6f
-2:
- movb %al,(%edi) // pack in a byte
- inc %edi
- dec %edx
-3:
- test $3,%edi // is ptr doubleword aligned?
- jnz 2b // no
- movl %edx,%ecx // copy length
- shrl $2,%edx // #doublewords to store
-4:
- movl %eax,(%edi) // store aligned doubleword
- addl $4,%edi
- dec %edx
- jnz 4b
- andl $3,%ecx // any leftover bytes?
- jz 6f // no
-5:
- movb %al,(%edi) // pack in a byte
- inc %edi
- dec %ecx
- jnz 5b
-6:
- movl 12(%esp),%eax // get return value (i.e., the original ptr)
- popl %esi
- popl %edi
- ret
-
-// Nonzero memset() is long enough to call commpage.
-// %eax = replicated 4-byte pattern
-// %edi = ptr
-// %edx = length (> kShort)
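
In rough C with SSE2 intrinsics, the handoff below looks like the following sketch. The commpage_memset_pattern() stand-in is hypothetical; the real code reaches the routine through _COMM_PAGE_MEMSET_PATTERN with %edi, %edx, and %xmm0 set up as described in the header comment.

    #include <stddef.h>
    #include <stdint.h>
    #include <emmintrin.h>                    /* SSE2 intrinsics */

    /* hypothetical stand-in for the commpage routine */
    extern void commpage_memset_pattern(uint8_t *dst, size_t len, __m128i pattern);

    static void memset_long_sketch(void *b, uint32_t v /* byte replicated 4x */,
                                   size_t len) {
        uint8_t *p = b;
        /* movd + pshufd $0x00: broadcast the 4-byte value across the vector */
        __m128i pattern = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)v), 0x00);

        size_t lead = (size_t)(-(uintptr_t)p) & 15;   /* bytes to reach 16-byte alignment */
        len -= lead;
        while (lead--)                                /* byte stores until aligned */
            *p++ = (uint8_t)v;

        commpage_memset_pattern(p, len, pattern);     /* the commpage does the rest */
    }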
-
-LCallCommpage:
- movd %eax,%xmm0 // move %eax to low 4 bytes of %xmm0
- pshufd $(0x00),%xmm0,%xmm0 // replicate across the vector
- movl %edi,%ecx // copy dest ptr
- negl %ecx
- andl $15,%ecx // get #bytes to align ptr
- jz 2f // skip if already aligned
- subl %ecx,%edx // decrement length
-1:
- movb %al,(%edi) // pack in a byte
- inc %edi
- dec %ecx
- jnz 1b
-2: // ptr is now 16-byte aligned, and length is long enough to justify the commpage
- movl $(_COMM_PAGE_MEMSET_PATTERN),%eax
- call *%eax // call commpage to do the heavy lifting (indirect call through %eax)
- movl 12(%esp),%eax // get return value (i.e., the original ptr)
- popl %esi
- popl %edi
- ret
-
-
-// Handle memset of a 16-byte pattern.
-
- .globl _memset_pattern16
- .align 2, 0x90
-_memset_pattern16: // void memset_pattern16(void *b, const void *c16, size_t len);
- pushl %edi
- pushl %esi
- movl 20(%esp),%edx // get length
- movl 16(%esp),%esi // get ptr to 16-byte pattern
- movl 12(%esp),%edi // point to operand
- movdqu (%esi),%xmm0 // load the pattern
- jmp LAlignPtr
-
-
-// Handle memset of an 8-byte pattern.
-
- .globl _memset_pattern8
- .align 2, 0x90
-_memset_pattern8: // void memset_pattern8(void *b, const void *c8, size_t len);
- pushl %edi
- pushl %esi
- movl 20(%esp),%edx // get length
- movl 16(%esp),%esi // get ptr to 8-byte pattern
- movl 12(%esp),%edi // point to operand
- movq (%esi),%xmm0 // load pattern into low 8 bytes
- punpcklqdq %xmm0,%xmm0 // replicate into all 16
- jmp LAlignPtr
-
-// Handle memset of a 4-byte pattern.
-
- .globl _memset_pattern4
- .align 2, 0x90
-_memset_pattern4: // void memset_pattern4(void *b, const void *c4, size_t len);
- pushl %edi
- pushl %esi
- movl 20(%esp),%edx // get length
- movl 16(%esp),%esi // get ptr to 4-byte pattern
- movl 12(%esp),%edi // point to operand
- movd (%esi),%xmm0 // load pattern into low 4 bytes
- pshufd $(0x00),%xmm0,%xmm0 // replicate the 4 bytes across the vector
-
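
All three memset_patternN() entry points above reach LAlignPtr with a full 16-byte pattern in %xmm0 (pattern16 and pattern8 jump there; pattern4 falls through). In SSE2 intrinsics the three pattern loads correspond roughly to the helpers below; the helper names are illustrative, and the parameter names match the prototypes in the header comment.

    #include <emmintrin.h>                    /* SSE2 intrinsics */

    static __m128i load_pattern16(const void *c16) {
        return _mm_loadu_si128((const __m128i *)c16);        /* movdqu */
    }
    static __m128i load_pattern8(const void *c8) {
        __m128i lo = _mm_loadl_epi64((const __m128i *)c8);   /* movq */
        return _mm_unpacklo_epi64(lo, lo);                    /* punpcklqdq */
    }
    static __m128i load_pattern4(const void *c4) {
        __m128i lo = _mm_cvtsi32_si128(*(const int *)c4);    /* movd */
        return _mm_shuffle_epi32(lo, 0x00);                   /* pshufd $0x00 */
    }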
-
-// Align the ptr if necessary. We must rotate the pattern right by one byte for
-// every byte we store while aligning the ptr. Since SSE3 has no 128-bit byte
-// rotate instruction, we synthesize the rotates from shifts, shuffles, and inserts.
-// %edi = ptr
-// %edx = length
-// %xmm0 = pattern
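
What the shift/shuffle/insert sequences below synthesize is a plain byte rotate of the pattern, performed incrementally, one bit of the misalignment count at a time (1, 2, 4, then 8 bytes). In scalar C the rotation is simply the following (the function name is mine, for illustration):

    #include <stdint.h>
    #include <string.h>

    /* Rotate a 16-byte pattern right by k bytes (0 <= k < 16): after storing k
     * leading bytes, pat[k] must become the new low byte of the pattern. */
    static void rotate_pattern_right(uint8_t pat[16], unsigned k) {
        uint8_t tmp[16];
        for (unsigned i = 0; i < 16; i++)
            tmp[i] = pat[(i + k) & 15];
        memcpy(pat, tmp, 16);
    }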
-
-LAlignPtr: // NB: _memset_pattern4 falls through to here!
- cmpl $100,%edx // long enough to bother aligning ptr?
- movl %edi,%ecx // copy ptr
- jb LReady // not long enough
- negl %ecx
- andl $15,%ecx // get #bytes to align ptr
- jz LReady // already aligned
- subl %ecx,%edx // adjust length
-
- test $1,%cl // 1-byte store required?
- movd %xmm0,%eax // get 4 low bytes in %eax
- jz 2f // no
- movdqa %xmm0,%xmm1 // copy pattern so we can shift in both directions
- movb %al,(%edi) // pack in the low-order byte
- psrldq $1,%xmm0 // shift pattern right 1 byte
- inc %edi
- pslldq $15,%xmm1 // shift pattern left 15 bytes
- shrl $8,%eax // in case 2-byte store is required
- por %xmm1,%xmm0 // complete right rotate of pattern by 1 byte
-2:
- test $2,%cl // 2-byte store required?
- jz 4f // no
- psrldq $2,%xmm0 // shift pattern right 2 bytes
- movw %ax,(%edi) // pack in next two bytes
- pinsrw $7,%eax,%xmm0 // insert low word of %eax into high word of %xmm0
- addl $2,%edi // adjust ptr
-4:
- test $4,%cl // 4-byte store required?
- jz 8f // no
- movd %xmm0,(%edi) // store low 4 bytes of %xmm0
- pshufd $(0x39),%xmm0,%xmm0 // rotate %xmm0 right 4 bytes (mask == 00 11 10 01)
- addl $4,%edi // adjust ptr
-8:
- test $8,%cl // 8-byte store required?
- jz LReady // no
- movq %xmm0,(%edi) // store low 8 bytes of %xmm0
- pshufd $(0x4e),%xmm0,%xmm0 // rotate %xmm0 right 8 bytes (mask == 01 00 11 10)
- addl $8,%edi // adjust ptr
-
-// The ptr is now aligned if it was practical to align it; we're ready to call the commpage to do the heavy lifting.
-
-LReady:
- movl $(_COMM_PAGE_MEMSET_PATTERN),%eax
- call *%eax // call commpage to do the heavy lifting (indirect call through %eax)
- popl %esi
- popl %edi
- ret