/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include

/* This file contains the following functions:
 *
 *	void *memset(void *b, int c, size_t len);
 *	void  memset_pattern4(void *b, const void *c4, size_t len);
 *	void  memset_pattern8(void *b, const void *c8, size_t len);
 *	void  memset_pattern16(void *b, const void *c16, size_t len);
 *
 * Calls of memset() with c==0 are routed to the bzero() routine.  Most of the
 * others go to _memset_pattern, which is entered as follows:
 *	%rdi = ptr to memory to set (aligned)
 *	%edx = length (which can be short, though we bias in favor of long operands)
 *	%xmm0 = the pattern to store
 * Return conditions:
 *	%eax, %edi, %esi, %ecx, and %edx all trashed
 *
 * NB: we avoid "stos" family of instructions (stosl, stosb), as they are very slow
 * on P4s and probably other processors.
 */

#define kShort  255                     // for nonzero memset(), too short for commpage

        .text
        .globl  _memset
        .align  2
_memset:                                // void *memset(void *b, int c, size_t len);
        andl    $0xFF,%esi              // (c==0) ?
        jnz     LNonzero                // not a bzero
        movq    %rdx,%rsi               // put count where bzero() expects it
        jmp     _bzero                  // enter _bzero

// Handle memset of a nonzero value.

LNonzero:
        movq    %rdi,%r8                // preserve the original pointer so we can return it
        movl    %esi,%eax               // replicate byte in %esi into all four bytes
        shll    $8,%esi
        orl     %esi,%eax
        movl    %eax,%esi
        shll    $16,%esi
        orl     %esi,%eax               // now %eax has "c" in all 4 bytes
        cmpq    $(kShort),%rdx          // is operand too short for SSE?
        ja      LCallCommpage           // no

// Nonzero memset() too short to call commpage.
//	%eax = replicated 4-byte pattern
//	%rdi = ptr
//	%edx = length (<= kShort)

        cmpl    $16,%edx                // long enough to word align?
        jge     3f                      // yes
        test    %edx,%edx               // length==0?
        jz      6f
1:
        movb    %al,(%rdi)              // pack in a byte
        addq    $1,%rdi
        subl    $1,%edx
        jnz     1b
        jmp     6f
2:
        movb    %al,(%rdi)              // pack in a byte
        addq    $1,%rdi
        subl    $1,%edx
3:
        test    $3,%edi                 // is ptr doubleword aligned?
        jnz     2b                      // no
        movl    %edx,%ecx               // copy length
        shrl    $2,%edx                 // #doublewords to store
4:
        movl    %eax,(%rdi)             // store aligned doubleword
        addq    $4,%rdi
        subl    $1,%edx
        jnz     4b
        andl    $3,%ecx                 // any leftover bytes?
        jz      6f                      // no
5:
        movb    %al,(%rdi)              // pack in a byte
        addq    $1,%rdi
        subl    $1,%ecx
        jnz     5b
6:
        movq    %r8,%rax                // get return value (ie, original ptr)
        ret
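/*
 * Illustrative C model (not assembled; the name memset_short_model is
 * hypothetical).  It sketches what the "<= kShort" path above does:
 * replicate the fill byte into a 32-bit word with shifts and ORs, store
 * single bytes until the ptr is doubleword aligned, store aligned
 * doublewords, then finish with the leftover bytes.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void *memset_short_model(void *b, int c, size_t len)
 *	{
 *	    unsigned char *p = b;
 *	    uint32_t pat = (uint8_t)c;
 *	    pat |= pat << 8;                   // like the shll/orl pairs above
 *	    pat |= pat << 16;                  // "c" now in all 4 bytes
 *
 *	    if (len >= 16) {                   // worth aligning to a doubleword?
 *	        while ((uintptr_t)p & 3) {     // byte stores until 4-byte aligned
 *	            *p++ = (uint8_t)pat;
 *	            len--;
 *	        }
 *	        for (; len >= 4; len -= 4, p += 4)
 *	            memcpy(p, &pat, 4);        // aligned doubleword stores
 *	    }
 *	    while (len--)                      // trailing bytes
 *	        *p++ = (uint8_t)pat;
 *	    return b;
 *	}
 */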
// Nonzero memset() is long enough to call commpage.
//	%eax = replicated 4-byte pattern
//	%rdi = ptr
//	%rdx = length (> kShort)

LCallCommpage:
        movd    %eax,%xmm0              // move %eax to low 4 bytes of %xmm0
        pshufd  $(0x00),%xmm0,%xmm0     // replicate across the vector
        movq    %rdi,%rcx               // copy dest ptr
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align ptr
        jz      2f                      // skip if already aligned
        subq    %rcx,%rdx               // decrement length
1:
        movb    %al,(%rdi)              // pack in a byte
        addq    $1,%rdi
        subl    $1,%ecx
        jnz     1b
2:                                      // ptr aligned, length long enough to justify
        call    Lmemset_pattern         // call commpage to do the heavy lifting
        movq    %r8,%rax                // get return value (ie, original ptr)
        ret

// Handle memset of a 16-byte pattern.

        .globl  _memset_pattern16
        .align  2, 0x90
_memset_pattern16:                      // void memset_pattern16(void *b, const void *c16, size_t len);
        movdqu  (%rsi),%xmm0            // load the pattern
        jmp     LAlignPtr

// Handle memset of an 8-byte pattern.

        .globl  _memset_pattern8
        .align  2, 0x90
_memset_pattern8:                       // void memset_pattern8(void *b, const void *c8, size_t len);
        movq    (%rsi),%xmm0            // load pattern into low 8 bytes
        punpcklqdq %xmm0,%xmm0          // replicate into all 16
        jmp     LAlignPtr

// Handle memset of a 4-byte pattern.

        .globl  _memset_pattern4
        .align  2, 0x90
_memset_pattern4:                       // void memset_pattern4(void *b, const void *c4, size_t len);
        movd    (%rsi),%xmm0            // load pattern into low 4 bytes
        pshufd  $(0x00),%xmm0,%xmm0     // replicate the 4 bytes across the vector

// Align ptr if necessary.  We must rotate the pattern right for each byte we
// store while aligning the ptr.  Since there is no rotate instruction in SSE3,
// we have to synthesize the rotates.
//	%rdi = ptr
//	%rdx = length
//	%xmm0 = pattern

LAlignPtr:                              // NB: can drop down to here!
        cmpq    $100,%rdx               // long enough to bother aligning ptr?
        movq    %rdi,%rcx               // copy ptr
        jb      LReady                  // not long enough
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align ptr
        jz      LReady                  // already aligned
        subq    %rcx,%rdx               // adjust length

        test    $1,%cl                  // 1-byte store required?
        movd    %xmm0,%eax              // get 4 low bytes in %eax
        jz      2f                      // no
        movdqa  %xmm0,%xmm1             // copy pattern so we can shift in both directions
        movb    %al,(%rdi)              // pack in the low-order byte
        psrldq  $1,%xmm0                // shift pattern right 1 byte
        addq    $1,%rdi
        pslldq  $15,%xmm1               // shift pattern left 15 bytes
        shrl    $8,%eax                 // in case 2-byte store is required
        por     %xmm1,%xmm0             // complete right rotate of pattern by 1 byte
2:
        test    $2,%cl                  // 2-byte store required?
        jz      4f                      // no
        psrldq  $2,%xmm0                // shift pattern down 2 bytes
        movw    %ax,(%rdi)              // pack in next two bytes
        pinsrw  $7,%eax,%xmm0           // insert low word of %eax into high word of %xmm0
        addq    $2,%rdi                 // adjust ptr
4:
        test    $4,%cl                  // 4-byte store required?
        jz      8f                      // no
        movd    %xmm0,(%rdi)            // store low 4 bytes of %xmm0
        pshufd  $(0x39),%xmm0,%xmm0     // rotate %xmm0 right 4 bytes (mask == 00 11 10 01)
        addq    $4,%rdi                 // adjust ptr
8:
        test    $8,%cl                  // 8-byte store required?
        jz      LReady                  // no
        movq    %xmm0,(%rdi)            // store low 8 bytes of %xmm0
        pshufd  $(0x4e),%xmm0,%xmm0     // rotate %xmm0 right 8 bytes (mask == 01 00 11 10)
        addq    $8,%rdi                 // adjust ptr

// Ptr is aligned if practical, we're ready to call commpage to do the heavy lifting.

LReady:
        call    Lmemset_pattern         // call commpage to do the heavy lifting
        ret
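/*
 * Illustrative C model (not assembled; pattern16_model is a hypothetical
 * name).  It shows why the alignment code above must rotate the pattern:
 * after storing the first "skip" bytes so that the ptr becomes 16-byte
 * aligned, every subsequent 16-byte store has to begin "skip" bytes into
 * the pattern, which is what the psrldq/pslldq/pinsrw/pshufd sequence
 * accomplishes one bit of the count at a time.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void pattern16_model(void *b, const void *c16, size_t len)
 *	{
 *	    unsigned char *p = b;
 *	    unsigned char pat[16], rot[16];
 *	    memcpy(pat, c16, 16);
 *
 *	    size_t skip = (size_t)(-(uintptr_t)p & 15);  // bytes to reach 16-byte alignment
 *	    if (len >= 100 && skip != 0) {               // same threshold as LAlignPtr
 *	        for (size_t i = 0; i < skip; i++, len--)
 *	            *p++ = pat[i];                       // emit leading bytes of the pattern
 *	        for (size_t i = 0; i < 16; i++)
 *	            rot[i] = pat[(i + skip) % 16];       // rotate pattern right by "skip"
 *	        memcpy(pat, rot, 16);
 *	    }
 *	    for (; len >= 16; len -= 16, p += 16)
 *	        memcpy(p, pat, 16);                      // bulk 16-byte stores
 *	    for (size_t i = 0; len > 0; len--, i++)
 *	        *p++ = pat[i];                           // trailing partial pattern
 *	}
 */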
#define kLShort    63
#define kVeryLong  (1024*1024)

Lmemset_pattern:
        cmpq    $(kLShort),%rdx         // long enough to bother aligning?
        ja      LNotShort               // yes
        jmp     LShort                  // no

// Here for short operands or the end of long ones.
//	%rdx = length (<= kLShort)
//	%rdi = ptr (may not be aligned)
//	%xmm0 = pattern

LUnalignedStore16:
        movdqu  %xmm0,(%rdi)            // stuff in another 16 bytes
        subl    $16,%edx
        addq    $16,%rdi
LShort:
        cmpl    $16,%edx                // room for another vector?
        jge     LUnalignedStore16       // yes
LLessThan16:                            // here at end of copy with < 16 bytes remaining
        test    $8,%dl                  // 8-byte store required?
        jz      2f                      // no
        movq    %xmm0,(%rdi)            // pack in 8 low bytes
        psrldq  $8,%xmm0                // then shift vector down 8 bytes
        addq    $8,%rdi
2:
        test    $4,%dl                  // 4-byte store required?
        jz      3f                      // no
        movd    %xmm0,(%rdi)            // pack in 4 low bytes
        psrldq  $4,%xmm0                // then shift vector down 4 bytes
        addq    $4,%rdi
3:
        andl    $3,%edx                 // more to go?
        jz      5f                      // no
        movd    %xmm0,%eax              // move remainders out into %eax
4:                                      // loop on up to three bytes
        movb    %al,(%rdi)              // pack in next byte
        shrl    $8,%eax                 // shift next byte into position
        incq    %rdi
        dec     %edx
        jnz     4b
5:
        ret

// Long enough to justify aligning ptr.  Note that we have to rotate the
// pattern to account for any alignment.  We do this by doing two unaligned
// stores, and then an aligned load from the middle of the two stores.
// This will stall on store forwarding alignment mismatch, and the unaligned
// stores can be pretty slow too, but the alternatives aren't any better.
// Fortunately, in most cases our caller has already aligned the ptr.
//	%rdx = length (> kLShort)
//	%rdi = ptr (may not be aligned)
//	%xmm0 = pattern

LNotShort:
        movl    %edi,%ecx               // copy low bits of dest ptr
        negl    %ecx
        andl    $15,%ecx                // mask down to #bytes to 16-byte align
        jz      LAligned                // skip if already aligned
        movdqu  %xmm0,(%rdi)            // store 16 unaligned bytes
        movdqu  %xmm0,16(%rdi)          // and 16 more, to be sure we have an aligned chunk
        addq    %rcx,%rdi               // now point to the aligned chunk
        subq    %rcx,%rdx               // adjust remaining count
        movdqa  (%rdi),%xmm0            // get the rotated pattern (probably stalling)
        addq    $16,%rdi                // skip past the aligned chunk
        subq    $16,%rdx

// Set up for 64-byte loops.
//	%rdx = length remaining
//	%rdi = ptr (aligned)
//	%xmm0 = rotated pattern

LAligned:
        movq    %rdx,%rcx               // copy length remaining
        andl    $63,%edx                // mask down to residual length (0..63)
        andq    $-64,%rcx               // %rcx <- #bytes we will store in by-64 loop
        jz      LNoMoreChunks           // no 64-byte chunks
        addq    %rcx,%rdi               // increment ptr by length to move
        cmpq    $(kVeryLong),%rcx       // long enough to justify non-temporal stores?
        jge     LVeryLong               // yes
        negq    %rcx                    // negate length to move
        jmp     1f

// Loop over 64-byte chunks, storing into cache.

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm0,16(%rdi,%rcx)
        movdqa  %xmm0,32(%rdi,%rcx)
        movdqa  %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     1b
        jmp     LNoMoreChunks

// Very long operands: use non-temporal stores to bypass cache.

LVeryLong:
        negq    %rcx                    // negate length to move
        jmp     1f

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movntdq %xmm0,(%rdi,%rcx)
        movntdq %xmm0,16(%rdi,%rcx)
        movntdq %xmm0,32(%rdi,%rcx)
        movntdq %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     1b
        sfence                          // required by non-temporal stores
        jmp     LNoMoreChunks

// Handle leftovers: loop by 16.
//	%edx = length remaining (<64)
//	%edi = ptr (aligned)
//	%xmm0 = rotated pattern

LLoopBy16:
        movdqa  %xmm0,(%rdi)            // pack in 16 more bytes
        subl    $16,%edx                // decrement count
        addq    $16,%rdi                // increment ptr
LNoMoreChunks:
        cmpl    $16,%edx                // more to go?
        jge     LLoopBy16               // yes
        jmp     LLessThan16             // handle up to 15 remaining bytes
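/*
 * Illustrative C sketch of the 64-byte chunk strategy above, using SSE2
 * intrinsics (store_chunks_model and VERY_LONG are hypothetical names; the
 * cutoff mirrors kVeryLong).  It assumes, as LAligned guarantees, that dst
 * is 16-byte aligned and len is a multiple of 64.  Medium operands use
 * ordinary cached stores; very long ones use non-temporal stores to avoid
 * flushing the caches, followed by the sfence those stores require.
 *
 *	#include <emmintrin.h>               // _mm_store_si128, _mm_stream_si128, _mm_sfence
 *	#include <stddef.h>
 *
 *	#define VERY_LONG (1024 * 1024)
 *
 *	static void store_chunks_model(void *dst, __m128i pattern, size_t len)
 *	{
 *	    __m128i *p = (__m128i *)dst;
 *	    if (len >= VERY_LONG) {
 *	        for (size_t i = 0; i < len; i += 64, p += 4) {
 *	            _mm_stream_si128(p,     pattern);   // non-temporal: bypass the cache
 *	            _mm_stream_si128(p + 1, pattern);
 *	            _mm_stream_si128(p + 2, pattern);
 *	            _mm_stream_si128(p + 3, pattern);
 *	        }
 *	        _mm_sfence();                           // order the streaming stores
 *	    } else {
 *	        for (size_t i = 0; i < len; i += 64, p += 4) {
 *	            _mm_store_si128(p,     pattern);    // cached, aligned stores
 *	            _mm_store_si128(p + 1, pattern);
 *	            _mm_store_si128(p + 2, pattern);
 *	            _mm_store_si128(p + 3, pattern);
 *	        }
 *	    }
 *	}
 */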