/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 *
 * Register usage. Note the rather delicate way we assign multiple uses
 * to the same register. Beware.
 *   r0     = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3     = not used, as memcpy and memmove return 1st parameter as a value
 *   r4     = source ptr ("rs")
 *   r5     = count of bytes to move ("rc")
 *   r6     = "w1", "c16", or "cm17"
 *   r7     = "w2", "c32", or "cm33"
 *   r8     = "w3", "c48", or "cm49"
 *   r9     = "w4", or "cm1"
 *   r10    = vrsave ("rv")
 *   r11    = unused
 *   r12    = destination ptr ("rd")
 *   v0     = permute vector ("vp")
 *   v1-v8  = qw's loaded from source
 *   v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r10

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9

#define c16     r6
#define cm17    r6
#define c32     r7
#define cm33    r7
#define c48     r8
#define cm49    r8
#define cm1     r9

#define vp      v0
#define vw      v9
#define vx      v10
#define vy      v11
#define vz      v12

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
        .globl  EXT(bcopy_970)


#define kShort          64
#define kVeryLong       (128*1024)


// Main entry points.

        .align  5
bcopy_970:                              // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        mr      rd,r4                   // move registers to canonical spots
        mr      rs,r3
        blt     LShort                  // handle short operands
        dcbt    0,rs                    // touch in the first line of source
        dcbtst  0,rd                    // touch in destination
        b       LLong1                  // join long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

        .align  5
Lmemcpy_970:                            // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_970:                           // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge     LLong0                  // handle long operands

// Handle short operands.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

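// The direction test above leans on an unsigned-compare trick: computing
// (rd - rs) with wraparound and comparing it against the length catches
// exactly the overlap case where a forward copy would clobber unread
// source bytes (rd==rs also takes the reverse path, harmlessly). A hedged
// C sketch of the same test, with illustrative names:
/*
    #include <stddef.h>
    #include <stdint.h>

    void sketch_move(void *dst, const void *src, size_t len)
    {
        unsigned char *d = dst;
        const unsigned char *s = src;

        // Unsigned wraparound makes one compare do both range checks:
        // (d - s) < len catches s < d < s + len (and d == s, harmlessly).
        if ((uintptr_t)(d - s) < len) {
            while (len--)                   // overlap: copy descending
                d[len] = s[len];
        } else {
            for (size_t i = 0; i < len; i++)
                d[i] = s[i];                // safe: copy ascending
        }
    }
*/
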
LShort:
        cmplw   cr1,w1,rc               // set cr1 blt if we must move reverse
        mtcrf   0x02,rc                 // move length to cr6 and cr7 one at a time
        mtcrf   0x01,rc
        blt--   cr1,LShortReverse

// Forward short operands. This is the most frequent case, so it is inline.

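// The "bf 26/27/28/29/30" chain below keys off single bits of the length,
// which the two mtcrf instructions above copied into cr6/cr7: CR bit 26 of
// the 32-bit count selects a 32-byte chunk, bit 27 a 16-byte chunk, and so
// on down to the odd byte at bit 31. A hedged C rendering of that dispatch
// (masks stand in for the CR bit tests; names are illustrative):
/*
    #include <stddef.h>

    static void sketch_copy_short(unsigned char *d, const unsigned char *s,
                                  size_t len)   // len < 64
    {
        // Each set power-of-two bit of len moves exactly one chunk of that
        // size, so every size class runs at most once and the sum is len.
        for (size_t chunk = 32; chunk >= 1; chunk >>= 1) {
            if (len & chunk) {
                for (size_t i = 0; i < chunk; i++)  // ascending, like the
                    d[i] = s[i];                    // ld/std sequences below
                d += chunk;
                s += chunk;
            }
        }
    }
*/
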
        bf      26,0f                   // 32-byte chunk to move?
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
0:
LShort32:
        bf      27,1f                   // quadword to move?
        ld      w1,0(rs)
        ld      w3,8(rs)
        addi    rs,rs,16
        std     w1,0(rd)
        std     w3,8(rd)
        addi    rd,rd,16
1:
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f                   // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31                      // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr


// Handle short reverse operands.
//      cr = length in bits 26-31

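// The reverse path below runs the same bit-of-the-count dispatch from the
// high addresses downward, and the update forms (ldu/stdu etc.) fold the
// pointer decrement into the access itself. A hedged C sketch of that
// predecrement pattern, names illustrative:
/*
    #include <stddef.h>

    static void sketch_copy_short_reverse(unsigned char *d,
                                          const unsigned char *s,
                                          size_t len)   // len < 64
    {
        d += len;                   // point past the ends, as the
        s += len;                   // add rs,rs,rc / add rd,rd,rc below do
        for (size_t chunk = 32; chunk >= 1; chunk >>= 1) {
            if (len & chunk) {
                d -= chunk;         // "update" form: step first,
                s -= chunk;         // then access
                for (size_t i = chunk; i-- > 0; )
                    d[i] = s[i];    // descending keeps overlap safe
            }
        }
    }
*/
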
LShortReverse:
        add     rs,rs,rc                // adjust ptrs for reverse move
        add     rd,rd,rc
        bf      26,0f                   // 32 bytes to move?
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ldu     w4,-32(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        stdu    w4,-32(rd)
0:
        bf      27,1f                   // quadword to move?
        ld      w1,-8(rs)
        ldu     w2,-16(rs)
        std     w1,-8(rd)
        stdu    w2,-16(rd)
1:
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        ldu     w1,-8(rs)
        stdu    w1,-8(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f                   // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
        stb     w1,-1(rd)
        blr


// Long operands, use Altivec in most cases.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LLong0:                                 // entry from memmove()
        dcbt    0,rs                    // touch in source
        dcbtst  0,rd                    // touch in destination
LLong1:                                 // entry from bcopy() with operands already touched in
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  w2,w1,0,0xF             // 16-byte aligned? (w2==0 if so)
        andi.   w4,w3,0xF               // w4 <- #bytes to 16-byte align destination
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        blt--   cr1,LLongReverse        // handle reverse moves
        sub     rc,rc,w4                // adjust length for aligning destination
        srwi    r0,rc,7                 // get #cache lines to copy (may be 0)
        cmpwi   cr1,r0,0                // set cr1 on #chunks
        beq     LFwdAligned             // dest is already aligned

// 16-byte align destination.

        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest (nonzero)
        bf      31,1f                   // byte to move?
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
1:
        bf      30,2f                   // halfword?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      28,LFwdAligned          // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8


// Forward, destination is 16-byte aligned. There are five cases:
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls out all the stops. This is the fastest
//     case for cold-cache operands, as any this long will likely be.
//  2. If length>=128 and source is 16-byte aligned, then use the
//     lvx/stvx loop over 128-byte chunks. This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//  3. If length>=128 and source is not 16-byte aligned, then use the
//     lvx/vperm/stvx loop over 128-byte chunks.
//  4. If length<128 and source is 8-byte aligned, then use the
//     ld/std loop over 32-byte chunks.
//  5. If length<128 and source is not 8-byte aligned, then use the
//     lvx/vperm/stvx loop over 32-byte chunks. This is the slowest case.
// (A C sketch of this five-way dispatch appears just below.)
// Registers at this point:
//      r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//      rs = alignment unknown
//      rd = 16-byte aligned
//      rc = bytes remaining
//      w2 = low 4 bits of (rd-rs), used to check alignment
//      cr5 = beq if source is also 16-byte aligned

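// A hedged C sketch of the dispatch above, with hypothetical helper names
// standing in for the labeled paths in this file (kVeryLongC mirrors
// kVeryLong); illustration only, not the generated code:
/*
    #include <stddef.h>
    #include <stdint.h>

    // Hypothetical stand-ins for _COMM_PAGE_BIGCOPY, LFwdLongAligned,
    // LFwdLongUnaligned, LFwdMedAligned, and the short vperm loop.
    void bigcopy(unsigned char *d, const unsigned char *s, size_t n);
    void copy_lvx_stvx(unsigned char *d, const unsigned char *s, size_t n);
    void copy_lvx_vperm(unsigned char *d, const unsigned char *s, size_t n);
    void copy_ld_std(unsigned char *d, const unsigned char *s, size_t n);
    void copy_vperm_short(unsigned char *d, const unsigned char *s, size_t n);

    enum { kVeryLongC = 128 * 1024 };

    void sketch_dispatch(unsigned char *d, const unsigned char *s, size_t len)
    {
        // d is 16-byte aligned here, as rd is; the low bits of (d - s)
        // are the relative alignment that w2 holds.
        int src16 = (((uintptr_t)(d - s)) & 0xF) == 0;
        int src8  = (((uintptr_t)(d - s)) & 0x7) == 0;

        if (len >= kVeryLongC)
            bigcopy(d, s, len);             // case 1: commpage "bigcopy"
        else if (len >= 128 && src16)
            copy_lvx_stvx(d, s, len);       // case 2: aligned vector loop
        else if (len >= 128)
            copy_lvx_vperm(d, s, len);      // case 3: permuted vector loop
        else if (src8)
            copy_ld_std(d, s, len);         // case 4: scalar doubleword loop
        else
            copy_vperm_short(d, s, len);    // case 5: permuted 32-byte loop
    }
*/
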
LFwdAligned:
        andi.   w3,w2,7                 // is source at least 8-byte aligned?
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        bne     cr1,LFwdLongVectors     // at least one 128-byte chunk, so use vectors
        srwi    w1,rc,5                 // get 32-byte chunk count
        mtcrf   0x02,rc                 // move bit 27 of length to cr6 for LShort32
        mtctr   w1                      // set up 32-byte loop (w1!=0)
        beq     LFwdMedAligned          // source is 8-byte aligned, so use ld/std loop
        mfspr   rv,vrsave               // get bitmap of live vector registers
        oris    w4,rv,0xFFF8            // we use v0-v12
        li      c16,16                  // get constant used in lvx
        li      c32,32
        mtspr   vrsave,w4               // update mask
        lvx     v1,0,rs                 // prefetch 1st source quadword
        lvsl    vp,0,rs                 // get permute vector to shift left


// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.

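// The trick in this loop: lvsl derives a permute mask from the low bits of
// the source address, and each pair of aligned lvx loads is blended with
// vperm so that every stvx store hits an aligned quadword. A hedged C
// intrinsics sketch of the 32-byte step (assumes <altivec.h> and an
// AltiVec-enabled compiler; names are illustrative):
/*
    #include <altivec.h>
    #include <stddef.h>

    // d must be 16-byte aligned; s may have any alignment.
    static void sketch_vperm_copy32(unsigned char *d, const unsigned char *s,
                                    size_t nchunks)  // number of 32-byte chunks
    {
        vector unsigned char vp = vec_lvsl(0, s);   // shift-left permute mask
        vector unsigned char v1 = vec_ld(0, s);     // look-ahead quadword
        while (nchunks--) {
            vector unsigned char v2 = vec_ld(16, s);
            vector unsigned char v3 = vec_ld(32, s);
            s += 32;
            vec_st(vec_perm(v1, v2, vp), 0, d);     // blend across the seam
            vec_st(vec_perm(v2, v3, vp), 16, d);
            d += 32;
            v1 = v3;                                // carry look-ahead QW forward
        }
    }
*/
// Like the assembly, the final vec_ld may touch the aligned quadword just
// past the source tail; lvx can never cross a page boundary doing so.
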
1:                                      // loop over 32-byte chunks
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        addi    rs,rs,32
        vperm   vx,v1,v2,vp
        vperm   vy,v2,v3,vp
        vor     v1,v3,v3                // v1 <- v3
        stvx    vx,0,rd
        stvx    vy,c16,rd
        addi    rd,rd,32
        bdnz    1b

        mtspr   vrsave,rv               // restore bitmap of live vr's
        b       LShort32


// Fewer than 128 bytes and doubleword aligned: use ld/std.

        .align  5
LFwdMedAligned:                         // loop over 32-byte chunks
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
        bdnz    LFwdMedAligned

        b       LShort32


// Forward, 128 bytes or more: use vectors. When entered:
//      r0 = 128-byte chunks to move (>0)
//      rd = 16-byte aligned
//      cr5 = beq if source is 16-byte aligned
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//      ctr = number of 128-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
//      c16,c32,c48 = loaded

LFwdLongVectors:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        lis     w3,kVeryLong>>16        // cutoff for very-long-operand special case path
        cmplw   cr1,rc,w3               // very long operand?
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        bgea--  cr1,_COMM_PAGE_BIGCOPY  // handle big copies separately
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w4,rv,0xFFF8            // we use v0-v12
        rlwinm. r0,rc,28,29,31          // get number of quadword leftovers (0-7) and set cr0
        li      c16,16                  // get constants used in lvx/stvx
        mtspr   vrsave,w4               // update mask
        li      c32,32
        li      c48,48
        beq     cr5,LFwdLongAligned     // source is also 16-byte aligned, no need for vperm
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
        b       LFwdLongUnaligned


// Forward, long, unaligned vector loop.

        .align  5                       // align inner loops
LFwdLongUnaligned:                      // loop over 128-byte chunks
        addi    w4,rs,64
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        lvx     v5,0,w4
        lvx     v6,c16,w4
        vperm   vw,v1,v2,vp
        lvx     v7,c32,w4
        lvx     v8,c48,w4
        addi    rs,rs,128
        vperm   vx,v2,v3,vp
        addi    w4,rd,64
        lvx     v1,0,rs
        stvx    vw,0,rd
        vperm   vy,v3,v4,vp
        stvx    vx,c16,rd
        vperm   vz,v4,v5,vp
        stvx    vy,c32,rd
        vperm   vw,v5,v6,vp
        stvx    vz,c48,rd
        vperm   vx,v6,v7,vp
        addi    rd,rd,128
        stvx    vw,0,w4
        vperm   vy,v7,v8,vp
        stvx    vx,c16,w4
        vperm   vz,v8,v1,vp
        stvx    vy,c32,w4
        stvx    vz,c48,w4
        bdnz    LFwdLongUnaligned

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords
        lvx     v2,c16,rs
        addi    rs,rs,16
        vperm   vx,v1,v2,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Forward, long, 16-byte aligned vector loop.

        .align  5
LFwdLongAligned:                        // loop over 128-byte chunks
        addi    w4,rs,64
        lvx     v1,0,rs
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        lvx     v5,0,w4
        lvx     v6,c16,w4
        lvx     v7,c32,w4
        lvx     v8,c48,w4
        addi    rs,rs,128
        addi    w4,rd,64
        stvx    v1,0,rd
        stvx    v2,c16,rd
        stvx    v3,c32,rd
        stvx    v4,c48,rd
        stvx    v5,0,w4
        stvx    v6,c16,w4
        stvx    v7,c32,w4
        stvx    v8,c48,w4
        addi    rd,rd,128
        bdnz    LFwdLongAligned

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords (1-7)
        lvx     v1,0,rs
        addi    rs,rs,16
        stvx    v1,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Long, reverse moves.
//      rs = source
//      rd = destination
//      rc = count
//      cr5 = beq if relatively 16-byte aligned

LLongReverse:
        add     rd,rd,rc                // point to end of operands
        add     rs,rs,rc
        andi.   r0,rd,0xF               // #bytes to 16-byte align destination
        beq     2f                      // already aligned

// 16-byte align destination.

        mtctr   r0                      // set up for loop
        sub     rc,rc,r0
1:
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
        bdnz    1b

// Prepare for reverse vector loop. When entered:
//      rd = 16-byte aligned
//      cr5 = beq if source also 16-byte aligned
// We set up many registers:
//      ctr/cr1 = number of 64-byte chunks to move (may be 0)
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      cm1 = -1
//      rv = original value of vrsave

2:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        srwi    r0,rc,6                 // get count of 64-byte chunks to move (may be 0)
        oris    w1,rv,0xFFF8            // we use v0-v12
        mtcrf   0x01,rc                 // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3 too
        cmpwi   cr1,r0,0                // set cr1 on chunk count
        mtspr   vrsave,w1               // update mask
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        li      cm1,-1                  // get constants used in lvx/stvx

        bne     cr5,LReverseVecUnal     // handle unaligned operands
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        li      cm17,-17
        li      cm33,-33
        li      cm49,-49
        b       1f

// Long, reverse 16-byte-aligned vector loop.

        .align  5                       // align inner loops
1:                                      // loop over 64-byte chunks
        lvx     v1,cm1,rs
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        stvx    v1,cm1,rd
        stvx    v2,cm17,rd
        stvx    v3,cm33,rd
        stvx    v4,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, cr7=(rc & 0xF), cr6 set on cr7
        mtctr   r0
3:                                      // loop over remaining quadwords (1-3)
        lvx     v1,cm1,rs
        subi    rs,rs,16
        stvx    v1,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr


// Long, reverse, unaligned vector loop.
//      ctr/cr1 = number of 64-byte chunks to move (may be 0)
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
//      cm1 = -1

LReverseVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
        li      cm17,-17
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        li      cm33,-33
        li      cm49,-49
        b       1f

        .align  5                       // align the inner loops
1:                                      // loop over 64-byte chunks
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        vperm   vx,v2,v1,vp
        lvx     v1,cm1,rs
        vperm   vy,v3,v2,vp
        stvx    vx,cm1,rd
        vperm   vz,v4,v3,vp
        stvx    vy,cm17,rd
        vperm   vx,v1,v4,vp
        stvx    vz,cm33,rd
        stvx    vx,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & 0xF), cr6 set on cr7
        mtctr   r0
3:                                      // loop over 1-3 quadwords
        lvx     v2,cm17,rs
        subi    rs,rs,16
        vperm   vx,v2,v1,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any


        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0,kCommPageMTCRF)