/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the
 * License may not be used to create, or enable the creation or
 * redistribution of, unlawful or unlicensed copies of an Apple operating
 * system, or to circumvent, violate, or enable the circumvention or
 * violation of, any terms of an Apple operating system software license
 * agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 * Register usage.  Note the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *   r0  = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r9  = "w4", or "cm1"
 *   r10 = vrsave ("rv")
 *   r11 = unused
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 *   v1-v8  = qw's loaded from source
 *   v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r10

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9

#define c16     r6
#define cm17    r6
#define c32     r7
#define cm33    r7
#define c48     r8
#define cm49    r8
#define cm1     r9

#define vp      v0
#define vw      v9
#define vx      v10
#define vy      v11
#define vz      v12

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage.  This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done.  For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas.  It is perhaps surprising how well in practice
 * this simple method works.
 */
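
/* Illustration of the port (added comment; mnemonic spellings are for
 * exposition only, the kernel performs the rewrite on the binary):
 *
 *      32-bit commpage version         64-bit commpage version
 *      cmpwi   cr1,r0,0        --->    cmpdi   cr1,r0,0
 *      srwi    r0,rc,7         --->    srdi    r0,rc,7
 *
 * "andi." and "rlwinm." with masks confined to the low 32 bits are safe in
 * either mode because the bits they produce and test are identical in both.
 */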

#define kShort          64
#define kVeryLong       (128*1024)


// Main entry points.

        .align  5
bcopy_970:                              // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        mr      rd,r4                   // move registers to canonical spot
        mr      rs,r3
        blt     LShort                  // handle short operands
        dcbt    0,rs                    // touch in the first line of source
        dcbtst  0,rd                    // touch in destination
        b       LLong1                  // join long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

        .align  5
Lmemcpy_970:                            // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_970:                           // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge     LLong0                  // handle long operands

// Handle short operands.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

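/* The reverse-move test relies on unsigned wraparound (added sketch,
 * illustrative only).  w1 = rd-rs was computed at entry; LShort below
 * compares it, unsigned, against the length.  In C:
 *
 *      #include <stddef.h>
 *      #include <stdint.h>
 *
 *      // Forward copying is unsafe only when dst lies in [src, src+len).
 *      // An unsigned (dst - src) makes dst < src wrap to a huge value,
 *      // so a single compare handles both directions.
 *      static int must_move_reverse(const char *src, const char *dst, size_t len) {
 *          return (uintptr_t)(dst - src) < (uintptr_t)len;
 *      }
 */
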
LShort:
        cmplw   cr1,w1,rc               // set cr1 blt if we must move reverse
        mtcrf   0x02,rc                 // move length to cr6 and cr7 one at a time
        mtcrf   0x01,rc
        blt--   cr1,LShortReverse

// Forward short operands.  This is the most frequent case, so it is inline.

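/* How the bit tests below work (added note): the two mtcrf's above copied the
 * low 6 bits of rc into CR fields 6 and 7, so CR bits 26..31 hold the 32s,
 * 16s, 8s, 4s, 2s, and 1s digits of the length, and each "bf n,label" skips
 * one power-of-two-sized move.  Roughly, in C (helper names hypothetical):
 *
 *      if (len & 32) move32();
 *      if (len & 16) move16();
 *      if (len & 8)  move8();
 *      if (len & 4)  move4();
 *      if (len & 2)  move2();
 *      if (len & 1)  move1();
 */
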
        bf      26,0f                   // 32-byte chunk to move?
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
0:
LShort32:
        bf      27,1f                   // quadword to move?
        ld      w1,0(rs)
        ld      w3,8(rs)
        addi    rs,rs,16
        std     w1,0(rd)
        std     w3,8(rd)
        addi    rd,rd,16
1:
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f                   // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31                      // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr


// Handle short reverse operands.
//      cr = length in bits 26-31

LShortReverse:
        add     rs,rs,rc                // adjust ptrs for reverse move
        add     rd,rd,rc
        bf      26,0f                   // 32 bytes to move?
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ldu     w4,-32(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        stdu    w4,-32(rd)
0:
        bf      27,1f                   // quadword to move?
        ld      w1,-8(rs)
        ldu     w2,-16(rs)
        std     w1,-8(rd)
        stdu    w2,-16(rd)
1:
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        ldu     w1,-8(rs)
        stdu    w1,-8(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f                   // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
        stb     w1,-1(rd)
        blr


// Long operands, use Altivec in most cases.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

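/* Alignment arithmetic sketch (added, illustrative only).  LLong1 computes
 * the byte count needed to 16-byte align the destination with
 * "neg w3,rd; andi. w4,w3,0xF", and tests relative alignment with
 * "rlwinm w2,w1,0,0xF".  In C:
 *
 *      unsigned align = (unsigned)(-(uintptr_t)rd) & 0xF;    // 0..15 bytes
 *      int      relal = (((uintptr_t)rd - (uintptr_t)rs) & 0xF) == 0;
 */
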
LLong0:                                 // entry from memmove()
        dcbt    0,rs                    // touch in source
        dcbtst  0,rd                    // touch in destination
LLong1:                                 // entry from bcopy() with operands already touched in
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  w2,w1,0,0xF             // 16-byte aligned? (w2==0 if so)
        andi.   w4,w3,0xF               // w4 <- #bytes to 16-byte align destination
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        blt--   cr1,LLongReverse        // handle reverse moves
        sub     rc,rc,w4                // adjust length for aligning destination
        srwi    r0,rc,7                 // get #cache lines to copy (may be 0)
        cmpwi   cr1,r0,0                // set cr1 on #chunks
        beq     LFwdAligned             // dest is already aligned

// 16-byte align destination.

        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest (nonzero)
        bf      31,1f                   // byte to move?
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
1:
        bf      30,2f                   // halfword?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      28,LFwdAligned          // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8


// Forward, destination is 16-byte aligned.  There are five cases (a C sketch
// of this dispatch follows the register summary below):
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls out all the stops.  This is the fastest
//     case for cold-cache operands, as any this long will likely be.
//  2. If length>=128 and source is 16-byte aligned, then use the
//     lvx/stvx loop over 128-byte chunks.  This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//  3. If length>=128 and source is not 16-byte aligned, then use the
//     lvx/vperm/stvx loop over 128-byte chunks.
//  4. If length<128 and source is 8-byte aligned, then use the
//     ld/std loop over 32-byte chunks.
//  5. If length<128 and source is not 8-byte aligned, then use the
//     lvx/vperm/stvx loop over 32-byte chunks.  This is the slowest case.
// Registers at this point:
//      r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//          rs = alignment unknown
//          rd = 16-byte aligned
//          rc = bytes remaining
//          w2 = low 4 bits of (rd-rs), used to check alignment
//         cr5 = beq if source is also 16-byte aligned

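/* Dispatch sketch in C (added for clarity, illustrative only; helper names
 * are hypothetical):
 *
 *      if (len >= kVeryLong)                                       // case 1
 *          bigcopy();
 *      else if (len >= 128)                                        // cases 2,3
 *          src_16B_aligned ? lvx_stvx_loop() : lvx_vperm_stvx_loop();
 *      else                                                        // cases 4,5
 *          src_8B_aligned  ? ld_std_loop()   : lvx_vperm_stvx_small();
 */
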
LFwdAligned:
        andi.   w3,w2,7                 // is source at least 8-byte aligned?
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        bne     cr1,LFwdLongVectors     // at least one 128-byte chunk, so use vectors
        srwi    w1,rc,5                 // get 32-byte chunk count
        mtcrf   0x02,rc                 // move bit 27 of length to cr6 for LShort32
        mtctr   w1                      // set up 32-byte loop (w1!=0)
        beq     LFwdMedAligned          // source is 8-byte aligned, so use ld/std loop
        mfspr   rv,vrsave               // get bitmap of live vector registers
        oris    w4,rv,0xFFF8            // we use v0-v12
        li      c16,16                  // get constant used in lvx
        li      c32,32
        mtspr   vrsave,w4               // update mask
        lvx     v1,0,rs                 // prefetch 1st source quadword
        lvsl    vp,0,rs                 // get permute vector to shift left


// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.

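/* AltiVec C equivalent of the loop below (added sketch, illustrative only;
 * assumes <altivec.h>, n a multiple of 32, and the same one-quadword
 * look-ahead past the source that lvx performs):
 *
 *      vector unsigned char vp = vec_lvsl(0, src);    // left-shift permute
 *      vector unsigned char v1 = vec_ld(0, src), v2, v3;
 *      while (n >= 32) {
 *          v2 = vec_ld(16, src);
 *          v3 = vec_ld(32, src);
 *          src += 32;  n -= 32;
 *          vec_st(vec_perm(v1, v2, vp),  0, dst);
 *          vec_st(vec_perm(v2, v3, vp), 16, dst);
 *          v1 = v3;
 *          dst += 32;
 *      }
 */
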
1:                                      // loop over 32-byte chunks
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        addi    rs,rs,32
        vperm   vx,v1,v2,vp
        vperm   vy,v2,v3,vp
        vor     v1,v3,v3                // v1 <- v3
        stvx    vx,0,rd
        stvx    vy,c16,rd
        addi    rd,rd,32
        bdnz    1b

        mtspr   vrsave,rv               // restore bitmap of live vr's
        b       LShort32


// Fewer than 128 bytes and doubleword aligned: use ld/std.

        .align  5
LFwdMedAligned:                         // loop over 32-byte chunks
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
        bdnz    LFwdMedAligned

        b       LShort32


// Forward, 128 bytes or more: use vectors.  When entered:
//      r0 = 128-byte chunks to move (>0)
//      rd = 16-byte aligned
//      cr5 = beq if source is 16-byte aligned
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//      ctr = number of 128-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
//      c16,c32,c48 = loaded

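/* Field-extraction sketch (added, illustrative only): the rlwinm's below are
 * rotate-and-mask operations.  In C:
 *
 *      w3 = rc & 0xF;          // rlwinm  w3,rc,0,28,31  -- last 0-15 bytes
 *      r0 = (rc >> 4) & 7;     // rlwinm. r0,rc,28,29,31 -- leftover QWs
 *
 * Both masks lie in the low 32 bits, so the results are mode-invariant per
 * the porting rules at the top of this file.
 */
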
LFwdLongVectors:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        lis     w3,kVeryLong>>16        // cutoff for very-long-operand special case path
        cmplw   cr1,rc,w3               // very long operand?
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        bge--   cr1,LBigCopy            // handle big copies separately
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w4,rv,0xFFF8            // we use v0-v12
        rlwinm. r0,rc,28,29,31          // get number of quadword leftovers (0-7) and set cr0
        li      c16,16                  // get constants used in lvx/stvx
        mtspr   vrsave,w4               // update mask
        li      c32,32
        li      c48,48
        beq     cr5,LFwdLongAligned     // source is also 16-byte aligned, no need for vperm
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
        b       LFwdLongUnaligned


// Forward, long, unaligned vector loop.

        .align  5                       // align inner loops
LFwdLongUnaligned:                      // loop over 128-byte chunks
        addi    w4,rs,64
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        lvx     v5,0,w4
        lvx     v6,c16,w4
        vperm   vw,v1,v2,vp
        lvx     v7,c32,w4
        lvx     v8,c48,w4
        addi    rs,rs,128
        vperm   vx,v2,v3,vp
        addi    w4,rd,64
        lvx     v1,0,rs
        stvx    vw,0,rd
        vperm   vy,v3,v4,vp
        stvx    vx,c16,rd
        vperm   vz,v4,v5,vp
        stvx    vy,c32,rd
        vperm   vw,v5,v6,vp
        stvx    vz,c48,rd
        vperm   vx,v6,v7,vp
        addi    rd,rd,128
        stvx    vw,0,w4
        vperm   vy,v7,v8,vp
        stvx    vx,c16,w4
        vperm   vz,v8,v1,vp
        stvx    vy,c32,w4
        stvx    vz,c48,w4
        bdnz    LFwdLongUnaligned

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords
        lvx     v2,c16,rs
        addi    rs,rs,16
        vperm   vx,v1,v2,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Forward, long, 16-byte aligned vector loop.

        .align  5
LFwdLongAligned:                        // loop over 128-byte chunks
        addi    w4,rs,64
        lvx     v1,0,rs
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        lvx     v5,0,w4
        lvx     v6,c16,w4
        lvx     v7,c32,w4
        lvx     v8,c48,w4
        addi    rs,rs,128
        addi    w4,rd,64
        stvx    v1,0,rd
        stvx    v2,c16,rd
        stvx    v3,c32,rd
        stvx    v4,c48,rd
        stvx    v5,0,w4
        stvx    v6,c16,w4
        stvx    v7,c32,w4
        stvx    v8,c48,w4
        addi    rd,rd,128
        bdnz    LFwdLongAligned

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords (1-7)
        lvx     v1,0,rs
        addi    rs,rs,16
        stvx    v1,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Long, reverse moves.
//      rs = source
//      rd = destination
//      rc = count
//      cr5 = beq if relatively 16-byte aligned

LLongReverse:
        add     rd,rd,rc                // point to end of operands
        add     rs,rs,rc
        andi.   r0,rd,0xF               // #bytes to 16-byte align destination
        beq     2f                      // already aligned

// 16-byte align destination.

        mtctr   r0                      // set up for loop
        sub     rc,rc,r0
1:
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
        bdnz    1b

// Prepare for reverse vector loop.  When entered:
//      rd = 16-byte aligned
//      cr5 = beq if source also 16-byte aligned
// We set up many registers:
//      ctr/cr1 = number of 64-byte chunks to move (may be 0)
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      cm1 = -1
//      rv = original value of vrsave

2:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        srwi    r0,rc,6                 // get count of 64-byte chunks to move (may be 0)
        oris    w1,rv,0xFFF8            // we use v0-v12
        mtcrf   0x01,rc                 // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3 too
        cmpwi   cr1,r0,0                // set cr1 on chunk count
        mtspr   vrsave,w1               // update mask
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        li      cm1,-1                  // get constants used in lvx/stvx

        bne     cr5,LReverseVecUnal     // handle unaligned operands
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        li      cm17,-17
        li      cm33,-33
        li      cm49,-49
        b       1f


// Long, reverse 16-byte-aligned vector loop.

        .align  5                       // align inner loops
1:                                      // loop over 64-byte chunks
        lvx     v1,cm1,rs
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        stvx    v1,cm1,rd
        stvx    v2,cm17,rd
        stvx    v3,cm33,rd
        stvx    v4,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, cr7=(rc & F), cr6 set on cr7
        mtctr   r0
3:                                      // loop over remaining quadwords (1-3)
        lvx     v1,cm1,rs
        subi    rs,rs,16
        stvx    v1,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr


// Long, reverse, unaligned vector loop.
//      ctr/cr1 = number of 64-byte chunks to move (may be 0)
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
//      cm1 = -1

LReverseVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
        li      cm17,-17
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        li      cm33,-33
        li      cm49,-49
        b       1f

        .align  5                       // align the inner loops
1:                                      // loop over 64-byte chunks
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        vperm   vx,v2,v1,vp
        lvx     v1,cm1,rs
        vperm   vy,v3,v2,vp
        stvx    vx,cm1,rd
        vperm   vz,v4,v3,vp
        stvx    vy,cm17,rd
        vperm   vx,v1,v4,vp
        stvx    vz,cm33,rd
        stvx    vx,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & F), cr6 set on cr7
        mtctr   r0
3:                                      // loop over 1-3 quadwords
        lvx     v2,cm17,rs
        subi    rs,rs,16
        vperm   vx,v2,v1,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr



// Very Big Copy Path.  Save our return address in the stack for help decoding backtraces.
// The conditions bigcopy expects are:
//      r0 = return address (also stored in caller's SF)
//      r4 = source ptr
//      r5 = length (at least several pages)
//      r12 = dest ptr

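/* Mode-detection note (added): LBigCopy loads r2 = 0x40000000 and doubles it
 * with "add. r2,r2,r2".  In 32-bit mode cr0 is set from the low 32 bits of
 * the result, and 0x80000000 is negative there, so cr0_lt is set and the
 * "blta" is taken; in 64-bit mode the result 0x0000000080000000 is positive
 * and control falls through to the 64-bit save and branch.
 */
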
LBigCopy:
        lis     r2,0x4000               // r2 <- 0x40000000
        mflr    r0                      // get our return address
        add.    r2,r2,r2                // set cr0_lt if running in 32-bit mode
        stw     r0,8(r1)                // save return, assuming 32-bit mode ("crsave" if 64-bit mode)
        blta    _COMM_PAGE_BIGCOPY      // 32-bit mode, join big operand copy
        std     r0,16(r1)               // save return in correct spot for 64-bit mode
        ba      _COMM_PAGE_BIGCOPY      // then join big operand code


        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0, \
                kCommPageMTCRF+kCommPageBoth+kPort32to64)