/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 * Register usage. Note the rather delicate way we assign multiple uses
 * to the same register. Beware.
 *   r0  = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r9  = "w4", or "cm1"
 *   r10 = vrsave ("rv")
 *   r11 = unused
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 *   v1-v8  = qw's loaded from source
 *   v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r10

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9

#define c16     r6
#define cm17    r6
#define c32     r7
#define cm33    r7
#define c48     r8
#define cm49    r8
#define cm1     r9

#define vp      v0
#define vw      v9
#define vx      v10
#define vy      v11
#define vz      v12

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done. For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas. It is perhaps surprising how well in practice
 * this simple method works.
 */
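
// As an illustration only (not part of the build): under the porting model
// above, a 32-bit sequence such as
//      cmplwi  rc,kShort               // word compare
//      srwi    r0,rc,7                 // word shift right
// is mechanically rewritten for the 64-bit commpage as
//      cmpldi  rc,kShort               // doubleword compare
//      srdi    r0,rc,7                 // doubleword shift right
// with every other instruction left untouched.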

#define kShort          64
#define kVeryLong       (128*1024)
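
// kShort is the cutoff (in bytes) below which operands are handled entirely by
// the scalar LShort/LShortReverse code. kVeryLong is the cutoff at or above
// which forward copies leave this routine and branch to the separate
// "bigcopy" commpage routine (see LBigCopy below).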

// Main entry points.

        .align  5
bcopy_970:                              // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        mr      rd,r4                   // move registers to canonic spot
        mr      rs,r3
        blt     LShort                  // handle short operands
        dcbt    0,rs                    // touch in the first line of source
        dcbtst  0,rd                    // touch in destination
        b       LLong1                  // join long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.
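// (Note: bcopy_970 above starts on a 32-byte boundary and is exactly eight
// instructions, ie 32 bytes, so the ".align 5" below adds no padding and
// Lmemcpy_970 lands at the required 8-word offset.)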

        .align  5
Lmemcpy_970:                            // void* memcpy(void *dst, void *src, size_t len)
Lmemmove_970:                           // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge     LLong0                  // handle long operands

// Handle short operands.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LShort:
        cmplw   cr1,w1,rc               // set cr1 blt if we must move reverse
        mtcrf   0x02,rc                 // move length to cr6 and cr7 one at a time
        mtcrf   0x01,rc
        blt--   cr1,LShortReverse

// Forward short operands. This is the most frequent case, so it is inline.

        bf      26,0f                   // 32-byte chunk to move?
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
0:
LShort32:
        bf      27,1f                   // quadword to move?
        ld      w1,0(rs)
        ld      w3,8(rs)
        addi    rs,rs,16
        std     w1,0(rd)
        std     w3,8(rd)
        addi    rd,rd,16
1:
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f                   // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31                      // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr


// Handle short reverse operands.
//      cr = length in bits 26-31

LShortReverse:
        add     rs,rs,rc                // adjust ptrs for reverse move
        add     rd,rd,rc
        bf      26,0f                   // 32 bytes to move?
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ldu     w4,-32(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        stdu    w4,-32(rd)
0:
        bf      27,1f                   // quadword to move?
        ld      w1,-8(rs)
        ldu     w2,-16(rs)
        std     w1,-8(rd)
        stdu    w2,-16(rd)
1:
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        ldu     w1,-8(rs)
        stdu    w1,-8(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f                   // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
        stb     w1,-1(rd)
        blr


// Long operands, use Altivec in most cases.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LLong0:                                 // entry from memmove()
        dcbt    0,rs                    // touch in source
        dcbtst  0,rd                    // touch in destination
LLong1:                                 // entry from bcopy() with operands already touched in
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  w2,w1,0,0xF             // 16-byte aligned? (w2==0 if so)
        andi.   w4,w3,0xF               // w4 <- #bytes to 16-byte align destination
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        blt--   cr1,LLongReverse        // handle reverse moves
        sub     rc,rc,w4                // adjust length for aligning destination
        srwi    r0,rc,7                 // get #cache lines to copy (may be 0)
        cmpwi   cr1,r0,0                // set cr1 on #chunks
        beq     LFwdAligned             // dest is already aligned

// 16-byte align destination.

        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest (nonzero)
        bf      31,1f                   // byte to move?
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
1:
        bf      30,2f                   // halfword?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      28,LFwdAligned          // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8

// Forward, destination is 16-byte aligned. There are five cases:
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls out all the stops. This is the fastest
//     case for cold-cache operands, as any this long will likely be.
//  2. If length>=128 and source is 16-byte aligned, then use the
//     lvx/stvx loop over 128-byte chunks. This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//  3. If length>=128 and source is not 16-byte aligned, then use the
//     lvx/vperm/stvx loop over 128-byte chunks.
//  4. If length<128 and source is 8-byte aligned, then use the
//     ld/std loop over 32-byte chunks.
//  5. If length<128 and source is not 8-byte aligned, then use the
//     lvx/vperm/stvx loop over 32-byte chunks. This is the slowest case.
// Registers at this point:
//      r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//          rs = alignment unknown
//          rd = 16-byte aligned
//          rc = bytes remaining
//          w2 = low 4 bits of (rd-rs), used to check alignment
//         cr5 = beq if source is also 16-byte aligned

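// Illustrative pseudocode (comments only, not assembled) for how the five
// cases above map onto the branches that follow; "chunks" is the 128-byte
// chunk count in r0/cr1, and alignment is relative to the 16-byte-aligned
// destination:
//      if (chunks != 0)                // at LFwdLongVectors:
//          if (rc >= kVeryLong)        -> LBigCopy            (case 1)
//          else if (rel. 16-byte)      -> LFwdLongAligned     (case 2)
//          else                        -> LFwdLongUnaligned   (case 3)
//      else if (rel. 8-byte)           -> LFwdMedAligned      (case 4)
//      else                            -> 32-byte lvx/vperm/stvx loop (case 5)
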
LFwdAligned:
        andi.   w3,w2,7                 // is source at least 8-byte aligned?
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        bne     cr1,LFwdLongVectors     // at least one 128-byte chunk, so use vectors
        srwi    w1,rc,5                 // get 32-byte chunk count
        mtcrf   0x02,rc                 // move bit 27 of length to cr6 for LShort32
        mtctr   w1                      // set up 32-byte loop (w1!=0)
        beq     LFwdMedAligned          // source is 8-byte aligned, so use ld/std loop
        mfspr   rv,vrsave               // get bitmap of live vector registers
        oris    w4,rv,0xFFF8            // we use v0-v12
        li      c16,16                  // get constant used in lvx
        li      c32,32
        mtspr   vrsave,w4               // update mask
        lvx     v1,0,rs                 // prefetch 1st source quadword
        lvsl    vp,0,rs                 // get permute vector to shift left


// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.

1:                                      // loop over 32-byte chunks
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        addi    rs,rs,32
        vperm   vx,v1,v2,vp
        vperm   vy,v2,v3,vp
        vor     v1,v3,v3                // v1 <- v3
        stvx    vx,0,rd
        stvx    vy,c16,rd
        addi    rd,rd,32
        bdnz    1b

        mtspr   vrsave,rv               // restore bitmap of live vr's
        b       LShort32


// Fewer than 128 bytes and doubleword aligned: use ld/std.

        .align  5
LFwdMedAligned:                         // loop over 32-byte chunks
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
        bdnz    LFwdMedAligned

        b       LShort32

// Forward, 128 bytes or more: use vectors. When entered:
//      r0 = 128-byte chunks to move (>0)
//      rd = 16-byte aligned
//     cr5 = beq if source is 16-byte aligned
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//     ctr = number of 128-byte chunks to move
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
// c16,c32,c48 = loaded

LFwdLongVectors:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        lis     w3,kVeryLong>>16        // cutoff for very-long-operand special case path
        cmplw   cr1,rc,w3               // very long operand?
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        bge--   cr1,LBigCopy            // handle big copies separately
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w4,rv,0xFFF8            // we use v0-v12
        rlwinm. r0,rc,28,29,31          // get number of quadword leftovers (0-7) and set cr0
        li      c16,16                  // get constants used in lvx/stvx
        mtspr   vrsave,w4               // update mask
        li      c32,32
        li      c48,48
        beq     cr5,LFwdLongAligned     // source is also 16-byte aligned, no need for vperm
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
        b       LFwdLongUnaligned


// Forward, long, unaligned vector loop.

        .align  5                       // align inner loops
LFwdLongUnaligned:                      // loop over 128-byte chunks
        addi    w4,rs,64
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        lvx     v5,0,w4
        lvx     v6,c16,w4
        vperm   vw,v1,v2,vp
        lvx     v7,c32,w4
        lvx     v8,c48,w4
        addi    rs,rs,128
        vperm   vx,v2,v3,vp
        addi    w4,rd,64
        lvx     v1,0,rs
        stvx    vw,0,rd
        vperm   vy,v3,v4,vp
        stvx    vx,c16,rd
        vperm   vz,v4,v5,vp
        stvx    vy,c32,rd
        vperm   vw,v5,v6,vp
        stvx    vz,c48,rd
        vperm   vx,v6,v7,vp
        addi    rd,rd,128
        stvx    vw,0,w4
        vperm   vy,v7,v8,vp
        stvx    vx,c16,w4
        vperm   vz,v8,v1,vp
        stvx    vy,c32,w4
        stvx    vz,c48,w4
        bdnz    LFwdLongUnaligned

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords
        lvx     v2,c16,rs
        addi    rs,rs,16
        vperm   vx,v1,v2,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Forward, long, 16-byte aligned vector loop.

        .align  5
LFwdLongAligned:                        // loop over 128-byte chunks
        addi    w4,rs,64
        lvx     v1,0,rs
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        lvx     v5,0,w4
        lvx     v6,c16,w4
        lvx     v7,c32,w4
        lvx     v8,c48,w4
        addi    rs,rs,128
        addi    w4,rd,64
        stvx    v1,0,rd
        stvx    v2,c16,rd
        stvx    v3,c32,rd
        stvx    v4,c48,rd
        stvx    v5,0,w4
        stvx    v6,c16,w4
        stvx    v7,c32,w4
        stvx    v8,c48,w4
        addi    rd,rd,128
        bdnz    LFwdLongAligned

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords (1-7)
        lvx     v1,0,rs
        addi    rs,rs,16
        stvx    v1,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr

// Long, reverse moves.
//      rs = source
//      rd = destination
//      rc = count
//     cr5 = beq if relatively 16-byte aligned

LLongReverse:
        add     rd,rd,rc                // point to end of operands
        add     rs,rs,rc
        andi.   r0,rd,0xF               // #bytes to 16-byte align destination
        beq     2f                      // already aligned

// 16-byte align destination.

        mtctr   r0                      // set up for loop
        sub     rc,rc,r0
1:
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
        bdnz    1b

// Prepare for reverse vector loop. When entered:
//      rd = 16-byte aligned
//     cr5 = beq if source also 16-byte aligned
// We set up many registers:
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//     cm1 = -1
//      rv = original value of vrsave

2:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        srwi    r0,rc,6                 // get count of 64-byte chunks to move (may be 0)
        oris    w1,rv,0xFFF8            // we use v0-v12
        mtcrf   0x01,rc                 // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3 too
        cmpwi   cr1,r0,0                // set cr1 on chunk count
        mtspr   vrsave,w1               // update mask
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        li      cm1,-1                  // get constants used in lvx/stvx

        bne     cr5,LReverseVecUnal     // handle unaligned operands
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        li      cm17,-17
        li      cm33,-33
        li      cm49,-49
        b       1f

// Long, reverse 16-byte-aligned vector loop.

        .align  5                       // align inner loops
1:                                      // loop over 64-byte chunks
        lvx     v1,cm1,rs
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        stvx    v1,cm1,rd
        stvx    v2,cm17,rd
        stvx    v3,cm33,rd
        stvx    v4,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, cr7=(rc & F), cr6 set on cr7
        mtctr   r0
3:                                      // loop over remaining quadwords (1-3)
        lvx     v1,cm1,rs
        subi    rs,rs,16
        stvx    v1,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr

// Long, reverse, unaligned vector loop.
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
//     cm1 = -1

LReverseVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
        li      cm17,-17
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        li      cm33,-33
        li      cm49,-49
        b       1f

        .align  5                       // align the inner loops
1:                                      // loop over 64-byte chunks
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        vperm   vx,v2,v1,vp
        lvx     v1,cm1,rs
        vperm   vy,v3,v2,vp
        stvx    vx,cm1,rd
        vperm   vz,v4,v3,vp
        stvx    vy,cm17,rd
        vperm   vx,v1,v4,vp
        stvx    vz,cm33,rd
        stvx    vx,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & F), cr6 set on cr7
        mtctr   r0
3:                                      // loop over 1-3 quadwords
        lvx     v2,cm17,rs
        subi    rs,rs,16
        vperm   vx,v2,v1,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr


// Very Big Copy Path. Save our return address in the stack for help decoding backtraces.
// The conditions bigcopy expects are:
//      r0 = return address (also stored in caller's SF)
//      r4 = source ptr
//      r5 = length (at least several pages)
//     r12 = dest ptr

LBigCopy:
        lis     r2,0x4000               // r2 <- 0x40000000
        mflr    r0                      // get our return address
        add.    r2,r2,r2                // set cr0_lt if running in 32-bit mode
                                        // (0x40000000+0x40000000 = 0x80000000, which is
                                        // negative only as a 32-bit result)
        stw     r0,8(r1)                // save return, assuming 32-bit mode ("crsave" if 64-bit mode)
        blta    _COMM_PAGE_BIGCOPY      // 32-bit mode, join big operand copy
        std     r0,16(r1)               // save return in correct spot for 64-bit mode
        ba      _COMM_PAGE_BIGCOPY      // then join big operand code

        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0, \
                kCommPageMTCRF+kCommPageBoth+kPort32to64)