git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_LICENSE_HEADER_START@
	5	*
	6	* The contents of this file constitute Original Code as defined in and
	7	* are subject to the Apple Public Source License Version 1.1 (the
	8	* "License"). You may not use this file except in compliance with the
	9	* License. Please obtain a copy of the License at
	10	* http://www.apple.com/publicsource and read it before using this file.
	11	*
	12	* This Original Code and all software distributed under the License are
	13	* distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	14	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	15	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	16	* FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
	17	* License for the specific language governing rights and limitations
	18	* under the License.
	19	*
	20	* @APPLE_LICENSE_HEADER_END@
	21	*/
	22	/*
	23	* @OSF_COPYRIGHT@
	24	*/
	25	#include <debug.h>
	26	#include <ppc/asm.h>
	27	#include <ppc/proc_reg.h>
	28	#include <mach/ppc/vm_param.h>
	29	#include <assym.s>
	30	#include <sys/errno.h>
	31
	32	#define INSTRUMENT 0
	33
	34	//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
	35	/*
	36	* void pmap_zero_page(vm_offset_t pa)
	37	* The argument arrives in r3 and is shifted left by 12 below, i.e. it is consumed as a physical page number.
	38	* Zero a page of physical memory. This routine runs in 32 or 64-bit mode,
	39	* and handles 32 and 128-byte cache lines.
	40	*/
	41
	42
	43	.align 5
	44	.globl EXT(pmap_zero_page)
	45
	46	LEXT(pmap_zero_page)
	47
	48	mflr r12 // save return address
	49	bl EXT(ml_set_physical_disabled) // turn DR and EE off, SF on, get features in r10
	50	mtlr r12 // restore return address
	51	andi. r9,r10,pf32Byte+pf128Byte // r9 <- cache line size
	52
	53	subfic r4,r9,PPC_PGBYTES // r4 <- starting offset in page
	54
	55	bt++ pf64Bitb,page0S4 // Go do the big guys...
	56
	57	slwi r3,r3,12 // get page address from page num
	58	b page_zero_1 // Jump to line aligned loop...
	59
	60	.align 5
	61
	62	nop // 7 nops + the sldi below = 8 words, so page_zero_1 starts on the next 32-byte boundary
	63	nop
	64	nop
	65	nop
	66	nop
	67	nop
	68	nop
	69
	70	page0S4:
	71	sldi r3,r3,12 // get page address from page num
	72
	73	page_zero_1: // loop zeroing cache lines
	74	sub. r5,r4,r9 // more to go?
	75	dcbz128 r3,r4 // zero either 32 or 128 bytes
	76	sub r4,r5,r9 // generate next offset
	77	dcbz128 r3,r5 // zero the next lower line as well (two lines per pass, working down the page)
	78	bne-- page_zero_1 // loop until the offset computed by sub. above (r5) was 0
	79
	80	b EXT(ml_restore) // restore MSR and do the isync
	81
	82
	83	//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
	84	/* void
	85	* phys_copy(src, dst, bytecount)
	86	* addr64_t src;
	87	* addr64_t dst;
	88	* int bytecount
	89	* In the code below src is taken from r3:r4, dst from r5:r6 and bytecount from r7.
	90	* This routine will copy bytecount bytes from physical address src to physical
	91	* address dst. It runs in 64-bit mode if necessary, but does not handle
	92	* overlap or make any attempt to be optimal. Length must be a signed word.
	93	* Not performance critical.
	94	*/
	95
	96
	97	.align 5
	98	.globl EXT(phys_copy)
	99
	100	LEXT(phys_copy)
	101
	102	rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
	103	mflr r12 // get return address
	104	rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
	105	rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
	106	bl EXT(ml_set_physical_disabled) // turn DR and EE off, SF on, get features in r10
	107	rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
	108	mtlr r12 // restore return address
	109	subic. r5,r7,4 // a word to copy?
	110	b phys_copy_2
	111
	112	.align 5
	113
	114	phys_copy_1: // loop copying words
	115	subic. r5,r5,4 // more to go?
	116	lwz r0,0(r3) // fetch next source word
	117	addi r3,r3,4 // advance source
	118	stw r0,0(r4) // store it to the destination
	119	addi r4,r4,4 // advance destination
	120	phys_copy_2:
	121	bge phys_copy_1 // count did not go negative, so at least one more word to copy
	122	addic. r5,r5,4 // restore count
	123	ble phys_copy_4 // no more
	124
	125	// Loop is aligned here
	126
	127	phys_copy_3: // loop copying bytes
	128	subic. r5,r5,1 // more to go?
	129	lbz r0,0(r3) // fetch next source byte
	130	addi r3,r3,1 // advance source
	131	stb r0,0(r4) // store it to the destination
	132	addi r4,r4,1 // advance destination
	133	bgt phys_copy_3 // loop while leftover byte count is still positive
	134	phys_copy_4:
	135	b EXT(ml_restore) // restore MSR and do the isync
	136
	137
	138	//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
	139	/* void
	140	* pmap_copy_page(src, dst)
	141	* ppnum_t src;
	142	* ppnum_t dst;
	143	*
	144	* This routine will copy the physical page src to physical page dst
	145	*
	146	* This routine assumes that the src and dst are page numbers and that the
	147	* destination is cached. It runs on 32 and 64 bit processors, with and
	148	* without altivec, and with 32 and 128 byte cache lines.
	149	* We also must assume that no-one will be executing within the destination
	150	* page, and that this will be used for paging. Because this
	151	* is a common routine, we have tuned loops for each processor class.
	152	*
	153	*/
	154	#define kSFSize (FM_SIZE+160)
	155
	156	ENTRY(pmap_copy_page, TAG_NO_FRAME_USED)
	157
	158	lis r2,hi16(MASK(MSR_VEC)) ; Get the vector flag
	159	mflr r0 // get return
	160	ori r2,r2,lo16(MASK(MSR_FP)) ; Add the FP flag
	161	stw r0,8(r1) // save
	162	stwu r1,-kSFSize(r1) // set up a stack frame for VRs or FPRs
	163	mfmsr r11 // save MSR at entry
	164	mfsprg r10,2 // get feature flags
	165	andc r11,r11,r2 // Clear out vec and fp
	166	ori r2,r2,lo16(MASK(MSR_EE)) // Get EE on also
	167	andc r2,r11,r2 // Clear out EE as well
	168	mtcrf 0x02,r10 // we need to test pf64Bit
	169	ori r2,r2,MASK(MSR_FP) // must enable FP for G3...
	170	mtcrf 0x80,r10 // we need to test pfAltivec too
	171	oris r2,r2,hi16(MASK(MSR_VEC)) // enable altivec for G4 (ignored if G3)
	172	mtmsr r2 // turn EE off, FP and VEC on
	173	isync
	174	bt++ pf64Bitb,pmap_copy_64 // skip if 64-bit processor (only they take hint)
	175	slwi r3,r3,12 // get page address from page num
	176	slwi r4,r4,12 // get page address from page num
	177	rlwinm r12,r2,0,MSR_DR_BIT+1,MSR_DR_BIT-1 // get ready to turn off DR
	178	bt pfAltivecb,pmap_copy_g4 // altivec but not 64-bit means G4
	179
	180
	181	// G3 -- copy using FPRs
	182
	183	stfd f0,FM_SIZE+0(r1) // save the 4 FPRs we use to copy
	184	stfd f1,FM_SIZE+8(r1)
	185	li r5,PPC_PGBYTES/32 // count of cache lines in a page
	186	stfd f2,FM_SIZE+16(r1)
	187	mtctr r5 // loop count = number of 32-byte lines
	188	stfd f3,FM_SIZE+24(r1)
	189	mtmsr r12 // turn off DR after saving FPRs on stack
	190	isync
	191
	192	pmap_g3_copy_loop: // loop over 32-byte cache lines
	193	dcbz 0,r4 // avoid read of dest line
	194	lfd f0,0(r3) // load the 32-byte source line as four doublewords
	195	lfd f1,8(r3)
	196	lfd f2,16(r3)
	197	lfd f3,24(r3)
	198	addi r3,r3,32 // advance source to next line
	199	stfd f0,0(r4) // store the four doublewords to the dest line
	200	stfd f1,8(r4)
	201	stfd f2,16(r4)
	202	stfd f3,24(r4)
	203	dcbst 0,r4 // flush dest line to RAM
	204	addi r4,r4,32 // advance dest to next line
	205	bdnz pmap_g3_copy_loop
	206
	207	sync // wait for stores to take
	208	subi r4,r4,PPC_PGBYTES // restore ptr to destination page
	209	li r6,PPC_PGBYTES-32 // point to last line in page
	210	pmap_g3_icache_flush:
	211	subic. r5,r6,32 // more to go?
	212	icbi r4,r6 // flush another line in icache
	213	subi r6,r5,32 // get offset to next line
	214	icbi r4,r5 // and the line below it (two lines per pass, working down the page)
	215	bne pmap_g3_icache_flush // loop until the offset in r5 was 0
	216
	217	sync
	218	mtmsr r2 // turn DR back on
	219	isync
	220	lfd f0,FM_SIZE+0(r1) // restore the FPRs
	221	lfd f1,FM_SIZE+8(r1)
	222	lfd f2,FM_SIZE+16(r1)
	223	lfd f3,FM_SIZE+24(r1)
	224
	225	b pmap_g4_restore // restore MSR and done
	226
	227
	228	// G4 -- copy using VRs
	229
	230	pmap_copy_g4: // r2=(MSR-EE), r12=(r2-DR), r10=features, r11=old MSR
	231	la r9,FM_SIZE+16(r1) // place where we save VRs to r9
	232	li r5,16 // load x-form offsets into r5-r9
	233	li r6,32 // another offset
	234	stvx v0,0,r9 // save some VRs so we can use to copy
	235	li r7,48 // another offset
	236	stvx v1,r5,r9
	237	li r0,PPC_PGBYTES/64 // we loop over 64-byte chunks
	238	stvx v2,r6,r9
	239	mtctr r0 // loop count = number of 64-byte chunks
	240	li r8,96 // get look-ahead for touch
	241	stvx v3,r7,r9
	242	li r9,128 // second look-ahead offset (r9 is no longer needed as the save-area base)
	243	mtmsr r12 // now we've saved VRs on stack, turn off DR
	244	isync // wait for it to happen
	245	b pmap_g4_copy_loop
	246
	247	.align 5 // align inner loops
	248	pmap_g4_copy_loop: // loop over 64-byte chunks
	249	dcbt r3,r8 // touch 3 lines ahead
	250	nop // avoid a 17-word loop...
	251	dcbt r3,r9 // touch 4 lines ahead
	252	nop // more padding
	253	dcba 0,r4 // avoid pre-fetch of 1st dest line
	254	lvx v0,0,r3 // offset 0
	255	lvx v1,r5,r3 // offset 16
	256	lvx v2,r6,r3 // offset 32
	257	lvx v3,r7,r3 // offset 48
	258	addi r3,r3,64 // advance source by one chunk
	259	dcba r6,r4 // avoid pre-fetch of 2nd line
	260	stvx v0,0,r4 // offset 0
	261	stvx v1,r5,r4 // offset 16
	262	stvx v2,r6,r4 // offset 32
	263	stvx v3,r7,r4 // offset 48
	264	dcbf 0,r4 // push line 1
	265	dcbf r6,r4 // and line 2
	266	addi r4,r4,64 // advance dest by one chunk
	267	bdnz pmap_g4_copy_loop
	268
	269	sync // wait for stores to take
	270	subi r4,r4,PPC_PGBYTES // restore ptr to destination page
	271	li r8,PPC_PGBYTES-32 // point to last line in page
	272	pmap_g4_icache_flush:
	273	subic. r9,r8,32 // more to go?
	274	icbi r4,r8 // flush from icache
	275	subi r8,r9,32 // get offset to next line
	276	icbi r4,r9 // and the line below it (two lines per pass)
	277	bne pmap_g4_icache_flush // loop until the offset in r9 was 0
	278
	279	sync
	280	mtmsr r2 // turn DR back on
	281	isync
	282	la r9,FM_SIZE+16(r1) // get base of VR save area
	283	lvx v0,0,r9 // restore the VRs
	284	lvx v1,r5,r9
	285	lvx v2,r6,r9
	286	lvx v3,r7,r9
	287
	288	pmap_g4_restore: // r11=MSR
	289	mtmsr r11 // restore entry MSR with VEC and FP cleared (see the andc at entry)
	290	isync // wait for it to happen
	291	addi r1,r1,kSFSize // pop off our stack frame
	292	lwz r0,8(r1) // restore return address
	293	mtlr r0
	294	blr
	295
	296
	297	// 64-bit/128-byte processor: copy using VRs
	298
	299	pmap_copy_64: // r10=features, r11=old MSR
	300	sldi r3,r3,12 // get page address from page num
	301	sldi r4,r4,12 // get page address from page num
	302	la r9,FM_SIZE+16(r1) // get base of VR save area
	303	li r5,16 // load x-form offsets into r5-r9
	304	li r6,32 // another offset
	305	bf pfAltivecb,pmap_novmx_copy // altivec suppressed...
	306	stvx v0,0,r9 // save 8 VRs so we can copy wo bubbles
	307	stvx v1,r5,r9
	308	li r7,48 // another offset
	309	li r0,PPC_PGBYTES/128 // we loop over 128-byte chunks
	310	stvx v2,r6,r9
	311	stvx v3,r7,r9
	312	addi r9,r9,64 // advance base ptr so we can store another 4
	313	mtctr r0 // loop count = number of 128-byte chunks
	314	li r0,MASK(MSR_DR) // get DR bit
	315	stvx v4,0,r9
	316	stvx v5,r5,r9
	317	andc r12,r2,r0 // turn off DR bit
	318	li r0,1 // get a 1 to slam into SF
	319	stvx v6,r6,r9
	320	stvx v7,r7,r9
	321	rldimi r12,r0,63,MSR_SF_BIT // set SF bit (bit 0)
	322	li r8,-128 // offset so we can reach back one line
	323	mtmsrd r12 // now we've saved VRs, turn DR off and SF on
	324	isync // wait for it to happen
	325	dcbt128 0,r3,1 // start a forward stream
	326	b pmap_64_copy_loop
	327
	328	.align 5 // align inner loops
	329	pmap_64_copy_loop: // loop over 128-byte chunks
	330	dcbz128 0,r4 // avoid read of destination line
	331	lvx v0,0,r3 // offset 0
	332	lvx v1,r5,r3 // offset 16
	333	lvx v2,r6,r3 // offset 32
	334	lvx v3,r7,r3 // offset 48
	335	addi r3,r3,64 // don't have enough GPRs so add 64 2x
	336	lvx v4,0,r3 // offset 64
	337	lvx v5,r5,r3 // offset 80
	338	lvx v6,r6,r3 // offset 96
	339	lvx v7,r7,r3 // offset 112
	340	addi r3,r3,64 // source now points at the next 128-byte line
	341	stvx v0,0,r4 // offset 0
	342	stvx v1,r5,r4 // offset 16
	343	stvx v2,r6,r4 // offset 32
	344	stvx v3,r7,r4 // offset 48
	345	addi r4,r4,64 // same two-step advance for the dest ptr
	346	stvx v4,0,r4 // offset 64
	347	stvx v5,r5,r4 // offset 80
	348	stvx v6,r6,r4 // offset 96
	349	stvx v7,r7,r4 // offset 112
	350	addi r4,r4,64 // dest now points at the next 128-byte line
	351	dcbf r8,r4 // flush the line we just wrote
	352	bdnz pmap_64_copy_loop
	353
	354	sync // wait for stores to take
	355	subi r4,r4,PPC_PGBYTES // restore ptr to destination page
	356	li r8,PPC_PGBYTES-128 // point to last line in page
	357	pmap_64_icache_flush:
	358	subic. r9,r8,128 // more to go?
	359	icbi r4,r8 // flush from icache
	360	subi r8,r9,128 // get offset to next line
	361	icbi r4,r9 // and the line below it (two lines per pass)
	362	bne pmap_64_icache_flush // loop until the offset in r9 was 0
	363
	364	sync
	365	mtmsrd r2 // turn DR back on, SF off
	366	isync
	367	la r9,FM_SIZE+16(r1) // get base address of VR save area on stack
	368	lvx v0,0,r9 // restore the VRs
	369	lvx v1,r5,r9
	370	lvx v2,r6,r9
	371	lvx v3,r7,r9
	372	addi r9,r9,64 // second group of four VRs was saved 64 bytes further on
	373	lvx v4,0,r9
	374	lvx v5,r5,r9
	375	lvx v6,r6,r9
	376	lvx v7,r7,r9
	377
	378	b pmap_g4_restore // restore lower half of MSR and return
	379
	380	//
	381	// Copy on 64-bit without VMX. Uses r0, r12 and r5-r10 as data registers, so nothing is saved on the stack.
	382	//
	383
	384	pmap_novmx_copy:
	385	li r0,PPC_PGBYTES/128 // we loop over 128-byte chunks
	386	mtctr r0
	387	li r0,MASK(MSR_DR) // get DR bit
	388	andc r12,r2,r0 // turn off DR bit
	389	li r0,1 // get a 1 to slam into SF
	390	rldimi r12,r0,63,MSR_SF_BIT // set SF bit (bit 0)
	391	mtmsrd r12 // turn DR off and SF on (no VRs are saved on this path)
	392	isync // wait for it to happen
	393	dcbt128 0,r3,1 // start a forward stream
	394
	395	pmap_novmx_copy_loop: // loop over 128-byte cache lines
	396	dcbz128 0,r4 // avoid read of dest line
	397
	398	ld r0,0(r3) // Load half a line
	399	ld r12,8(r3) // r12 (the MSR image) is dead after the mtmsrd above, reuse it for data
	400	ld r5,16(r3)
	401	ld r6,24(r3)
	402	ld r7,32(r3)
	403	ld r8,40(r3)
	404	ld r9,48(r3)
	405	ld r10,56(r3) // r10 (feature flags) is overwritten here; it is not read again on this path
	406
	407	std r0,0(r4) // Store half a line
	408	std r12,8(r4)
	409	std r5,16(r4)
	410	std r6,24(r4)
	411	std r7,32(r4)
	412	std r8,40(r4)
	413	std r9,48(r4)
	414	std r10,56(r4)
	415
	416	ld r0,64(r3) // Load half a line
	417	ld r12,72(r3)
	418	ld r5,80(r3)
	419	ld r6,88(r3)
	420	ld r7,96(r3)
	421	ld r8,104(r3)
	422	ld r9,112(r3)
	423	ld r10,120(r3)
	424
	425	addi r3,r3,128 // advance source to next line
	426
	427	std r0,64(r4) // Store half a line
	428	std r12,72(r4)
	429	std r5,80(r4)
	430	std r6,88(r4)
	431	std r7,96(r4)
	432	std r8,104(r4)
	433	std r9,112(r4)
	434	std r10,120(r4)
	435
	436	dcbf 0,r4 // flush the line we just wrote
	437	addi r4,r4,128 // advance dest to next line
	438	bdnz pmap_novmx_copy_loop
	439
	440	sync // wait for stores to take
	441	subi r4,r4,PPC_PGBYTES // restore ptr to destination page
	442	li r8,PPC_PGBYTES-128 // point to last line in page
	443
	444	pmap_novmx_icache_flush:
	445	subic. r9,r8,128 // more to go?
	446	icbi r4,r8 // flush from icache
	447	subi r8,r9,128 // get offset to next line
	448	icbi r4,r9 // and the line below it (two lines per pass)
	449	bne pmap_novmx_icache_flush // loop until the offset in r9 was 0
	450
	451	sync
	452	mtmsrd r2 // turn DR back on, SF off
	453	isync
	454
	455	b pmap_g4_restore // restore lower half of MSR and return
	456
	457
	458
	459	//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
	460
	461	// Stack frame format used by copyin, copyout, copyinstr and copyoutstr.
	462	// These routines all run both on 32 and 64-bit machines, though because they are called
	463	// by the BSD kernel they are always in 32-bit mode when entered. The mapped ptr returned
	464	// by MapUserAddressSpace will be 64 bits however on 64-bit machines. Beware to avoid
	465	// using compare instructions on this ptr. This mapped ptr is kept globally in r31, so there
	466	// is no need to store or load it, which are mode-dependent operations since it could be
	467	// 32 or 64 bits.
	468
	469	#define kkFrameSize (FM_SIZE+32)
	470	// Offsets of the save slots within the frame; each is written with stw in the common setup code:
	471	#define kkBufSize (FM_SIZE+0)
	472	#define kkCR (FM_SIZE+4)
	473	#define kkSource (FM_SIZE+8)
	474	#define kkDest (FM_SIZE+12)
	475	#define kkCountPtr (FM_SIZE+16)
	476	#define kkR31Save (FM_SIZE+20)
	477
	478
	479	// nonvolatile CR bits we use as flags in cr3
	480	// NOTE: kkString and kkZero share bit 15. kkString is tested at copyin3; kkZero is only written later, in the string loops.
	481	#define kk64bit 12
	482	#define kkNull 13
	483	#define kkIn 14
	484	#define kkString 15
	485	#define kkZero 15
	486
	487
	488	//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
	489	/*
	490	* int
	491	* copyoutstr(src, dst, maxcount, count)
	492	* vm_offset_t src;
	493	* vm_offset_t dst;
	494	* vm_size_t maxcount;
	495	* vm_size_t* count;
	496	*
	497	* Set *count to the number of bytes copied. *count is zeroed here, then control joins the common code at copyJoin.
	498	*/
	499
	500	ENTRY(copyoutstr, TAG_NO_FRAME_USED)
	501	mfcr r2 // we use nonvolatile cr3
	502	li r0,0 // zero to store through the count ptr
	503	crset kkString // flag as a string op
	504	mr r10,r4 // for copyout, dest ptr (r4) is in user space
	505	stw r0,0(r6) // initialize #bytes moved
	506	crclr kkIn // flag as copyout
	507	b copyJoin
	508
	509
	510	//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
	511	/*
	512	* int
	513	* copyinstr(src, dst, maxcount, count)
	514	* vm_offset_t src;
	515	* vm_offset_t dst;
	516	* vm_size_t maxcount;
	517	* vm_size_t* count;
	518	*
	519	* Set *count to the number of bytes copied
	520	* If dst == NULL, don't copy, just count bytes.
	521	* Only currently called from klcopyinstr.
	522	*/
	523
	524	ENTRY(copyinstr, TAG_NO_FRAME_USED)
	525	mfcr r2 // we use nonvolatile cr3
	526	cmplwi r4,0 // dst==NULL?
	527	li r0,0 // zero to store through the count ptr
	528	crset kkString // flag as a string op
	529	mr r10,r3 // for copyin, source ptr (r3) is in user space
	530	crmove kkNull,cr0_eq // remember if (dst==NULL)
	531	stw r0,0(r6) // initialize #bytes moved
	532	crset kkIn // flag as copyin (rather than copyout)
	533	b copyJoin1 // skip over the "crclr kkNull"
	534
	535
	536	//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
	537	/*
	538	* int
	539	* copyout(src, dst, count)
	540	* vm_offset_t src;
	541	* vm_offset_t dst;
	542	* size_t count;
	543	*/
	544
	545	.align 5
	546	.globl EXT(copyout)
	547	.globl EXT(copyoutmsg)
	548	// copyout and copyoutmsg are two names for the same entry point
	549	LEXT(copyout)
	550	LEXT(copyoutmsg)
	551
	552	#if INSTRUMENT
	553	mfspr r12,pmc1 ; INSTRUMENT - saveinstr[12] - Take stamp at copyout
	554	stw r12,0x6100+(12*16)+0x0(0) ; INSTRUMENT - Save it
	555	mfspr r12,pmc2 ; INSTRUMENT - Get stamp
	556	stw r12,0x6100+(12*16)+0x4(0) ; INSTRUMENT - Save it
	557	mfspr r12,pmc3 ; INSTRUMENT - Get stamp
	558	stw r12,0x6100+(12*16)+0x8(0) ; INSTRUMENT - Save it
	559	mfspr r12,pmc4 ; INSTRUMENT - Get stamp
	560	stw r12,0x6100+(12*16)+0xC(0) ; INSTRUMENT - Save it
	561	#endif
	562	mfcr r2 // save caller's CR
	563	crclr kkString // not a string version
	564	mr r10,r4 // dest (r4) is user-space ptr
	565	crclr kkIn // flag as copyout
	566	b copyJoin
	567
	568
	569	//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
	570	/*
	571	* int
	572	* copyin(src, dst, count)
	573	* vm_offset_t src;
	574	* vm_offset_t dst;
	575	* size_t count;
	576	*/
	577
	578
	579	.align 5
	580	.globl EXT(copyin)
	581	.globl EXT(copyinmsg)
	582	// copyin and copyinmsg are two names for the same entry point
	583	LEXT(copyin)
	584	LEXT(copyinmsg)
	585
	586	mfcr r2 // save caller's CR
	587	crclr kkString // not a string version
	588	mr r10,r3 // source (r3) is user-space ptr in copyin
	589	crset kkIn // flag as copyin; no branch needed, execution falls through into copyJoin
	590
	591
	592	// Common code to handle setup for all the copy variants:
	593	// r2 = caller's CR, since we use cr3
	594	// r3-r6 = parameters
	595	// r10 = user-space ptr (r3 if copyin, r4 if copyout)
	596	// cr3 = kkIn, kkString, kkNull flags
	597
	598	copyJoin:
	599	crclr kkNull // (dst==NULL) convention not used with this call
	600	copyJoin1: // enter from copyinstr with kkNull set
	601	mflr r0 // get return address
	602	cmplwi r5,0 // buffer length 0?
	603	lis r9,0x1000 // r9 <- 0x10000000 (256MB)
	604	stw r0,FM_LR_SAVE(r1) // save return
	605	cmplw cr1,r5,r9 // buffer length > 256MB ?
	606	mfsprg r8,2 // get the features
	607	beq-- copyinout_0 // 0 length is degenerate case
	608	stwu r1,-kkFrameSize(r1) // set up stack frame
	609	stw r2,kkCR(r1) // save caller's CR since we use cr3
	610	mtcrf 0x02,r8 // move pf64Bit to cr6
	611	stw r3,kkSource(r1) // save args across MapUserAddressSpace
	612	stw r4,kkDest(r1)
	613	stw r5,kkBufSize(r1)
	614	crmove kk64bit,pf64Bitb // remember if this is a 64-bit processor
	615	stw r6,kkCountPtr(r1)
	616	stw r31,kkR31Save(r1) // we use r31 globally for mapped user ptr
	617	li r31,0 // no mapped ptr yet
	618
	619
	620	// Handle buffer length > 256MB. This is an error (ENAMETOOLONG) on copyin and copyout.
	621	// The string ops are passed -1 lengths by some BSD callers, so for them we silently clamp
	622	// the buffer length to 256MB. This isn't an issue if the string is less than 256MB
	623	// (as most are!), but if they are >256MB we eventually return ENAMETOOLONG. This restriction
	624	// is due to MapUserAddressSpace; we don't want to consume more than two segments for
	625	// the mapping.
	626
	627	ble++ cr1,copyin0 // skip if buffer length <= 256MB
	628	bf kkString,copyinout_too_big // error if not string op
	629	mr r5,r9 // silently clamp buffer length to 256MB
	630	stw r9,kkBufSize(r1) // update saved copy too
	631
	632
	633	// Set up thread_recover in case we hit an illegal address.
	634
	635	copyin0:
	636	mfsprg r8,1 /* Get the current act */
	637	lis r2,hi16(copyinout_error) // r2 <- address of the recovery handler (caller's CR is already saved in kkCR)
	638	lwz r7,ACT_THREAD(r8)
	639	ori r2,r2,lo16(copyinout_error)
	640	lwz r3,ACT_VMMAP(r8) // r3 <- vm_map virtual address
	641	stw r2,THREAD_RECOVER(r7) // install copyinout_error as the recovery address
	642
	643
	644	// Map user segment into kernel map, turn on 64-bit mode.
	645	// r3 = vm map
	646	// r5 = buffer length
	647	// r10 = user space ptr (r3 if copyin, r4 if copyout)
	648
	649	mr r6,r5 // Set length to map
	650	li r4,0 // Note: we only do this 32-bit for now
	651	mr r5,r10 // arg2 <- user space ptr
	652	#if INSTRUMENT
	653	mfspr r12,pmc1 ; INSTRUMENT - saveinstr[13] - Take stamp before mapuseraddressspace
	654	stw r12,0x6100+(13*16)+0x0(0) ; INSTRUMENT - Save it
	655	mfspr r12,pmc2 ; INSTRUMENT - Get stamp
	656	stw r12,0x6100+(13*16)+0x4(0) ; INSTRUMENT - Save it
	657	mfspr r12,pmc3 ; INSTRUMENT - Get stamp
	658	stw r12,0x6100+(13*16)+0x8(0) ; INSTRUMENT - Save it
	659	mfspr r12,pmc4 ; INSTRUMENT - Get stamp
	660	stw r12,0x6100+(13*16)+0xC(0) ; INSTRUMENT - Save it
	661	#endif
	662	bl EXT(MapUserAddressSpace) // result is consumed below as the pair r3:r4 (r3 = high word, r4 = low word)
	663	#if INSTRUMENT
	664	mfspr r12,pmc1 ; INSTRUMENT - saveinstr[14] - Take stamp after mapuseraddressspace
	665	stw r12,0x6100+(14*16)+0x0(0) ; INSTRUMENT - Save it
	666	mfspr r12,pmc2 ; INSTRUMENT - Get stamp
	667	stw r12,0x6100+(14*16)+0x4(0) ; INSTRUMENT - Save it
	668	mfspr r12,pmc3 ; INSTRUMENT - Get stamp
	669	stw r12,0x6100+(14*16)+0x8(0) ; INSTRUMENT - Save it
	670	mfspr r12,pmc4 ; INSTRUMENT - Get stamp
	671	stw r12,0x6100+(14*16)+0xC(0) ; INSTRUMENT - Save it
	672	#endif
	673	or. r0,r3,r4 // Did we fail the mapping?
	674	mr r31,r4 // r31 <- mapped ptr into user space (may be 64-bit)
	675	beq-- copyinout_error // was 0, so there was an error making the mapping
	676	bf-- kk64bit,copyin1 // skip if a 32-bit processor
	677
	678	rldimi r31,r3,32,0 // slam high-order bits into mapped ptr
	679	mfmsr r4 // if 64-bit, turn on SF so we can use returned ptr
	680	li r0,1
	681	rldimi r4,r0,63,MSR_SF_BIT // light bit 0
	682	mtmsrd r4 // turn on 64-bit mode
	683	isync // wait for mode to change
	684
	685
	686	// Load r3-r5, substituting mapped ptr as appropriate.
	687
	688	copyin1:
	689	lwz r5,kkBufSize(r1) // restore length to copy
	690	bf kkIn,copyin2 // skip if copyout
	691	lwz r4,kkDest(r1) // copyin: source is mapped, dest is r4 at entry
	692	mr r3,r31 // source is mapped ptr
	693	b copyin3
	694	copyin2: // handle copyout
	695	lwz r3,kkSource(r1) // source is kernel buffer (r3 at entry)
	696	mr r4,r31 // dest is mapped ptr into user space
	697
	698
	699	// Finally, all set up to copy:
	700	// r3 = source ptr (mapped if copyin)
	701	// r4 = dest ptr (mapped if copyout)
	702	// r5 = length
	703	// r31 = mapped ptr returned by MapUserAddressSpace
	704	// cr3 = kkIn, kkString, kk64bit, and kkNull flags
	705
	706	copyin3:
	707	bt kkString,copyString // handle copyinstr and copyoutstr
	708	bl EXT(bcopy) // copyin and copyout: let bcopy do the work
	709	li r3,0 // return success; falls through into the common exit at copyinx
	710
	711
	712	// Main exit point for copyin, copyout, copyinstr, and copyoutstr. Also reached
	713	// from error recovery if we get a DSI accessing user space. Clear recovery ptr,
	714	// and pop off frame. Note that we have kept
	715	// the mapped ptr into user space in r31, as a reg64_t type (ie, a 64-bit ptr on
	716	// 64-bit machines.) We must unpack r31 into an addr64_t in (r3,r4) before passing
	717	// it to ReleaseUserAddressSpace. NOTE(review): no call to ReleaseUserAddressSpace is visible below; this remark may be stale -- confirm.
	718	// r3 = 0, EFAULT, or ENAMETOOLONG
	719
	720	copyinx:
	721	lwz r2,kkCR(r1) // get callers cr3
	722	mfsprg r6,1 // Get the current act
	723	lwz r10,ACT_THREAD(r6) // r10 <- thread, used below to clear its recovery ptr
	724
	725	bf-- kk64bit,copyinx1 // skip if 32-bit processor
	726	mfmsr r12
	727	rldicl r12,r12,0,MSR_SF_BIT+1 // if 64-bit processor, turn 64-bit mode off
	728	mtmsrd r12 // turn SF off. NOTE(review): the rldicl above clears only the high-order bit(s); EE is not modified here
	729	isync // wait for the mode to change
	730	copyinx1:
	731	lwz r31,kkR31Save(r1) // restore callers r31
	732	addi r1,r1,kkFrameSize // pop off our stack frame
	733	lwz r0,FM_LR_SAVE(r1) // reload the return address saved at copyJoin1
	734	li r4,0
	735	stw r4,THREAD_RECOVER(r10) // Clear recovery
	736	mtlr r0
	737	mtcrf 0x10,r2 // restore cr3
	738	blr
	739
	740
	741	/* We get here via the exception handler if an illegal
	742	* user memory reference was made. This error handler is used by
	743	* copyin, copyout, copyinstr, and copyoutstr. Registers are as
	744	* they were at point of fault, so for example cr3 flags are valid.
	745	* Also branched to directly when MapUserAddressSpace returns 0. */
	746
	747	copyinout_error:
	748	li r3,EFAULT // return error
	749	b copyinx // common exit pops the frame and clears the recovery ptr
	750
	751	copyinout_0: // degenerate case: 0-length copy (taken before any stack frame is set up)
	752	mtcrf 0x10,r2 // restore cr3
	753	li r3,0 // return success
	754	blr
	755
	756	copyinout_too_big: // copyin/copyout with length > 256MB: fail with ENAMETOOLONG
		lwz r31,kkR31Save(r1) // common setup already saved r31 and zeroed it; restore caller's nonvolatile r31 before the frame goes away
	757	mtcrf 0x10,r2 // restore cr3 (r2 still holds the caller's CR on this path)
	758	lwz r1,0(r1) // pop off stack frame
	759	li r3,ENAMETOOLONG
	760	blr // LR was never changed on this path, so return directly
	761
	762
	763	//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
	764	// Handle copyinstr and copyoutstr. At this point the stack frame is set up,
	765	// the recovery ptr is set, the user's buffer is mapped, we're in 64-bit mode
	766	// if necessary, and:
	767	// r3 = source ptr, mapped if copyinstr
	768	// r4 = dest ptr, mapped if copyoutstr
	769	// r5 = buffer length
	770	// r31 = mapped ptr returned by MapUserAddressSpace
	771	// cr3 = kkIn, kkString, kkNull, and kk64bit flags
	772	// We do word copies unless the buffer is very short, then use a byte copy loop
	773	// for the leftovers if necessary.
	774
	775	copyString:
	776	li r12,0 // Set header bytes count to zero
	777	cmplwi cr1,r5,20 // is buffer very short?
	778	mtctr r5 // assuming short, set up loop count for bytes
	779	blt cr1,copyinstr8 // too short for word loop
	780	andi. r12,r3,0x3 // is source ptr word aligned?
	781	bne copyinstr11 // bytes loop
	782	copyinstr1:
	783	srwi r6,r5,2 // get #words in buffer
	784	mtctr r6 // set up word loop count
	785	lis r10,hi16(0xFEFEFEFF) // load magic constants into r10 and r11
	786	lis r11,hi16(0x80808080)
	787	ori r10,r10,lo16(0xFEFEFEFF)
	788	ori r11,r11,lo16(0x80808080)
	789	bf kkNull,copyinstr6 // enter loop that copies
	790	b copyinstr5 // use loop that just counts
	791
	792
	793	// Word loop(s). They do a word-parallel search for 0s, using the following
	794	// non-obvious but very efficient test:
	795	// y = data + 0xFEFEFEFF
	796	// z = ~data & 0x80808080
	797	// If (y & z)==0, then all bytes in dataword are nonzero. We need two copies of
	798	// this loop, since if we test kkNull in the loop then it becomes 9 words long.
	799
	800	.align 5 // align inner loops for speed
	801	copyinstr5: // version that counts but does not copy
	802	lwz r8,0(r3) // get next word of source
	803	addi r3,r3,4 // increment source ptr
	804	add r9,r10,r8 // r9 = data + 0xFEFEFEFF
	805	andc r7,r11,r8 // r7 = ~data & 0x80808080
	806	and. r7,r9,r7 // r7 = r9 & r7
	807	bdnzt cr0_eq,copyinstr5 // if r7==0, then all bytes are nonzero
	808
	809	b copyinstr7
	810
	811	.align 5 // align inner loops for speed
	812	copyinstr6: // version that counts and copies
	813	lwz r8,0(r3) // get next word of source
	814	addi r3,r3,4 // increment source ptr
	815	addi r4,r4,4 // increment dest ptr while we wait for data
	816	add r9,r10,r8 // r9 = data + 0xFEFEFEFF
	817	andc r7,r11,r8 // r7 = ~data & 0x80808080
	818	and. r7,r9,r7 // r7 = r9 & r7
	819	stw r8,-4(r4) // pack all 4 bytes into buffer
	820	bdnzt cr0_eq,copyinstr6 // if r7==0, then all bytes are nonzero
	821
	822
	823	// Either 0 found or buffer filled. The above algorithm has mapped nonzero bytes to 0
	824	// and 0 bytes to 0x80 with one exception: 0x01 bytes preceding the first 0 are also
	825	// mapped to 0x80. We must mask out these false hits before searching for an 0x80 byte.
	826
	827	copyinstr7:
	828	crnot kkZero,cr0_eq // 0 found iff cr0_eq is off
	829	mfctr r6 // get #words remaining in buffer
	830	rlwinm r2,r8,7,0,31 // move 0x01 bits to 0x80 position
	831	slwi r6,r6,2 // convert to #bytes remaining
	832	andc r7,r7,r2 // turn off false hits from 0x0100 worst case
	833	rlwimi r6,r5,0,30,31 // add in odd bytes leftover in buffer
	834	srwi r7,r7,8 // we want to count the 0 as a byte xferred
	835	addi r6,r6,4 // don't count last word xferred (yet)
	836	cntlzw r7,r7 // now we can find the 0 byte (ie, the 0x80)
	837	srwi r7,r7,3 // convert 8,16,24,32 to 1,2,3,4
	838	sub. r6,r6,r7 // account for nonzero bytes in last word
	839	bt++ kkZero,copyinstr10 // 0 found, so done
	840
	841	beq copyinstr10 // r6==0, so buffer truly full
	842	mtctr r6 // 0 not found, loop over r6 bytes
	843	b copyinstr8 // enter byte loop for last 1-3 leftover bytes
	844
	845
	846	// Byte loop. This is used for very small buffers and for the odd bytes left over
	847	// after searching and copying words at a time.
	848
	849	.align 5 // align inner loops for speed
	850	copyinstr8: // loop over bytes of source
	851	lbz r0,0(r3) // get next byte of source
	852	addi r3,r3,1
	853	addi r4,r4,1 // increment dest addr whether we store or not
	854	cmpwi r0,0 // the 0?
	855	bt-- kkNull,copyinstr9 // don't store (was copyinstr with NULL ptr)
	856	stb r0,-1(r4)
	857	copyinstr9:
	858	bdnzf cr0_eq,copyinstr8 // loop if byte not 0 and more room in buffer
	859
	860	mfctr r6 // get #bytes left in buffer
	861	crmove kkZero,cr0_eq // remember if 0 found or buffer filled
	862
	863
	864	// Buffer filled or 0 found. Unwind and return.
	865	// r5 = kkBufSize, ie buffer length
	866	// r6 = untransferred bytes remaining in buffer
	867	// r12 = count of header bytes already handled by the unaligned-source byte loop (0 if none)
	868	// cr3 = kkZero set iff 0 found
	869
	870	copyinstr10:
	871	lwz r9,kkCountPtr(r1) // get ptr to place to store count of bytes moved
	872	sub r2,r5,r6 // get #bytes we moved, counting the 0 iff any
	873	add r2,r2,r12 // add the header bytes count
	874	li r3,0 // assume 0 return status
	875	stw r2,0(r9) // store #bytes moved
	876	bt++ kkZero,copyinx // we did find the 0 so return 0
	877	li r3,ENAMETOOLONG // buffer filled
	878	b copyinx // join main exit routine
	879
	880	// Byte loop. This is used on the header bytes for unaligned source: it consumes the
	881	// 1-3 bytes up to the next word boundary, then either joins the word loop or, if the 0 was among them, finishes.
	882	.align 5 // align inner loops for speed
	883	copyinstr11:
	884	li r10,4 // load word size
	885	sub r12,r10,r12 // set the header bytes count (4 - (source & 3))
	886	mtctr r12 // set up bytes loop count
	887	copyinstr12: // loop over bytes of source
	888	lbz r0,0(r3) // get next byte of source
	889	addi r3,r3,1
	890	addi r4,r4,1 // increment dest addr whether we store or not
	891	cmpwi r0,0 // the 0?
	892	bt-- kkNull,copyinstr13 // don't store (was copyinstr with NULL ptr)
	893	stb r0,-1(r4)
	894	copyinstr13:
	895	bdnzf cr0_eq,copyinstr12 // loop if byte not 0 and more header bytes remain
	896	sub r5,r5,r12 // subtract the header bytes from the buffer length (does not alter cr0)
	897	bne copyinstr1 // last header byte was not 0: branch to word loop. Tests cr0 implicitly; a CR bit name is not a valid CR field operand here
	898
	899	mr r5,r12 // Get the header bytes count
	900	li r12,0 // Clear the header bytes count
	901	mfctr r6 // get #bytes left in buffer
	902	crmove kkZero,cr0_eq // remember if 0 found or buffer filled
	903	b copyinstr10
	904