]> git.saurik.com Git - apple/xnu.git/blame_incremental - osfmk/ppc/movc.s
xnu-792.tar.gz
[apple/xnu.git] / osfmk / ppc / movc.s
... / ...
CommitLineData
1/*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22/*
23 * @OSF_COPYRIGHT@
24 */
25#include <debug.h>
26#include <ppc/asm.h>
27#include <ppc/proc_reg.h>
28#include <mach/ppc/vm_param.h>
29#include <assym.s>
30#include <sys/errno.h>
31
32#define INSTRUMENT 0
33
34//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
35/*
36 * void pmap_zero_page(vm_offset_t pa)
37 *
38 * Zero a page of physical memory. This routine runs in 32 or 64-bit mode,
39 * and handles 32 and 128-byte cache lines.
40 */
41
42
43 .align 5
44 .globl EXT(pmap_zero_page)
45
46LEXT(pmap_zero_page)
47
48 mflr r12 // save return address (the bl below clobbers LR)
49 bl EXT(ml_set_physical_disabled) // turn DR and EE off, SF on, get features in r10
50 mtlr r12 // restore return address
51 andi. r9,r10,pf32Byte+pf128Byte // r9 <- cache line size (32 or 128 bytes)
52
53 subfic r4,r9,PPC_PGBYTES // r4 <- offset of last cache line in page (PPC_PGBYTES - linesize)
54
55 bt++ pf64Bitb,page0S4 // Go do the big guys...
56
57 slwi r3,r3,12 // get page address from page num
58 b page_zero_1 // Jump to line aligned loop...
59
60 .align 5
61
62 nop
63 nop
64 nop
65 nop
66 nop
67 nop
68 nop
69
70page0S4:
71 sldi r3,r3,12 // get page address from page num
72
73page_zero_1: // loop zeroing cache lines, two per iteration, back to front
74 sub. r5,r4,r9 // more to go? (r5 <- offset of next-lower line)
75 dcbz128 r3,r4 // zero either 32 or 128 bytes
76 sub r4,r5,r9 // generate next offset
77 dcbz128 r3,r5
78 bne-- page_zero_1
79
80 b EXT(ml_restore) // restore MSR and do the isync
81
82
83//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
84/* void
85 * phys_copy(src, dst, bytecount)
86 * addr64_t src;
87 * addr64_t dst;
88 * int bytecount
89 *
90 * This routine will copy bytecount bytes from physical address src to physical
91 * address dst. It runs in 64-bit mode if necessary, but does not handle
92 * overlap or make any attempt to be optimal. Length must be a signed word.
93 * Not performance critical.
94 */
95
96
97 .align 5
98 .globl EXT(phys_copy)
99
100LEXT(phys_copy)
101
102 rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
103 mflr r12 // get return address (bl below clobbers LR)
104 rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits (r3 <- src)
105 rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
106 bl EXT(ml_set_physical_disabled) // turn DR and EE off, SF on, get features in r10
107 rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits (r4 <- dst)
108 mtlr r12 // restore return address
109 subic. r5,r7,4 // r5 <- count-4; at least one full word to copy?
110 b phys_copy_2
111
112 .align 5
113
114phys_copy_1: // loop copying words
115 subic. r5,r5,4 // more to go?
116 lwz r0,0(r3)
117 addi r3,r3,4
118 stw r0,0(r4)
119 addi r4,r4,4
120phys_copy_2:
121 bge phys_copy_1 // loop while r5 >= 0 (a full word remains)
122 addic. r5,r5,4 // restore count (r5 <- bytes left, 0..3)
123 ble phys_copy_4 // no more
124
125 // Loop is aligned here
126
127phys_copy_3: // loop copying trailing bytes
128 subic. r5,r5,1 // more to go?
129 lbz r0,0(r3)
130 addi r3,r3,1
131 stb r0,0(r4)
132 addi r4,r4,1
133 bgt phys_copy_3
134phys_copy_4:
135 b EXT(ml_restore) // restore MSR and do the isync
136
137
138//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
139/* void
140 * pmap_copy_page(src, dst)
141 * ppnum_t src;
142 * ppnum_t dst;
143 *
144 * This routine will copy the physical page src to physical page dst
145 *
146 * This routine assumes that the src and dst are page numbers and that the
147 * destination is cached. It runs on 32 and 64 bit processors, with and
148 * without altivec, and with 32 and 128 byte cache lines.
149 * We also must assume that no-one will be executing within the destination
150 * page, and that this will be used for paging. Because this
151 * is a common routine, we have tuned loops for each processor class.
152 *
153 */
154#define kSFSize (FM_SIZE+160)
155
156ENTRY(pmap_copy_page, TAG_NO_FRAME_USED)
157
158 lis r2,hi16(MASK(MSR_VEC)) ; Get the vector flag
159 mflr r0 // get return
160 ori r2,r2,lo16(MASK(MSR_FP)) ; Add the FP flag
161 stw r0,8(r1) // save
162 stwu r1,-kSFSize(r1) // set up a stack frame for VRs or FPRs
163 mfmsr r11 // save MSR at entry
164 mfsprg r10,2 // get feature flags
165 andc r11,r11,r2 // Clear out vec and fp
166 ori r2,r2,lo16(MASK(MSR_EE)) // Get EE on also
167 andc r2,r11,r2 // Clear out EE as well
168 mtcrf 0x02,r10 // we need to test pf64Bit
169 ori r2,r2,MASK(MSR_FP) // must enable FP for G3...
170 mtcrf 0x80,r10 // we need to test pfAltivec too
171 oris r2,r2,hi16(MASK(MSR_VEC)) // enable altivec for G4 (ignored if G3)
172 mtmsr r2 // turn EE off, FP and VEC on
173 isync
174 bt++ pf64Bitb,pmap_copy_64 // skip if 64-bit processor (only they take hint)
175 slwi r3,r3,12 // get page address from page num
176 slwi r4,r4,12 // get page address from page num
177 rlwinm r12,r2,0,MSR_DR_BIT+1,MSR_DR_BIT-1 // get ready to turn off DR
178 bt pfAltivecb,pmap_copy_g4 // altivec but not 64-bit means G4
179
180
181 // G3 -- copy using FPRs
182
183 stfd f0,FM_SIZE+0(r1) // save the 4 FPRs we use to copy
184 stfd f1,FM_SIZE+8(r1)
185 li r5,PPC_PGBYTES/32 // count of cache lines in a page
186 stfd f2,FM_SIZE+16(r1)
187 mtctr r5
188 stfd f3,FM_SIZE+24(r1)
189 mtmsr r12 // turn off DR after saving FPRs on stack
190 isync
191
192pmap_g3_copy_loop: // loop over 32-byte cache lines
193 dcbz 0,r4 // avoid read of dest line
194 lfd f0,0(r3)
195 lfd f1,8(r3)
196 lfd f2,16(r3)
197 lfd f3,24(r3)
198 addi r3,r3,32
199 stfd f0,0(r4)
200 stfd f1,8(r4)
201 stfd f2,16(r4)
202 stfd f3,24(r4)
203 dcbst 0,r4 // flush dest line to RAM
204 addi r4,r4,32
205 bdnz pmap_g3_copy_loop
206
207 sync // wait for stores to take
208 subi r4,r4,PPC_PGBYTES // restore ptr to destination page
209 li r6,PPC_PGBYTES-32 // point to last line in page
210pmap_g3_icache_flush:
211 subic. r5,r6,32 // more to go?
212 icbi r4,r6 // flush another line in icache
213 subi r6,r5,32 // get offset to next line
214 icbi r4,r5
215 bne pmap_g3_icache_flush
216
217 sync
218 mtmsr r2 // turn DR back on
219 isync
220 lfd f0,FM_SIZE+0(r1) // restore the FPRs
221 lfd f1,FM_SIZE+8(r1)
222 lfd f2,FM_SIZE+16(r1)
223 lfd f3,FM_SIZE+24(r1)
224
225 b pmap_g4_restore // restore MSR and done
226
227
228 // G4 -- copy using VRs
229
230pmap_copy_g4: // r2=(MSR-EE), r12=(r2-DR), r10=features, r11=old MSR
231 la r9,FM_SIZE+16(r1) // r9 <- base of VR save area on stack
232 li r5,16 // load x-form offsets into r5-r9
233 li r6,32 // another offset
234 stvx v0,0,r9 // save some VRs so we can use to copy
235 li r7,48 // another offset
236 stvx v1,r5,r9
237 li r0,PPC_PGBYTES/64 // we loop over 64-byte chunks
238 stvx v2,r6,r9
239 mtctr r0
240 li r8,96 // get look-ahead for touch
241 stvx v3,r7,r9
242 li r9,128
243 mtmsr r12 // now we've saved VRs on stack, turn off DR
244 isync // wait for it to happen
245 b pmap_g4_copy_loop
246
247 .align 5 // align inner loops
248pmap_g4_copy_loop: // loop over 64-byte chunks
249 dcbt r3,r8 // touch 3 lines ahead
250 nop // avoid a 17-word loop...
251 dcbt r3,r9 // touch 4 lines ahead
252 nop // more padding
253 dcba 0,r4 // avoid pre-fetch of 1st dest line
254 lvx v0,0,r3 // offset 0
255 lvx v1,r5,r3 // offset 16
256 lvx v2,r6,r3 // offset 32
257 lvx v3,r7,r3 // offset 48
258 addi r3,r3,64
259 dcba r6,r4 // avoid pre-fetch of 2nd line
260 stvx v0,0,r4 // offset 0
261 stvx v1,r5,r4 // offset 16
262 stvx v2,r6,r4 // offset 32
263 stvx v3,r7,r4 // offset 48
264 dcbf 0,r4 // push line 1
265 dcbf r6,r4 // and line 2
266 addi r4,r4,64
267 bdnz pmap_g4_copy_loop
268
269 sync // wait for stores to take
270 subi r4,r4,PPC_PGBYTES // restore ptr to destination page
271 li r8,PPC_PGBYTES-32 // point to last line in page
272pmap_g4_icache_flush:
273 subic. r9,r8,32 // more to go?
274 icbi r4,r8 // flush from icache
275 subi r8,r9,32 // get offset to next line
276 icbi r4,r9
277 bne pmap_g4_icache_flush
278
279 sync
280 mtmsr r2 // turn DR back on
281 isync
282 la r9,FM_SIZE+16(r1) // get base of VR save area
283 lvx v0,0,r9 // restore the VRs
284 lvx v1,r5,r9
285 lvx v2,r6,r9
286 lvx v3,r7,r9
287
288pmap_g4_restore: // r11=MSR
289 mtmsr r11 // turn EE on, VEC and FP off
290 isync // wait for it to happen
291 addi r1,r1,kSFSize // pop off our stack frame
292 lwz r0,8(r1) // restore return address
293 mtlr r0
294 blr
295
296
297 // 64-bit/128-byte processor: copy using VRs
298
299pmap_copy_64: // r10=features, r11=old MSR
300 sldi r3,r3,12 // get page address from page num
301 sldi r4,r4,12 // get page address from page num
302 la r9,FM_SIZE+16(r1) // get base of VR save area
303 li r5,16 // load x-form offsets into r5-r9
304 li r6,32 // another offset
305 bf pfAltivecb,pmap_novmx_copy // altivec suppressed...
306 stvx v0,0,r9 // save 8 VRs so we can copy without bubbles
307 stvx v1,r5,r9
308 li r7,48 // another offset
309 li r0,PPC_PGBYTES/128 // we loop over 128-byte chunks
310 stvx v2,r6,r9
311 stvx v3,r7,r9
312 addi r9,r9,64 // advance base ptr so we can store another 4
313 mtctr r0
314 li r0,MASK(MSR_DR) // get DR bit
315 stvx v4,0,r9
316 stvx v5,r5,r9
317 andc r12,r2,r0 // turn off DR bit
318 li r0,1 // get a 1 to slam into SF
319 stvx v6,r6,r9
320 stvx v7,r7,r9
321 rldimi r12,r0,63,MSR_SF_BIT // set SF bit (bit 0)
322 li r8,-128 // offset so we can reach back one line
323 mtmsrd r12 // now we've saved VRs, turn DR off and SF on
324 isync // wait for it to happen
325 dcbt128 0,r3,1 // start a forward stream
326 b pmap_64_copy_loop
327
328 .align 5 // align inner loops
329pmap_64_copy_loop: // loop over 128-byte chunks
330 dcbz128 0,r4 // avoid read of destination line
331 lvx v0,0,r3 // offset 0
332 lvx v1,r5,r3 // offset 16
333 lvx v2,r6,r3 // offset 32
334 lvx v3,r7,r3 // offset 48
335 addi r3,r3,64 // don't have enough GPRs so add 64 2x
336 lvx v4,0,r3 // offset 64
337 lvx v5,r5,r3 // offset 80
338 lvx v6,r6,r3 // offset 96
339 lvx v7,r7,r3 // offset 112
340 addi r3,r3,64
341 stvx v0,0,r4 // offset 0
342 stvx v1,r5,r4 // offset 16
343 stvx v2,r6,r4 // offset 32
344 stvx v3,r7,r4 // offset 48
345 addi r4,r4,64
346 stvx v4,0,r4 // offset 64
347 stvx v5,r5,r4 // offset 80
348 stvx v6,r6,r4 // offset 96
349 stvx v7,r7,r4 // offset 112
350 addi r4,r4,64
351 dcbf r8,r4 // flush the line we just wrote
352 bdnz pmap_64_copy_loop
353
354 sync // wait for stores to take
355 subi r4,r4,PPC_PGBYTES // restore ptr to destination page
356 li r8,PPC_PGBYTES-128 // point to last line in page
357pmap_64_icache_flush:
358 subic. r9,r8,128 // more to go?
359 icbi r4,r8 // flush from icache
360 subi r8,r9,128 // get offset to next line
361 icbi r4,r9
362 bne pmap_64_icache_flush
363
364 sync
365 mtmsrd r2 // turn DR back on, SF off
366 isync
367 la r9,FM_SIZE+16(r1) // get base address of VR save area on stack
368 lvx v0,0,r9 // restore the VRs
369 lvx v1,r5,r9
370 lvx v2,r6,r9
371 lvx v3,r7,r9
372 addi r9,r9,64
373 lvx v4,0,r9
374 lvx v5,r5,r9
375 lvx v6,r6,r9
376 lvx v7,r7,r9
377
378 b pmap_g4_restore // restore lower half of MSR and return
379
380 //
381 // Copy on 64-bit without VMX
382 //
383
384pmap_novmx_copy:
385 li r0,PPC_PGBYTES/128 // we loop over 128-byte chunks
386 mtctr r0
387 li r0,MASK(MSR_DR) // get DR bit
388 andc r12,r2,r0 // turn off DR bit
389 li r0,1 // get a 1 to slam into SF
390 rldimi r12,r0,63,MSR_SF_BIT // set SF bit (bit 0)
391 mtmsrd r12 // turn DR off and SF on (no VRs to save in this path)
392 isync // wait for it to happen
393 dcbt128 0,r3,1 // start a forward stream
394
395pmap_novmx_copy_loop: // loop over 128-byte cache lines
396 dcbz128 0,r4 // avoid read of dest line
397
398 ld r0,0(r3) // Load half a line
399 ld r12,8(r3)
400 ld r5,16(r3)
401 ld r6,24(r3)
402 ld r7,32(r3)
403 ld r8,40(r3)
404 ld r9,48(r3)
405 ld r10,56(r3)
406
407 std r0,0(r4) // Store half a line
408 std r12,8(r4)
409 std r5,16(r4)
410 std r6,24(r4)
411 std r7,32(r4)
412 std r8,40(r4)
413 std r9,48(r4)
414 std r10,56(r4)
415
416 ld r0,64(r3) // Load half a line
417 ld r12,72(r3)
418 ld r5,80(r3)
419 ld r6,88(r3)
420 ld r7,96(r3)
421 ld r8,104(r3)
422 ld r9,112(r3)
423 ld r10,120(r3)
424
425 addi r3,r3,128
426
427 std r0,64(r4) // Store half a line
428 std r12,72(r4)
429 std r5,80(r4)
430 std r6,88(r4)
431 std r7,96(r4)
432 std r8,104(r4)
433 std r9,112(r4)
434 std r10,120(r4)
435
436 dcbf 0,r4 // flush the line we just wrote
437 addi r4,r4,128
438 bdnz pmap_novmx_copy_loop
439
440 sync // wait for stores to take
441 subi r4,r4,PPC_PGBYTES // restore ptr to destination page
442 li r8,PPC_PGBYTES-128 // point to last line in page
443
444pmap_novmx_icache_flush:
445 subic. r9,r8,128 // more to go?
446 icbi r4,r8 // flush from icache
447 subi r8,r9,128 // get offset to next line
448 icbi r4,r9
449 bne pmap_novmx_icache_flush
450
451 sync
452 mtmsrd r2 // turn DR back on, SF off
453 isync
454
455 b pmap_g4_restore // restore lower half of MSR and return
456
457
458
459//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
460
461// Stack frame format used by copyin, copyout, copyinstr and copyoutstr.
462// These routines all run both on 32 and 64-bit machines, though because they are called
463// by the BSD kernel they are always in 32-bit mode when entered. The mapped ptr returned
464// by MapUserMemoryWindow will be 64 bits however on 64-bit machines. Beware to avoid
465// using compare instructions on this ptr. This mapped ptr is kept globally in r31, so there
466// is no need to store or load it, which are mode-dependent operations since it could be
467// 32 or 64 bits.
468
469#define kkFrameSize (FM_SIZE+32)
470
471#define kkBufSize (FM_SIZE+0)
472#define kkCR3 (FM_SIZE+4)
473#define kkSource (FM_SIZE+8)
474#define kkDest (FM_SIZE+12)
475#define kkCountPtr (FM_SIZE+16)
476#define kkR31Save (FM_SIZE+20)
477#define kkThrErrJmp (FM_SIZE+24)
478
479
480// nonvolatile CR bits we use as flags in cr3
481
482#define kk64bit 12
483#define kkNull 13
484#define kkIn 14
485#define kkString 15
486#define kkZero 15
487
488
489//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
490/*
491 * int
492 * copyoutstr(src, dst, maxcount, count)
493 * vm_offset_t src; // r3
494 * addr64_t dst; // r4 and r5
495 * vm_size_t maxcount; // r6
496 * vm_size_t* count; // r7
497 *
498 * Set *count to the number of bytes copied.
499 */
500
501ENTRY(copyoutstr, TAG_NO_FRAME_USED)
502 mfcr r2,0x10 // save caller's cr3, which we use for flags
503 mr r10,r4 // move high word of 64-bit user address to r10
504 li r0,0 // r0 <- 0, used to preset *count below
505 crset kkString // flag as a string op
506 mr r11,r5 // move low word of 64-bit user address to r11
507 stw r0,0(r7) // initialize #bytes moved
508 crclr kkIn // flag as copyout
509 b copyJoin // join common copy setup code
510
511
512//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
513/*
514 * int
515 * copyinstr(src, dst, maxcount, count)
516 * addr64_t src; // r3 and r4
517 * vm_offset_t dst; // r5
518 * vm_size_t maxcount; // r6
519 * vm_size_t* count; // r7
520 *
521 * Set *count to the number of bytes copied
522 * If dst == NULL, don't copy, just count bytes.
523 * Only currently called from klcopyinstr.
524 */
525
526ENTRY(copyinstr, TAG_NO_FRAME_USED)
527 mfcr r2,0x10 // save caller's cr3, which we use for flags
528 cmplwi r5,0 // dst==NULL?
529 mr r10,r3 // move high word of 64-bit user address to r10
530 li r0,0 // r0 <- 0, used to preset *count below
531 crset kkString // flag as a string op
532 mr r11,r4 // move low word of 64-bit user address to r11
533 crmove kkNull,cr0_eq // remember if (dst==NULL)
534 stw r0,0(r7) // initialize #bytes moved
535 crset kkIn // flag as copyin (rather than copyout)
536 b copyJoin1 // skip over the "crclr kkNull"
537
538
539//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
540/*
541 * int
542 * copyout(src, dst, count)
543 * vm_offset_t src; // r3
544 * addr64_t dst; // r4 and r5
545 * size_t count; // r6
546 */
547
548 .align 5
549 .globl EXT(copyout)
550 .globl EXT(copyoutmsg)
551
552LEXT(copyout)
553LEXT(copyoutmsg)
554
555#if INSTRUMENT
556 mfspr r12,pmc1 ; INSTRUMENT - saveinstr[12] - Take stamp at copyout
557 stw r12,0x6100+(12*16)+0x0(0) ; INSTRUMENT - Save it
558 mfspr r12,pmc2 ; INSTRUMENT - Get stamp
559 stw r12,0x6100+(12*16)+0x4(0) ; INSTRUMENT - Save it
560 mfspr r12,pmc3 ; INSTRUMENT - Get stamp
561 stw r12,0x6100+(12*16)+0x8(0) ; INSTRUMENT - Save it
562 mfspr r12,pmc4 ; INSTRUMENT - Get stamp
563 stw r12,0x6100+(12*16)+0xC(0) ; INSTRUMENT - Save it
564#endif
565 mfcr r2,0x10 // save caller's cr3, which we use for flags
566 mr r10,r4 // move high word of 64-bit user address to r10
567 crclr kkString // not a string version
568 mr r11,r5 // move low word of 64-bit user address to r11
569 crclr kkIn // flag as copyout
570 b copyJoin // join common copy setup code
571
572
573//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
574/*
575 * int
576 * copyin(src, dst, count)
577 * addr64_t src; // r3 and r4
578 * vm_offset_t dst; // r5
579 * size_t count; // r6
580 */
581
582
583 .align 5
584 .globl EXT(copyin)
585 .globl EXT(copyinmsg)
586
587LEXT(copyin)
588LEXT(copyinmsg)
589
590 mfcr r2,0x10 // save caller's cr3, which we use for flags
591 mr r10,r3 // move high word of 64-bit user address to r10
592 crclr kkString // not a string version
593 mr r11,r4 // move low word of 64-bit user address to r11
594 crset kkIn // flag as copyin (falls through into copyJoin)
595
596
597// Common code to handle setup for all the copy variants:
598// r2 = caller's cr3
599// r3 = source if copyout
600// r5 = dest if copyin
601// r6 = buffer length or count
602// r7 = count output ptr (if kkString set)
603// r10 = high word of 64-bit user-space address (source if copyin, dest if copyout)
604// r11 = low word of 64-bit user-space address
605// cr3 = kkIn, kkString, kkNull flags
606
607copyJoin:
608 crclr kkNull // (dst==NULL) convention not used with this call
609copyJoin1: // enter from copyinstr, which has already set kkNull appropriately
610 mflr r0 // get return address
611 cmplwi r6,0 // buffer length 0?
612 lis r9,0x1000 // r9 <- 0x10000000 (256MB)
613 stw r0,FM_LR_SAVE(r1) // save return
614 cmplw cr1,r6,r9 // buffer length > 256MB ?
615 mfsprg r8,2 // get the features
616 beq-- copyinout_0 // 0 length is degenerate case
617 stwu r1,-kkFrameSize(r1) // set up stack frame
618 stw r2,kkCR3(r1) // save caller's cr3, which we use for flags
619 mtcrf 0x02,r8 // move pf64Bit to cr6
620 stw r3,kkSource(r1) // save args across MapUserMemoryWindow
621 stw r5,kkDest(r1)
622 stw r6,kkBufSize(r1)
623 crmove kk64bit,pf64Bitb // remember if this is a 64-bit processor
624 stw r7,kkCountPtr(r1)
625 stw r31,kkR31Save(r1) // we use r31 globally for mapped user ptr
626 li r31,0 // no mapped ptr yet
627
628
629// Handle buffer length > 256MB. This is an error (ENAMETOOLONG) on copyin and copyout.
630// The string ops are passed -1 lengths by some BSD callers, so for them we silently clamp
631// the buffer length to 256MB. This isn't an issue if the string is less than 256MB
632// (as most are!), but if they are >256MB we eventually return ENAMETOOLONG. This restriction
633// is due to MapUserMemoryWindow; we don't want to consume more than two segments for
634// the mapping.
635
636 ble++ cr1,copyin0 // skip if buffer length <= 256MB
637 bf kkString,copyinout_too_big // error if not string op
638 mr r6,r9 // silently clamp buffer length to 256MB
639 stw r9,kkBufSize(r1) // update saved copy too
640
641
642// Set up thread_recover in case we hit an illegal address.
643
644copyin0:
645 mfsprg r8,1 // Get the current thread
646 lis r2,hi16(copyinout_error)
647 ori r2,r2,lo16(copyinout_error)
648 lwz r4,THREAD_RECOVER(r8)
649 lwz r3,ACT_VMMAP(r8) // r3 <- vm_map virtual address
650 stw r2,THREAD_RECOVER(r8) // install our recovery handler
651 stw r4,kkThrErrJmp(r1) // save previous recovery handler for restore at exit
652
653
654// Map user segment into kernel map, turn on 64-bit mode. At this point:
655// r3 = vm map
656// r6 = buffer length
657// r10/r11 = 64-bit user-space ptr (source if copyin, dest if copyout)
658//
659// When we call MapUserMemoryWindow, we pass:
660// r3 = vm map ptr
661// r4/r5 = 64-bit user space address as an addr64_t
662
663 mr r4,r10 // copy user ptr into r4/r5
664 mr r5,r11
665#if INSTRUMENT
666 mfspr r12,pmc1 ; INSTRUMENT - saveinstr[13] - Take stamp before mapuseraddressspace
667 stw r12,0x6100+(13*16)+0x0(0) ; INSTRUMENT - Save it
668 mfspr r12,pmc2 ; INSTRUMENT - Get stamp
669 stw r12,0x6100+(13*16)+0x4(0) ; INSTRUMENT - Save it
670 mfspr r12,pmc3 ; INSTRUMENT - Get stamp
671 stw r12,0x6100+(13*16)+0x8(0) ; INSTRUMENT - Save it
672 mfspr r12,pmc4 ; INSTRUMENT - Get stamp
673 stw r12,0x6100+(13*16)+0xC(0) ; INSTRUMENT - Save it
674#endif
675 bl EXT(MapUserMemoryWindow) // get r3/r4 <- 64-bit address in kernel map of user operand
676#if INSTRUMENT
677 mfspr r12,pmc1 ; INSTRUMENT - saveinstr[14] - Take stamp after mapuseraddressspace
678 stw r12,0x6100+(14*16)+0x0(0) ; INSTRUMENT - Save it
679 mfspr r12,pmc2 ; INSTRUMENT - Get stamp
680 stw r12,0x6100+(14*16)+0x4(0) ; INSTRUMENT - Save it
681 mfspr r12,pmc3 ; INSTRUMENT - Get stamp
682 stw r12,0x6100+(14*16)+0x8(0) ; INSTRUMENT - Save it
683 mfspr r12,pmc4 ; INSTRUMENT - Get stamp
684 stw r12,0x6100+(14*16)+0xC(0) ; INSTRUMENT - Save it
685#endif
686 mr r31,r4 // r31 <- mapped ptr into user space (may be 64-bit)
687 bf-- kk64bit,copyin1 // skip if a 32-bit processor
688
689 rldimi r31,r3,32,0 // slam high-order bits into mapped ptr
690 mfmsr r4 // if 64-bit, turn on SF so we can use returned ptr
691 li r0,1 // r0 <- 1, to set SF
692 rldimi r4,r0,63,MSR_SF_BIT // light bit 0
693 mtmsrd r4 // turn on 64-bit mode
694 isync // wait for mode to change
695
696
697// Load r3-r5, substituting mapped ptr as appropriate.
698
699copyin1:
700 lwz r5,kkBufSize(r1) // restore length to copy
701 bf kkIn,copyin2 // skip if copyout
702 lwz r4,kkDest(r1) // copyin: dest is kernel ptr
703 mr r3,r31 // source is mapped ptr
704 b copyin3
705copyin2: // handle copyout
706 lwz r3,kkSource(r1) // source is kernel buffer (r3 at entry)
707 mr r4,r31 // dest is mapped ptr into user space
708
709
710// Finally, all set up to copy:
711// r3 = source ptr (mapped if copyin)
712// r4 = dest ptr (mapped if copyout)
713// r5 = length
714// r31 = mapped ptr returned by MapUserMemoryWindow
715// cr3 = kkIn, kkString, kk64bit, and kkNull flags
716
717copyin3:
718 bt kkString,copyString // handle copyinstr and copyoutstr
719 bl EXT(bcopy) // copyin and copyout: let bcopy do the work
720 li r3,0 // return success
721
722
723// Main exit point for copyin, copyout, copyinstr, and copyoutstr. Also reached
724// from error recovery if we get a DSI accessing user space. Clear recovery ptr,
725// and pop off frame.
726// r3 = 0, EFAULT, or ENAMETOOLONG
727
728copyinx:
729 lwz r2,kkCR3(r1) // get caller's cr3
730 mfsprg r6,1 // Get the current thread
731 bf-- kk64bit,copyinx1 // skip if 32-bit processor
732 mfmsr r12
733 rldicl r12,r12,0,MSR_SF_BIT+1 // if 64-bit processor, turn 64-bit mode off
734 mtmsrd r12 // turn SF off
735 isync // wait for the mode to change
736copyinx1:
737 lwz r0,FM_LR_SAVE+kkFrameSize(r1) // get return address
738 lwz r31,kkR31Save(r1) // restore caller's r31
739 lwz r4,kkThrErrJmp(r1) // load saved thread recover
740 addi r1,r1,kkFrameSize // pop off our stack frame
741 mtlr r0
742 stw r4,THREAD_RECOVER(r6) // restore thread recover
743 mtcrf 0x10,r2 // restore cr3
744 blr
745
746
747/* We get here via the exception handler if an illegal
748 * user memory reference was made. This error handler is used by
749 * copyin, copyout, copyinstr, and copyoutstr. Registers are as
750 * they were at point of fault, so for example cr3 flags are valid.
751 */
752
753copyinout_error:
754 li r3,EFAULT // return error
755 b copyinx
756
757copyinout_0: // degenerate case: 0-length copy
758 mtcrf 0x10,r2 // restore cr3
759 li r3,0 // return success
760 blr
761
762copyinout_too_big: // degenerate case: non-string copy > 256MB
763 mtcrf 0x10,r2 // restore cr3
764 lwz r1,0(r1) // pop off stack frame
765 li r3,ENAMETOOLONG
766 blr
767
768
769//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
770// Handle copyinstr and copyoutstr. At this point the stack frame is set up,
771// the recovery ptr is set, the user's buffer is mapped, we're in 64-bit mode
772// if necessary, and:
773// r3 = source ptr, mapped if copyinstr
774// r4 = dest ptr, mapped if copyoutstr
775// r5 = buffer length
776// r31 = mapped ptr returned by MapUserMemoryWindow
777// cr3 = kkIn, kkString, kkNull, and kk64bit flags
778// We do word copies unless the buffer is very short, then use a byte copy loop
779// for the leftovers if necessary. The crossover at which the word loop becomes
780// faster is about seven bytes, counting the zero.
781//
782// We first must word-align the source ptr, in order to avoid taking a spurious
783// page fault.
784
785copyString:
786 cmplwi cr1,r5,15 // is buffer very short?
787 mr r12,r3 // remember ptr to 1st source byte
788 mtctr r5 // assuming short, set up loop count for bytes
789 blt-- cr1,copyinstr8 // too short for word loop
790 rlwinm r2,r3,0,0x3 // get byte offset of 1st byte within word
791 rlwinm r9,r3,3,0x18 // get bit offset of 1st byte within word
792 li r7,-1 // r7 <- all-ones, for building the first-word mask
793 sub r3,r3,r2 // word-align source address
794 add r6,r5,r2 // get length starting at byte 0 in word
795 srw r7,r7,r9 // get mask for bytes in first word
796 srwi r0,r6,2 // get #words in buffer
797 lwz r5,0(r3) // get aligned word with first source byte
798 lis r10,hi16(0xFEFEFEFF) // load magic constants into r10 and r11
799 lis r11,hi16(0x80808080)
800 mtctr r0 // set up word loop count
801 addi r3,r3,4 // advance past the source word
802 ori r10,r10,lo16(0xFEFEFEFF)
803 ori r11,r11,lo16(0x80808080)
804 orc r8,r5,r7 // map bytes preceding first source byte into 0xFF
805 bt-- kkNull,copyinstr5enter // enter loop that just counts
806
807// Special case 1st word, which has been 0xFF filled on left. Note that we use
808// "and.", even though we execute both in 32 and 64-bit mode. This is OK.
809
810 slw r5,r5,r9 // left justify payload bytes
811 add r9,r10,r8 // r9 = data + 0xFEFEFEFF
812 andc r7,r11,r8 // r7 = ~data & 0x80808080
813 subfic r0,r2,4 // get r0 <- #payload bytes in 1st word
814 and. r7,r9,r7 // if r7==0, then all bytes in r8 are nonzero
815 stw r5,0(r4) // copy payload bytes to dest buffer
816 add r4,r4,r0 // then point to next byte in dest buffer
817 bdnzt cr0_eq,copyinstr6 // use loop that copies if 0 not found
818
819 b copyinstr7 // 0 found (buffer can't be full)
820
821
822// Word loop(s). They do a word-parallel search for 0s, using the following
823// non-obvious but very efficient test:
824// y = data + 0xFEFEFEFF
825// z = ~data & 0x80808080
826// If (y & z)==0, then all bytes in dataword are nonzero. There are two copies
827// of this loop, one that just counts and another that copies.
828// r3 = ptr to next word of source (word aligned)
829// r4 = ptr to next byte in buffer
830// r6 = original buffer length (adjusted to be word origin)
831// r10 = 0xFEFEFEFF
832// r11 = 0x80808080
833// r12 = ptr to 1st source byte (used to determine string length)
834
835 .align 5 // align inner loops for speed
836copyinstr5: // version that counts but does not copy
837 lwz r8,0(r3) // get next word of source
838 addi r3,r3,4 // advance past it
839copyinstr5enter:
840 add r9,r10,r8 // r9 = data + 0xFEFEFEFF
841 andc r7,r11,r8 // r7 = ~data & 0x80808080
842 and. r7,r9,r7 // r7 = r9 & r7 ("." ok even in 64-bit mode)
843 bdnzt cr0_eq,copyinstr5 // if r7==0, then all bytes in r8 are nonzero
844
845 b copyinstr7
846
847 .align 5 // align inner loops for speed
848copyinstr6: // version that counts and copies
849 lwz r8,0(r3) // get next word of source
850 addi r3,r3,4 // advance past it
851 addi r4,r4,4 // increment dest ptr while we wait for data
852 add r9,r10,r8 // r9 = data + 0xFEFEFEFF
853 andc r7,r11,r8 // r7 = ~data & 0x80808080
854 and. r7,r9,r7 // r7 = r9 & r7 ("." ok even in 64-bit mode)
855 stw r8,-4(r4) // pack all 4 bytes into buffer
856 bdnzt cr0_eq,copyinstr6 // if r7==0, then all bytes are nonzero
857
858
859// Either 0 found or buffer filled. The above algorithm has mapped nonzero bytes to 0
860// and 0 bytes to 0x80 with one exception: 0x01 bytes preceding the first 0 are also
861// mapped to 0x80. We must mask out these false hits before searching for a 0x80 byte.
862// r3 = word aligned ptr to next word of source (ie, r8==mem(r3-4))
863// r6 = original buffer length (adjusted to be word origin)
864// r7 = computed vector of 0x00 and 0x80 bytes
865// r8 = original source word, coming from -4(r3), possibly padded with 0xFFs on left if 1st word
866// r12 = ptr to 1st source byte (used to determine string length)
867// cr0 = beq set iff 0 not found
868
869copyinstr7:
870 rlwinm r2,r8,7,0,31 // move 0x01 bits to 0x80 position
871 rlwinm r6,r6,0,0x3 // mask down to partial byte count in last word
872 andc r7,r7,r2 // turn off false hits from 0x0100 worst case
873 crnot kkZero,cr0_eq // 0 found iff cr0_eq is off
874 srwi r7,r7,8 // we want to count the 0 as a byte xferred
875 cmpwi r6,0 // any bytes left over in last word?
876 cntlzw r7,r7 // now we can find the 0 byte (ie, the 0x80)
877 subi r3,r3,4 // back up r3 to point to 1st byte in r8
878 srwi r7,r7,3 // convert 8,16,24,32 to 1,2,3,4
879 add r3,r3,r7 // now r3 points one past 0 byte, or at 1st byte not xferred
880 bt++ kkZero,copyinstr10 // 0 found, so done
881
882 beq copyinstr10 // r6==0, so buffer truly full
883 mtctr r6 // 0 not found, loop over r6 bytes
884 b copyinstr8 // enter byte loop for last 1-3 leftover bytes
885
886
887// Byte loop. This is used for very small buffers and for the odd bytes left over
888// after searching and copying words at a time.
889// r3 = ptr to next byte of source
890// r4 = ptr to next dest byte
891// r12 = ptr to first byte of source
892// ctr = count of bytes to check
893
894 .align 5 // align inner loops for speed
895copyinstr8: // loop over bytes of source
896 lbz r0,0(r3) // get next byte of source
897 addi r3,r3,1
898 addi r4,r4,1 // increment dest addr whether we store or not
899 cmpwi r0,0 // is this the terminating 0?
900 bt-- kkNull,copyinstr9 // don't store if copyinstr with NULL ptr
901 stb r0,-1(r4)
902copyinstr9:
903 bdnzf cr0_eq,copyinstr8 // loop if byte not 0 and more room in buffer
904
905 crmove kkZero,cr0_eq // remember if 0 found or buffer filled
906
907
908// Buffer filled or 0 found. Unwind and return.
909// r3 = ptr to 1st source byte not transferred
910// r12 = ptr to 1st source byte
911// r31 = mapped ptr returned by MapUserMemoryWindow
912// cr3 = kkZero set iff 0 found
913
914copyinstr10:
915 lwz r9,kkCountPtr(r1) // get ptr to place to store count of bytes moved
916 sub r2,r3,r12 // compute #bytes copied (including the 0)
917 li r3,0 // assume success return status
918 stw r2,0(r9) // store #bytes moved
919 bt++ kkZero,copyinx // we did find the 0 so return success (0)
920 li r3,ENAMETOOLONG // buffer filled before a 0 was found
921 b copyinx // join main exit routine
922
923//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
924/*
925 * int
926 * copypv(source, sink, size, which)
927 * addr64_t src; // r3 and r4
928 * addr64_t dst; // r5 and r6
929 * size_t size; // r7
930 * int which; // r8
931 *
932 * Operand size bytes are copied from operand src into operand dst. The source and
933 * destination operand addresses are given as addr64_t, and may designate starting
934 * locations in physical or virtual memory in any combination except where both are
935 * virtual. Virtual memory locations may be in either the kernel or the current thread's
936 * address space. Operand size may be up to 256MB.
937 *
938 * Operation is controlled by operand which, which offers these options:
939 * cppvPsrc : source operand is (1) physical or (0) virtual
940 * cppvPsnk : destination operand is (1) physical or (0) virtual
941 * cppvKmap : virtual operand is in (1) kernel or (0) current thread
942 * cppvFsnk : (1) flush destination before and after transfer
943 * cppvFsrc : (1) flush source before and after transfer
 * cppvNoModSnk : (1) don't set destination (sink) operand's changed bit(s)
 * cppvNoRefSrc : (1) don't set source operand's referenced bit(s)
946 *
947 * Implementation is now split into this new 64-bit path and the old path, hw_copypv_32().
948 * This section describes the operation of the new 64-bit path.
949 *
950 * The 64-bit path utilizes the more capacious 64-bit kernel address space to create a
951 * window in the kernel address space into all of physical RAM plus the I/O hole. Since
952 * the window's mappings specify the proper access policies for the underlying memory,
953 * the new path does not have to flush caches to avoid a cache paradox, so cppvFsnk
 * and cppvFsrc are ignored. Physical operand addresses are relocated into the physical
955 * memory window, and are accessed with data relocation on. Virtual addresses are either
956 * within the kernel, or are mapped into the kernel address space through the user memory
957 * window. Because accesses to a virtual operand are performed with data relocation on,
958 * the new path does not have to translate the address, disable/enable interrupts, lock
959 * the mapping, or update referenced and changed bits.
960 *
961 * The IBM 970 (a.k.a. G5) processor treats real-mode accesses as guarded, so there is
962 * a substantial performance penalty for copypv operating in real mode. Utilizing the
963 * new 64-bit path, transfer performance increases >100% on the G5.
964 *
965 * The attentive reader may notice that mtmsrd ops are not followed by isync ops as
966 * might be expected. The 970 follows PowerPC architecture version 2.01, which defines
967 * mtmsrd with L=0 as a context synchronizing op, so a following isync is no longer
968 * required.
969 *
970 * To keep things exciting, we develop 64-bit values in non-volatiles, but we also need
971 * to call 32-bit functions, which would lead to the high-order 32 bits of our values
972 * getting clobbered unless we do something special. So, we preserve our 64-bit non-volatiles
973 * in our own stack frame across calls to 32-bit functions.
974 *
975 */
976
// Map operand which bits into non-volatile CR2 and CR3 bits.
// 'which' is rotated left by whichAlign (see rlwinm at copypv_64) so the cppv*
// flag bits land in the CR2/CR3 field positions, then installed with mtcrf 0x20/0x10.
#define whichAlign ((3+1)*4)			// rotate distance aligning cppv bits with cr2/cr3
#define whichMask 0x007F0000			// mask of the 'which' bits after rotation
#define pvPsnk (cppvPsnkb - whichAlign)		// destination (sink) operand is physical
#define pvPsrc (cppvPsrcb - whichAlign)		// source operand is physical
#define pvFsnk (cppvFsnkb - whichAlign)		// flush destination (not acted on in 64-bit path)
#define pvFsrc (cppvFsrcb - whichAlign)		// flush source (not acted on in 64-bit path)
#define pvNoModSnk (cppvNoModSnkb - whichAlign)	// don't set destination's changed bit(s)
#define pvNoRefSrc (cppvNoRefSrcb - whichAlign)	// don't set source's referenced bit(s)
#define pvKmap (cppvKmapb - whichAlign)		// virtual operand is in the kernel map
#define pvNoCache cr2_lt			// scratch cr bit: (1) copy with bcopy_nc, not bcopy
988
	.align	5
	.globl	EXT(copypv)

LEXT(copypv)
	mfsprg	r10,2					// get feature flags
	mtcrf	0x02,r10				// we need to test pf64Bit
	bt++	pf64Bitb,copypv_64			// skip if 64-bit processor (only they take hint)

	b	EXT(hw_copypv_32)			// carry on with 32-bit copypv

// Push a 32-bit ABI-compliant stack frame and preserve all non-volatiles that we'll clobber.
copypv_64:
	mfsprg	r9,1					// get current thread
	stwu	r1,-(FM_ALIGN((31-26+11)*4)+FM_SIZE)(r1)
							// allocate stack frame and link it
	mflr	r0					// get return address
	mfcr	r10					// get cr2 and cr3
	lwz	r12,THREAD_RECOVER(r9)			// get error callback
	stw	r26,FM_ARG0+0x00(r1)			// save non-volatile r26
	stw	r27,FM_ARG0+0x04(r1)			// save non-volatile r27
	stw	r28,FM_ARG0+0x08(r1)			// save non-volatile r28
	stw	r29,FM_ARG0+0x0C(r1)			// save non-volatile r29
	stw	r30,FM_ARG0+0x10(r1)			// save non-volatile r30
	stw	r31,FM_ARG0+0x14(r1)			// save non-volatile r31
	stw	r12,FM_ARG0+0x20(r1)			// save error callback
	stw	r0,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_LR_SAVE)(r1)
							// save return address
	stw	r10,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_CR_SAVE)(r1)
							// save non-volatile cr2 and cr3

// Non-volatile register usage in this routine is:
// r26: saved msr image
// r27: current pmap_t / virtual source address
// r28: destination virtual address
// r29: source address
// r30: destination address
// r31: byte count to copy
// cr2/3: parameter 'which' bits

	rlwinm	r8,r8,whichAlign,whichMask		// align and mask which bits
	mr	r31,r7					// copy size to somewhere non-volatile
	mtcrf	0x20,r8					// insert which bits into cr2 and cr3
	mtcrf	0x10,r8					// insert which bits into cr2 and cr3
	rlwinm	r29,r3,0,1,0				// form source address high-order bits
	rlwinm	r30,r5,0,1,0				// form destination address high-order bits
	rlwimi	r29,r4,0,0,31				// form source address low-order bits
	rlwimi	r30,r6,0,0,31				// form destination address low-order bits
	crand	cr7_lt,pvPsnk,pvPsrc			// are both operand addresses physical?
	cntlzw	r0,r31					// count leading zeroes in byte count
	cror	cr7_eq,pvPsnk,pvPsrc			// cr7_eq <- source or destination is physical
	bf--	cr7_eq,copypv_einval			// both operands may not be virtual
	cmplwi	r0,4					// byte count greater than or equal 256M (2**28)?
	blt--	copypv_einval				// byte count too big, give EINVAL
	cmplwi	r31,0					// byte count zero?
	beq--	copypv_zero				// early out
	bt	cr7_lt,copypv_phys			// both operand addresses are physical
	mr	r28,r30					// assume destination is virtual
	bf	pvPsnk,copypv_dv			// is destination virtual?
	mr	r28,r29					// no, so source must be virtual
copypv_dv:
	lis	r27,ha16(EXT(kernel_pmap))		// get kernel's pmap_t *, high-order
	lwz	r27,lo16(EXT(kernel_pmap))(r27)		// get kernel's pmap_t
	bt	pvKmap,copypv_kern			// virtual address in kernel map?
	lwz	r3,ACT_VMMAP(r9)			// get user's vm_map *
	rldicl	r4,r28,32,32				// r4, r5 <- addr64_t virtual address
	rldicl	r5,r28,0,32
	std	r29,FM_ARG0+0x30(r1)			// preserve 64-bit r29 across 32-bit call
	std	r30,FM_ARG0+0x38(r1)			// preserve 64-bit r30 across 32-bit call
	bl	EXT(MapUserMemoryWindow)		// map slice of user space into kernel space
	ld	r29,FM_ARG0+0x30(r1)			// restore 64-bit r29
	ld	r30,FM_ARG0+0x38(r1)			// restore 64-bit r30
	rlwinm	r28,r3,0,1,0				// convert relocated addr64_t virtual address
	rlwimi	r28,r4,0,0,31				// into a single 64-bit scalar
copypv_kern:

// Since we'll be accessing the virtual operand with data-relocation on, we won't need to
// update the referenced and changed bits manually after the copy. So, force the appropriate
// flag bit on for the virtual operand.
	crorc	pvNoModSnk,pvNoModSnk,pvPsnk		// for virtual dest, let hardware do ref/chg bits
	crorc	pvNoRefSrc,pvNoRefSrc,pvPsrc		// for virtual source, let hardware do ref bit

// We'll be finding a mapping and looking at it, so we need to disable interrupts ('rupts).
	lis	r0,hi16(MASK(MSR_VEC))			// get vector mask
	ori	r0,r0,lo16(MASK(MSR_FP))		// insert fp mask
	mfmsr	r26					// save current msr
	andc	r26,r26,r0				// turn off VEC and FP in saved copy
	ori	r0,r0,lo16(MASK(MSR_EE))		// add EE to our mask
	andc	r0,r26,r0				// disable EE in our new msr image
	mtmsrd	r0					// introduce new msr image

// We're now holding the virtual operand's pmap_t in r27 and its virtual address in r28. We now
// try to find a mapping corresponding to this address in order to determine whether the address
// is cacheable. If we don't find a mapping, we can safely assume that the operand is cacheable
// (a non-cacheable operand must be a block mapping, which will always exist); otherwise, we
// examine the mapping's caching-inhibited bit.
	mr	r3,r27					// r3 <- pmap_t pmap
	rldicl	r4,r28,32,32				// r4, r5 <- addr64_t va
	rldicl	r5,r28,0,32
	la	r6,FM_ARG0+0x18(r1)			// r6 <- addr64_t *nextva
	li	r7,1					// r7 <- int full, search nested mappings
	std	r26,FM_ARG0+0x28(r1)			// preserve 64-bit r26 across 32-bit calls
	std	r28,FM_ARG0+0x30(r1)			// preserve 64-bit r28 across 32-bit calls
	std	r29,FM_ARG0+0x38(r1)			// preserve 64-bit r29 across 32-bit calls
	std	r30,FM_ARG0+0x40(r1)			// preserve 64-bit r30 across 32-bit calls
	bl	EXT(mapping_find)			// find mapping for virtual operand
	mr.	r3,r3					// did we find it?
	beq	copypv_nomapping			// nope, so we'll assume it's cacheable
	lwz	r4,mpVAddr+4(r3)			// get low half of virtual addr for hw flags
	rlwinm.	r4,r4,0,mpIb-32,mpIb-32			// caching-inhibited bit set?
	crnot	pvNoCache,cr0_eq			// if it is, use bcopy_nc
	bl	EXT(mapping_drop_busy)			// drop busy on the mapping
copypv_nomapping:
	ld	r26,FM_ARG0+0x28(r1)			// restore 64-bit r26
	ld	r28,FM_ARG0+0x30(r1)			// restore 64-bit r28
	ld	r29,FM_ARG0+0x38(r1)			// restore 64-bit r29
	ld	r30,FM_ARG0+0x40(r1)			// restore 64-bit r30
	mtmsrd	r26					// restore msr to its previous state

// Set both the source and destination virtual addresses to the virtual operand's address --
// we'll overlay one of them with the physical operand's address.
	mr	r27,r28					// make virtual operand BOTH source AND destination

// Now we're ready to relocate the physical operand address(es) into the physical memory window.
// Recall that we've mapped physical memory (including the I/O hole) into the kernel's address
// space somewhere at or over the 2**32 line. If one or both of the operands are in the I/O hole,
// we'll set the pvNoCache flag, forcing use of non-caching bcopy_nc() to do the copy.
copypv_phys:
	ld	r6,lgPMWvaddr(0)			// get physical memory window virtual address
	bf	pvPsnk,copypv_dstvirt			// is destination address virtual?
	cntlzd	r4,r30					// count leading zeros in destination address
	cmplwi	r4,32					// 32 leading zeros => addr in 2**31..2**32-1, the I/O hole
							// NOTE(review): original comment said 2**30..2**31-1; clz==32
							// actually implies bit 31 set -- confirm intended hole bounds
	cror	pvNoCache,cr0_eq,pvNoCache		// use bcopy_nc for I/O hole locations
	add	r28,r30,r6				// relocate physical destination into physical window
copypv_dstvirt:
	bf	pvPsrc,copypv_srcvirt			// is source address virtual?
	cntlzd	r4,r29					// count leading zeros in source address
	cmplwi	r4,32					// 32 leading zeros => addr in 2**31..2**32-1, the I/O hole
	cror	pvNoCache,cr0_eq,pvNoCache		// use bcopy_nc for I/O hole locations
	add	r27,r29,r6				// relocate physical source into physical window
copypv_srcvirt:

// Once the copy is under way (bcopy or bcopy_nc), we will want to get control if anything
// funny happens during the copy. So, we set a pointer to our error handler in the per-thread
// control block.
	mfsprg	r8,1					// get current threads stuff
	lis	r3,hi16(copypv_error)			// get our error callback's address, high
	ori	r3,r3,lo16(copypv_error)		// get our error callback's address, low
	stw	r3,THREAD_RECOVER(r8)			// set our error callback

// Since our physical operand(s) are relocated at or above the 2**32 line, we must enter
// 64-bit mode.
	li	r0,1					// get a handy one bit
	mfmsr	r3					// get current msr
	rldimi	r3,r0,63,MSR_SF_BIT			// set SF bit on in our msr copy
	mtmsrd	r3					// enter 64-bit mode

// If requested, flush data cache
// Note that we don't flush, the code is being saved "just in case".
#if 0
	bf	pvFsrc,copypv_nfs			// do we flush the source?
	rldicl	r3,r27,32,32				// r3, r4 <- addr64_t source virtual address
	rldicl	r4,r27,0,32
	mr	r5,r31					// r5 <- count (in bytes)
	li	r6,0					// r6 <- boolean phys (false, not physical)
	bl	EXT(flush_dcache)			// flush the source operand
copypv_nfs:
	bf	pvFsnk,copypv_nfdx			// do we flush the destination?
	rldicl	r3,r28,32,32				// r3, r4 <- addr64_t destination virtual address
	rldicl	r4,r28,0,32
	mr	r5,r31					// r5 <- count (in bytes)
	li	r6,0					// r6 <- boolean phys (false, not physical)
	bl	EXT(flush_dcache)			// flush the destination operand
copypv_nfdx:
#endif

// Call bcopy or bcopy_nc to perform the copy.
	mr	r3,r27					// r3 <- source virtual address
	mr	r4,r28					// r4 <- destination virtual address
	mr	r5,r31					// r5 <- bytes to copy
	bt	pvNoCache,copypv_nc			// take non-caching route
	bl	EXT(bcopy)				// call bcopy to do the copying
	b	copypv_copydone
copypv_nc:
	bl	EXT(bcopy_nc)				// call bcopy_nc to do the copying
copypv_copydone:

// If requested, flush data cache
// Note that we don't flush, the code is being saved "just in case".
#if 0
	bf	pvFsrc,copypv_nfsx			// do we flush the source?
	rldicl	r3,r27,32,32				// r3, r4 <- addr64_t source virtual address
	rldicl	r4,r27,0,32
	mr	r5,r31					// r5 <- count (in bytes)
	li	r6,0					// r6 <- boolean phys (false, not physical)
	bl	EXT(flush_dcache)			// flush the source operand
copypv_nfsx:
	bf	pvFsnk,copypv_nfd			// do we flush the destination?
	rldicl	r3,r28,32,32				// r3, r4 <- addr64_t destination virtual address
	rldicl	r4,r28,0,32
	mr	r5,r31					// r5 <- count (in bytes)
	li	r6,0					// r6 <- boolean phys (false, not physical)
	bl	EXT(flush_dcache)			// flush the destination operand
copypv_nfd:
#endif

// Leave 64-bit mode.
	mfmsr	r3					// get current msr
	rldicl	r3,r3,0,MSR_SF_BIT+1			// clear SF bit in our copy
	mtmsrd	r3					// leave 64-bit mode

// If requested, set ref/chg on source/dest physical operand(s). It is possible that copy is
// from/to a RAM disk situated outside of mapped physical RAM, so we check each page by calling
// mapping_phys_lookup() before we try to set its ref/chg bits; otherwise, we might panic.
// Note that this code is page-size sensitive, so it should probably be a part of our low-level
// code in hw_vm.s.
	bt	pvNoModSnk,copypv_nomod			// skip destination update if not requested
	std	r29,FM_ARG0+0x30(r1)			// preserve 64-bit r29 across 32-bit calls
	li	r26,1					// r26 <- 4K-page count
	mr	r27,r31					// r27 <- byte count
	rlwinm	r3,r30,0,20,31				// does destination cross a page boundary?
	subfic	r3,r3,4096				// r3 <- bytes remaining in destination's first 4K page
	cmplw	r3,r27					// does the whole copy fit in that first page?
	blt	copypv_modnox				// skip if not crossing case
	subf	r27,r3,r27				// r27 <- byte count less initial fragment
	addi	r26,r26,1				// increment page count
copypv_modnox:
	srdi	r3,r27,12				// pages to update (not including crosser)
	add	r26,r26,r3				// add in crosser
	srdi	r27,r30,12				// r27 <- destination page number
copypv_modloop:
	mr	r3,r27					// r3 <- destination page number
	la	r4,FM_ARG0+0x18(r1)			// r4 <- unsigned int *pindex
	bl	EXT(mapping_phys_lookup)		// see if page is really there
	mr.	r3,r3					// is it?
	beq--	copypv_modend				// nope, break out of modify loop
	mr	r3,r27					// r3 <- destination page number
	bl	EXT(mapping_set_mod)			// set page changed status
	subi	r26,r26,1				// decrement page count
							// NOTE(review): r27 is never advanced to the next page,
							// so a multi-page copy re-marks the same page -- verify
	cmpwi	r26,0					// done yet?
	bgt	copypv_modloop				// nope, iterate
copypv_modend:
	ld	r29,FM_ARG0+0x30(r1)			// restore 64-bit r29
copypv_nomod:
	bt	pvNoRefSrc,copypv_done			// skip source update if not requested
copypv_debugref:
	li	r26,1					// r26 <- 4K-page count
	mr	r27,r31					// r27 <- byte count
	rlwinm	r3,r29,0,20,31				// does source cross a page boundary?
	subfic	r3,r3,4096				// r3 <- bytes remaining in source's first 4K page
	cmplw	r3,r27					// does the whole copy fit in that first page?
	blt	copypv_refnox				// skip if not crossing case
	subf	r27,r3,r27				// r27 <- byte count less initial fragment
	addi	r26,r26,1				// increment page count
copypv_refnox:
	srdi	r3,r27,12				// pages to update (not including crosser)
	add	r26,r26,r3				// add in crosser
	srdi	r27,r29,12				// r27 <- source page number
copypv_refloop:
	mr	r3,r27					// r3 <- source page number
	la	r4,FM_ARG0+0x18(r1)			// r4 <- unsigned int *pindex
	bl	EXT(mapping_phys_lookup)		// see if page is really there
	mr.	r3,r3					// is it?
	beq--	copypv_done				// nope, break out of reference loop
	mr	r3,r27					// r3 <- source page number
	bl	EXT(mapping_set_ref)			// set page referenced status
	subi	r26,r26,1				// decrement page count
							// NOTE(review): r27 is never advanced here either -- see
							// the matching note in copypv_modloop above
	cmpwi	r26,0					// done yet?
	bgt	copypv_refloop				// nope, iterate

// Return, indicating success.
copypv_done:
copypv_zero:
	li	r3,0					// our efforts were crowned with success

// Pop frame, restore caller's non-volatiles, clear recovery routine pointer.
copypv_return:
	mfsprg	r9,1					// get current threads stuff
	lwz	r0,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_LR_SAVE)(r1)
							// get return address
	lwz	r4,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_CR_SAVE)(r1)
							// get non-volatile cr2 and cr3
	lwz	r26,FM_ARG0+0x00(r1)			// restore non-volatile r26
	lwz	r27,FM_ARG0+0x04(r1)			// restore non-volatile r27
	mtlr	r0					// restore return address
	lwz	r28,FM_ARG0+0x08(r1)			// restore non-volatile r28
	mtcrf	0x20,r4					// restore non-volatile cr2
	mtcrf	0x10,r4					// restore non-volatile cr3
	lwz	r11,FM_ARG0+0x20(r1)			// reload caller's saved error callback
	lwz	r29,FM_ARG0+0x0C(r1)			// restore non-volatile r29
	lwz	r30,FM_ARG0+0x10(r1)			// restore non-volatile r30
	lwz	r31,FM_ARG0+0x14(r1)			// restore non-volatile r31
	stw	r11,THREAD_RECOVER(r9)			// restore our error callback
	lwz	r1,0(r1)				// release stack frame

	blr						// y'all come back now

// Invalid argument handler.
copypv_einval:
	li	r3,EINVAL				// invalid argument
	b	copypv_return				// return

// Error encountered during bcopy or bcopy_nc.
copypv_error:
	mfmsr	r3					// get current msr
	rldicl	r3,r3,0,MSR_SF_BIT+1			// clear SF bit in our copy
	mtmsrd	r3					// leave 64-bit mode
	li	r3,EFAULT				// it was all his fault
	b	copypv_return				// return