/*
 * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
;
; Copy bytes of data around. Handles overlapped data.
;
;
#include <ppc/asm.h>
#include <ppc/proc_reg.h>
#include <assym.s>

; These routines use CR5 for certain flags:
;   Use CR5_lt to indicate non-cached (in bcopy and memcpy)
#define noncache	20

; The bcopy_phys variants use a stack frame so they can call bcopy as a subroutine.
#define BCOPY_SF_SIZE	32	// total size
#define BCOPY_SF_MSR	16	// we save caller's MSR here (possibly minus VEC and FP)


#define kShort		32	// short operands are special cased


; void bcopy_physvir_32(from, to, nbytes)
;
; Attempt to copy physically addressed memory with translation on if conditions are met.
; Otherwise do a normal bcopy_phys.  This routine is used because some 32-bit processors
; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
; for the passed phys addrs and do the copy with translation on.
;
; Rules are: - neither source nor destination can cross a page.
;            - Interrupts must be disabled when this routine is called.
;            - Translation must be on when called.
;
; To do the copy, we build a 128KB DBAT for both the source and sink.  If both are the same, only one
; is loaded.  We do not touch the IBATs, so there is no issue if either physical page
; address is the same as the virtual address of the instructions we are executing.
;
; At the end, we invalidate the used DBATs.
;
; Note that the address parameters are long longs.  We will transform these to 64-bit
; values.  Note that on 32-bit architectures this will ignore the high half of the
; passed-in value.  This should be OK, since we cannot have addresses bigger than 32 bits
; there anyhow.
;
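; A worked sketch of that transformation (illustrative values, not from any caller): with
; hi = 0x00000001 in r3 and lo = 0x2345F000 in r4, "rlwinm r3,r3,0,1,0" uses a wrap-around
; mask that, on a 64-bit implementation, leaves hi replicated in both halves of r3; the
; following "rlwimi r3,r4,0,0,31" then inserts lo into the low word, giving
; 0x00000001_2345F000.  On a 32-bit CPU the same pair simply leaves lo in r3, i.e. the
; high half is dropped.
;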
; Note also that this routine is used only on 32-bit machines. If you're contemplating use
; on a 64-bit processor, use the physical memory window instead; please refer to copypv()
; for an example of how this is done.

	.align	5
	.globl	EXT(bcopy_physvir_32)

LEXT(bcopy_physvir_32)
	mflr	r0			; get return address
	rlwinm	r3,r3,0,1,0		; Duplicate high half of long long paddr into top of reg
	mfsprg	r8,2			; get processor feature flags
	stw	r0,8(r1)		; save return address
	rlwimi	r3,r4,0,0,31		; Combine bottom of long long to full 64-bits
	stwu	r1,-BCOPY_SF_SIZE(r1)	; push on a stack frame so we can call bcopy
	mtcrf	0x02,r8			; move pf64Bit to cr6 so we can test
	subi	r0,r7,1			; get length - 1
	rlwinm	r4,r5,0,1,0		; Duplicate high half of long long paddr into top of reg
	add	r11,r3,r0		; Point to last byte of sink
	mr	r5,r7			; Get the length into the right register
	rlwimi	r4,r6,0,0,31		; Combine bottom of long long to full 64-bits

; This test for page overflow may not work if the length is negative.  Negative lengths are invalid input
; to bcopy_physvir() on 32-bit machines, and will result in a panic.
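; (A worked example of the check below, with assumed numbers: a sink paddr of 0x1FFA and a
; 16-byte length give a last-byte address of 0x2009; 0x1FFA XOR 0x2009 = 0x3FF3, and masking
; with "rlwinm. 0,0,19" keeps only the 4KB page-number bits, leaving 0x3000.  That nonzero
; result means the sink crosses a page, so we branch to bcopy_phys1.)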

	add	r12,r4,r0		; Point to last byte of source
	xor	r7,r11,r3		; See if we went to next page
	xor	r8,r12,r4		; See if we went to next page
	or	r0,r7,r8		; Combine wrap

//	li	r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2)	; Set default attributes
	li	r9,((2<<3)|2)		; Set default attributes
	rlwinm.	r0,r0,0,0,19		; Did we overflow a page?
	li	r7,2			; Set validity flags
	li	r8,2			; Set validity flags
	bne-	bcopy_phys1		; Overflowed page, do normal physical copy...

	rlwimi	r11,r9,0,15,31		; Set sink lower DBAT value
	rlwimi	r12,r9,0,15,31		; Set source lower DBAT value
	rlwimi	r7,r11,0,0,14		; Set sink upper DBAT value
	rlwimi	r8,r12,0,0,14		; Set source upper DBAT value
	cmplw	cr1,r11,r12		; See if sink and source are same block

	sync

	mtdbatl	0,r11			; Set sink lower DBAT
	mtdbatu	0,r7			; Set sink upper DBAT

	beq-	cr1,bcpvsame		; Source and sink are in same block

	mtdbatl	1,r12			; Set source lower DBAT
	mtdbatu	1,r8			; Set source upper DBAT

bcpvsame:
	sync				; wait for the BATs to stabilize
	isync

	bl	EXT(bcopy)		; BATs set up, args in r3-r5, so do the copy with DR on

	li	r0,0			; Get set to invalidate upper half of BATs
	sync				; Make sure all is well
	mtdbatu	0,r0			; Clear sink upper DBAT
	mtdbatu	1,r0			; Clear source upper DBAT
	sync
	isync

	lwz	r0,BCOPY_SF_SIZE+8(r1)	; get return address
	addi	r1,r1,BCOPY_SF_SIZE	; pop off stack frame
	mtlr	r0
	blr


; void bcopy_phys(from, to, nbytes)
;
; Turns off data translation before the copy.  This one will not work in user state.
; This routine is used on 32 and 64-bit machines.
;
; Note that the address parameters are long longs.  We will transform these to 64-bit
; values.  Note that on 32-bit architectures this will ignore the high half of the
; passed-in value.  This should be OK, since we cannot have addresses bigger than 32 bits
; there anyhow.
;
; Also note that you probably will not be happy if either the sink or source spans across the
; boundary between RAM and I/O space.  Good chance of hanging the machine and this code
; will not check, so be careful.
;
; NOTE: when called, translation must be on, and we must be in 32-bit mode.
;       Interrupts may or may not be disabled.

	.align	5
	.globl	EXT(bcopy_phys)

LEXT(bcopy_phys)
	mflr	r0			; get return address
	rlwinm	r3,r3,0,1,0		; Duplicate high half of long long paddr into top of reg
	stw	r0,8(r1)		; save
	mfsprg	r8,2			; get processor feature flags
	stwu	r1,-BCOPY_SF_SIZE(r1)	; push on a stack frame so we can call bcopy
	rlwimi	r3,r4,0,0,31		; Combine bottom of long long to full 64-bits
	rlwinm	r4,r5,0,1,0		; Duplicate high half of long long paddr into top of reg
	mtcrf	0x02,r8			; move pf64Bit to cr6 so we can test
	rlwimi	r4,r6,0,0,31		; Combine bottom of long long to full 64-bits
	mr	r5,r7			; Get the length into the right register

bcopy_phys1:				; enter from bcopy_physvir with pf64Bit in cr6 and parms in r3-r5
	mfmsr	r9			; Get the MSR
	lis	r6,hi16(MASK(MSR_VEC))	; Get vector enable
	ori	r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR))	; Add in FP and DR
	andc	r9,r9,r6		; unconditionally turn DR, VEC, and FP off
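; (Assuming the usual PowerPC MSR layout, the VEC bit lives in the upper halfword of the
; mask and the FP and DR bits in the lower halfword, which is why one lis/ori pair above is
; enough to build the combined mask that the andc clears.)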
	bt++	pf64Bitb,bcopy_phys64	; skip if 64-bit (only they take hint)

; 32-bit CPUs

	mtmsr	r9			; turn DR, FP, and VEC off
	isync				; Wait for it

	bl	EXT(bcopy)		; do the copy with translation off and caching on

	mfmsr	r9			; Get the MSR
	ori	r9,r9,lo16(MASK(MSR_DR))	; turn translation back on (but leave VEC and FP off)
	mtmsr	r9			; restore msr
	isync				; wait for it to happen
	lwz	r0,BCOPY_SF_SIZE+8(r1)	; get return address once translation is back on
	mtlr	r0
	addi	r1,r1,BCOPY_SF_SIZE	; pop off stack frame
	blr


; 64-bit: turn DR off and SF on.

bcopy_phys64:				; r9 = MSR with DR, VEC, and FP off
	ori	r8,r9,lo16(MASK(MSR_DR))	; make a copy with DR back on... this is what we return to caller
	srdi	r2,r3,31		; Get a 1 if source is in I/O memory
	li	r0,1			; Note - we use this in a couple places below
	srdi	r10,r4,31		; Get a 1 if sink is in I/O memory
	std	r8,BCOPY_SF_MSR(r1)	; save caller's MSR so we remember whether EE was on
	rldimi	r9,r0,63,MSR_SF_BIT	; set SF on in MSR we will copy with
	cmpldi	cr0,r2,1		; Is source in I/O memory?
	cmpldi	cr7,r10,1		; Is sink in I/O memory?
	mtmsrd	r9			; turn 64-bit addressing on, data translation off
	isync				; wait for it to happen
	cror	cr7_eq,cr0_eq,cr7_eq	; See if either source or sink is in I/O area
	beq--	cr7,io_space_real_mode_copy	; an operand is in I/O space

	bl	EXT(bcopy)		; do copy with DR off and SF on, cache enabled

bcopy_phys64x:
	mfmsr	r9			; Get the MSR we used to copy
	rldicl	r9,r9,0,MSR_SF_BIT+1	; clear SF
	ori	r9,r9,lo16(MASK(MSR_DR))	; turn translation back on
	mtmsrd	r9			; turn 64-bit mode off, translation back on
	isync				; wait for it to happen
	lwz	r0,BCOPY_SF_SIZE+8(r1)	; get return address once translation is back on
	ld	r8,BCOPY_SF_MSR(r1)	; get caller's MSR once translation is back on
	mtlr	r0
	mtmsrd	r8,1			; turn EE back on if necessary
	addi	r1,r1,BCOPY_SF_SIZE	; pop off stack frame
	blr

; We need to copy with DR off, but one of the operands is in I/O space.  To avoid wedging U3,
; which cannot handle a cache burst in I/O space, we must turn caching off for the real memory access.
; This can only be done by setting bits in HID4.  We cannot lose control and execute random code in
; this state, so we have to disable interrupts as well.  This is an unpleasant hack.

io_space_real_mode_copy:		; r0=1, r9=MSR we want to copy with
	sldi	r11,r0,31-MSR_EE_BIT	; Get a mask for the EE bit
	sldi	r0,r0,32+8		; Get the right bit to turn off caching
	andc	r9,r9,r11		; Turn off EE bit
	mfspr	r2,hid4			; Get HID4
	mtmsrd	r9,1			; Force off EE
	or	r2,r2,r0		; Set bit to make real accesses cache-inhibited
	sync				; Sync up
	mtspr	hid4,r2			; Make real accesses cache-inhibited
	isync				; Toss prefetches

	lis	r12,0xE000		; Get the unlikeliest ESID possible
	srdi	r12,r12,1		; Make 0x7FFFFFFFF0000000
	slbie	r12			; Make sure the ERAT is cleared

	sync
	isync

	bl	EXT(bcopy_nc)		; copy with SF on and EE, DR, VEC, and FP off, cache inhibited

	li	r0,1			; Get a 1
	sldi	r0,r0,32+8		; Get the right bit to turn off caching
	mfspr	r2,hid4			; Get HID4
	andc	r2,r2,r0		; Clear the bit that makes real accesses cache-inhibited
	sync				; Sync up
	mtspr	hid4,r2			; Make real accesses not cache-inhibited
	isync				; Toss prefetches

	lis	r12,0xE000		; Get the unlikeliest ESID possible
	srdi	r12,r12,1		; Make 0x7FFFFFFFF0000000
	slbie	r12			; Make sure the ERAT is cleared
	b	bcopy_phys64x


;
; shortcopy
;
; Special case short operands (<32 bytes), which are very common.  Note that the check for
; reverse vs normal moves isn't quite correct in 64-bit mode; in rare cases we will move in
; reverse when it wasn't necessary to do so.  This is OK, since performance of the two cases
; is similar.  We do get the direction right when it counts (ie, when the operands overlap.)
; Also note that we use the G3/G4 "backend" code, even on G5.  This is OK too, since G5 has
; plenty of load/store dispatch bandwidth in this case, the extra ops are hidden by latency,
; and using word instead of doubleword moves reduces the possibility of unaligned accesses,
; which cost about 20 cycles if they cross a 32-byte boundary on G5.  Finally, because we
; might do unaligned accesses this code cannot be called from bcopy_nc().
;   r4 = destination
;   r5 = length (<32)
;   r6 = source
;   r12 = (dest - source)
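;
; A worked example of the direction check (illustrative values): if dest = source+8 and
; length = 20, then r12 = 8, the unsigned compare sees 8 < 20, and we take the reverse
; path, since a forward copy would overwrite source bytes it has not yet read.  When
; dest < source, r12 wraps to a huge unsigned value that always compares >= length, so
; non-overlapping and backward-overlapping moves stay on the forward path.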

	.align	5
shortcopy:
	cmplw	r12,r5			; must move reverse if (dest-source)<length
	mtcrf	2,r5			; move length to cr6 and cr7 one at a time...
	mtcrf	1,r5			; ...which is faster on G4 and G5
	bge++	backend			; handle forward moves (most common case)
	add	r6,r6,r5		; point one past end of operands in reverse moves
	add	r4,r4,r5
	b	bbackend		; handle reverse moves

;
; void bcopy(from, to, nbytes)
;
; NOTE: bcopy is called from copyin and copyout etc with the "thread_recover" ptr set.
; This means bcopy must not set up a stack frame or touch non-volatile registers, and also means that it
; cannot rely on turning off interrupts, because we expect to get DSIs and have execution aborted by a "longjmp"
; to the thread_recover routine.  What this means is that it would be hard to use vector or floating point
; registers to accelerate the copy.
;
; NOTE: this code can be called in any of three "modes":
;   - on 32-bit processors (32-byte cache line)
;   - on 64-bit processors running in 32-bit mode (128-byte cache line)
;   - on 64-bit processors running in 64-bit mode (128-byte cache line)

	.align	5
	.globl	EXT(bcopy)
	.globl	EXT(bcopy_nop_if_32bit)

LEXT(bcopy)
	cmplwi	cr1,r5,kShort		; less than 32 bytes?
	sub.	r12,r4,r3		; test for to==from in mode-independent way, start fwd/rev check
	mr	r6,r3			; Set source (must preserve r3 for memcpy return)
	blt	cr1,shortcopy		; special case short operands
	crclr	noncache		; Set cached
LEXT(bcopy_nop_if_32bit)
	bne++	copyit64		; handle 64-bit processor (patched to NOP if 32-bit processor)
	bne+	copyit32		; handle 32-bit processor
	blr				; to==from so nothing to do

;
; bcopy_nc(from, to, nbytes)
;
; bcopy_nc() operates on non-cached memory so we cannot use any kind of cache instructions.
; Furthermore, we must avoid all unaligned accesses on 64-bit machines, since they take
; alignment exceptions.  Thus we cannot use "shortcopy", which could do unaligned lwz/stw.
; Like bcopy(), bcopy_nc() can be called both in 32- and 64-bit mode.

	.align	5
	.globl	EXT(bcopy_nc)
	.globl	EXT(bcopy_nc_nop_if_32bit)

LEXT(bcopy_nc)
	cmpwi	cr1,r5,0		; Check if we have a 0 length
	sub.	r12,r4,r3		; test for to==from in mode-independent way, start fwd/rev check
	mr	r6,r3			; Set source (must preserve r3 for memcpy return)
	crset	noncache		; Set non-cached
	cror	cr0_eq,cr1_eq,cr0_eq	; set cr0 beq if either length zero or to==from
LEXT(bcopy_nc_nop_if_32bit)
	bne++	copyit64		; handle 64-bit processor (patched to NOP if 32-bit processor)
	bne+	copyit32		; handle 32-bit processor
	blr				; either zero length or to==from

;
; void* memcpy(to, from, nbytes)
; void* memmove(to, from, nbytes)
;
; memcpy() and memmove() are only called in 32-bit mode, albeit on both 32- and 64-bit processors.
; However, they would work correctly if called in 64-bit mode.

	.align	5
	.globl	EXT(memcpy)
	.globl	EXT(memmove)
	.globl	EXT(memcpy_nop_if_32bit)

LEXT(memcpy)
LEXT(memmove)
	cmplwi	cr1,r5,kShort		; less than 32 bytes?
	sub.	r12,r3,r4		; test for to==from in mode-independent way, start fwd/rev check
	mr	r6,r4			; Set source
	mr	r4,r3			; Set the "to" (must preserve r3 for return value)
	blt	cr1,shortcopy		; special case short operands
	crclr	noncache		; Set cached
LEXT(memcpy_nop_if_32bit)
	bne++	copyit64		; handle 64-bit processor (patched to NOP if 32-bit processor)
	beqlr-				; exit if to==from


; Here to copy on 32-bit processors.
;
; When we move the memory, forward overlaps must be handled.  We
; also cannot use the cache instructions if we came from bcopy_nc.
; We need to preserve r3 because it needs to be returned for memcpy.
; We can be interrupted and lose control here.
;
; When entered:
;   r4 = destination
;   r5 = length (>0)
;   r6 = source
;   r12 = (dest - source)
;   cr5 = noncache flag

copyit32:				; WARNING! can drop down to this label
	cmplw	cr1,r12,r5		; must move reverse if (dest-source)<length
	cntlzw	r11,r5			; get magnitude of length
	dcbt	0,r6			; start to touch in source
	lis	r10,hi16(0x80000000)	; get 0x80000000
	neg	r9,r4			; start to get alignment for destination
	dcbtst	0,r4			; start to touch in destination
	sraw	r8,r10,r11		; get mask based on operand length, to limit alignment
	blt-	cr1,reverse32bit	; reverse move required

; Forward moves on 32-bit machines, also word aligned uncached ops on 64-bit machines.
; NOTE: we never do an unaligned access if the source and destination are "relatively"
; word aligned.  We depend on this in the uncached case on 64-bit processors.
;   r4 = destination
;   r5 = length (>0)
;   r6 = source
;   r8 = inverse of largest mask smaller than operand length (worked example below)
;   r9 = neg(dest), used to compute alignment
;   cr5 = noncache flag
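;
; Worked example of the r8 mask (illustrative length): for a 5-byte move, cntlzw returns
; 29 and sraw of 0x80000000 by 29 gives 0xFFFFFFFC, so the andc below keeps at most the
; low two bits of the alignment count; the front-end alignment moves are thereby capped
; at 3 bytes and can never exceed the total length.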

forward32bit:				; enter from 64-bit CPUs with word aligned uncached operands
	rlwinm	r7,r9,0,0x1F		; get bytes to 32-byte-align destination
	andc.	r0,r7,r8		; limit to the maximum front end move
	mtcrf	0x01,r0			; move length to cr6 and cr7 one cr at a time...
	beq	alline			; Already on a line...

	mtcrf	0x02,r0			; ...since moving more than one is slower on G4 and G5
	sub	r5,r5,r0		; Set the length left to move

	bf	31,alhalf		; No single byte to do...
	lbz	r7,0(r6)		; Get the byte
	addi	r6,r6,1			; Point to the next
	stb	r7,0(r4)		; Save the single
	addi	r4,r4,1			; Bump sink

; Sink is halfword aligned here

alhalf:	bf	30,alword		; No halfword to do...
	lhz	r7,0(r6)		; Get the halfword
	addi	r6,r6,2			; Point to the next
	sth	r7,0(r4)		; Save the halfword
	addi	r4,r4,2			; Bump sink

; Sink is word aligned here

alword:	bf	29,aldouble		; No word to do...
	lwz	r7,0(r6)		; Get the word
	addi	r6,r6,4			; Point to the next
	stw	r7,0(r4)		; Save the word
	addi	r4,r4,4			; Bump sink

; Sink is double aligned here

aldouble:	bf	28,alquad	; No double to do...
	lwz	r7,0(r6)		; Get the first word
	lwz	r8,4(r6)		; Get the second word
	addi	r6,r6,8			; Point to the next
	stw	r7,0(r4)		; Save the first word
	stw	r8,4(r4)		; Save the second word
	addi	r4,r4,8			; Bump sink

; Sink is quadword aligned here

alquad:	bf	27,alline		; No quad to do...
	lwz	r7,0(r6)		; Get the first word
	lwz	r8,4(r6)		; Get the second word
	lwz	r9,8(r6)		; Get the third word
	stw	r7,0(r4)		; Save the first word
	lwz	r11,12(r6)		; Get the fourth word
	addi	r6,r6,16		; Point to the next
	stw	r8,4(r4)		; Save the second word
	stw	r9,8(r4)		; Save the third word
	stw	r11,12(r4)		; Save the fourth word
	addi	r4,r4,16		; Bump sink

; Sink is line aligned here

alline:	rlwinm.	r0,r5,27,5,31		; Get the number of full lines to move
	mtcrf	0x02,r5			; move length to cr6 and cr7 one cr at a time...
	mtcrf	0x01,r5			; ...since moving more than one is slower on G4 and G5
	beq-	backend			; No full lines to move

	mtctr	r0			; set up loop count
	li	r0,96			; Stride for touch ahead
	b	nxtline

	.align	4
nxtline:
	lwz	r2,0(r6)		; Get the first word
	lwz	r5,4(r6)		; Get the second word
	lwz	r7,8(r6)		; Get the third word
	lwz	r8,12(r6)		; Get the fourth word
	lwz	r9,16(r6)		; Get the fifth word
	lwz	r10,20(r6)		; Get the sixth word
	lwz	r11,24(r6)		; Get the seventh word
	lwz	r12,28(r6)		; Get the eighth word
	bt-	noncache,skipz		; Skip if we are not cached...
	dcbz	0,r4			; Blow away the whole line because we are replacing it
	dcbt	r6,r0			; Touch ahead a bit
skipz:
	addi	r6,r6,32		; Point to the next
	stw	r2,0(r4)		; Save the first word
	stw	r5,4(r4)		; Save the second word
	stw	r7,8(r4)		; Save the third word
	stw	r8,12(r4)		; Save the fourth word
	stw	r9,16(r4)		; Save the fifth word
	stw	r10,20(r4)		; Save the sixth word
	stw	r11,24(r4)		; Save the seventh word
	stw	r12,28(r4)		; Save the eighth word
	addi	r4,r4,32		; Bump sink
	bdnz+	nxtline			; Do the next line, if any...


; Move backend quadword

backend:				; Join here from "shortcopy" for forward moves <32 bytes
	bf	27,noquad		; No quad to do...
	lwz	r7,0(r6)		; Get the first word
	lwz	r8,4(r6)		; Get the second word
	lwz	r9,8(r6)		; Get the third word
	lwz	r11,12(r6)		; Get the fourth word
	stw	r7,0(r4)		; Save the first word
	addi	r6,r6,16		; Point to the next
	stw	r8,4(r4)		; Save the second word
	stw	r9,8(r4)		; Save the third word
	stw	r11,12(r4)		; Save the fourth word
	addi	r4,r4,16		; Bump sink

; Move backend double

noquad:	bf	28,nodouble		; No double to do...
	lwz	r7,0(r6)		; Get the first word
	lwz	r8,4(r6)		; Get the second word
	addi	r6,r6,8			; Point to the next
	stw	r7,0(r4)		; Save the first word
	stw	r8,4(r4)		; Save the second word
	addi	r4,r4,8			; Bump sink

; Move backend word

nodouble:	bf	29,noword	; No word to do...
	lwz	r7,0(r6)		; Get the word
	addi	r6,r6,4			; Point to the next
	stw	r7,0(r4)		; Save the word
	addi	r4,r4,4			; Bump sink

; Move backend halfword

noword:	bf	30,nohalf		; No halfword to do...
	lhz	r7,0(r6)		; Get the halfword
	addi	r6,r6,2			; Point to the next
	sth	r7,0(r4)		; Save the halfword
	addi	r4,r4,2			; Bump sink

; Move backend byte

nohalf:	bflr	31			; Leave cuz we are all done...
	lbz	r7,0(r6)		; Get the byte
	stb	r7,0(r4)		; Save the single
	blr

; Reverse moves on 32-bit machines, also reverse word aligned uncached moves on 64-bit machines.
; NOTE: we never do an unaligned access if the source and destination are "relatively"
; word aligned.  We depend on this in the uncached case on 64-bit processors.
; These are slower because we don't bother with dcbz.  Fortunately, reverse moves are uncommon.
;   r4 = destination
;   r5 = length (>0)
;   r6 = source
;   r8 = inverse of largest mask smaller than operand length
;   cr5 = noncache flag (but we don't dcbz anyway)

reverse32bit:				; here from 64-bit code with word aligned uncached operands
	add	r4,r5,r4		; Point past the last sink byte
	add	r6,r5,r6		; Point past the last source byte
	rlwinm	r7,r4,0,0x1F		; Calculate the length to align dest on cache boundary
	li	r12,-1			; Make sure we touch in the actual line
	andc.	r0,r7,r8		; Apply movement limit
	dcbt	r12,r6			; Touch in the last line of source
	mtcrf	0x01,r0			; move length to cr6 and cr7 one cr at a time...
	dcbtst	r12,r4			; Touch in the last line of the sink
	mtcrf	0x02,r0			; ...since moving more than one is slower on G4 and G5
	beq-	balline			; Already on cache line boundary (or too short to bother)

	sub	r5,r5,r0		; Precalculate move length left after alignment

	bf	31,balhalf		; No single byte to do...
	lbz	r7,-1(r6)		; Get the byte
	subi	r6,r6,1			; Point to the next
	stb	r7,-1(r4)		; Save the single
	subi	r4,r4,1			; Bump sink

; Sink is halfword aligned here

balhalf:	bf	30,balword	; No halfword to do...
	lhz	r7,-2(r6)		; Get the halfword
	subi	r6,r6,2			; Point to the next
	sth	r7,-2(r4)		; Save the halfword
	subi	r4,r4,2			; Bump sink

; Sink is word aligned here

balword:	bf	29,baldouble	; No word to do...
	lwz	r7,-4(r6)		; Get the word
	subi	r6,r6,4			; Point to the next
	stw	r7,-4(r4)		; Save the word
	subi	r4,r4,4			; Bump sink

; Sink is double aligned here

baldouble:	bf	28,balquad	; No double to do...
	lwz	r7,-8(r6)		; Get the first word
	lwz	r8,-4(r6)		; Get the second word
	subi	r6,r6,8			; Point to the next
	stw	r7,-8(r4)		; Save the first word
	stw	r8,-4(r4)		; Save the second word
	subi	r4,r4,8			; Bump sink

; Sink is quadword aligned here

balquad:	bf	27,balline	; No quad to do...
	lwz	r7,-16(r6)		; Get the first word
	lwz	r8,-12(r6)		; Get the second word
	lwz	r9,-8(r6)		; Get the third word
	lwz	r11,-4(r6)		; Get the fourth word
	stw	r7,-16(r4)		; Save the first word
	subi	r6,r6,16		; Point to the next
	stw	r8,-12(r4)		; Save the second word
	stw	r9,-8(r4)		; Save the third word
	stw	r11,-4(r4)		; Save the fourth word
	subi	r4,r4,16		; Bump sink

; Sink is line aligned here

balline:	rlwinm.	r0,r5,27,5,31	; Get the number of full lines to move
	mtcrf	0x02,r5			; move length to cr6 and cr7 one cr at a time...
	mtcrf	0x01,r5			; ...since moving more than one is slower on G4 and G5
	beq-	bbackend		; No full lines to move
	mtctr	r0			; set up loop count
	b	bnxtline

	.align	4
bnxtline:
	lwz	r7,-32(r6)		; Get the first word
	lwz	r5,-28(r6)		; Get the second word
	lwz	r2,-24(r6)		; Get the third word
	lwz	r12,-20(r6)		; Get the fourth word
	lwz	r11,-16(r6)		; Get the fifth word
	lwz	r10,-12(r6)		; Get the sixth word
	lwz	r9,-8(r6)		; Get the seventh word
	lwz	r8,-4(r6)		; Get the eighth word
	subi	r6,r6,32		; Point to the next

	stw	r7,-32(r4)		; Save the first word
	stw	r5,-28(r4)		; Save the second word
	stw	r2,-24(r4)		; Save the third word
	stw	r12,-20(r4)		; Save the fourth word
	stw	r11,-16(r4)		; Save the fifth word
	stw	r10,-12(r4)		; Save the sixth word
	stw	r9,-8(r4)		; Save the seventh word
	stw	r8,-4(r4)		; Save the eighth word
	subi	r4,r4,32		; Bump sink

	bdnz+	bnxtline		; Do the next line, if any...

;
; Note: We touched these lines in at the beginning
;

; Move backend quadword

bbackend:				; Join here from "shortcopy" for reverse moves of <32 bytes
	bf	27,bnoquad		; No quad to do...
	lwz	r7,-16(r6)		; Get the first word
	lwz	r8,-12(r6)		; Get the second word
	lwz	r9,-8(r6)		; Get the third word
	lwz	r11,-4(r6)		; Get the fourth word
	stw	r7,-16(r4)		; Save the first word
	subi	r6,r6,16		; Point to the next
	stw	r8,-12(r4)		; Save the second word
	stw	r9,-8(r4)		; Save the third word
	stw	r11,-4(r4)		; Save the fourth word
	subi	r4,r4,16		; Bump sink

; Move backend double

bnoquad:	bf	28,bnodouble	; No double to do...
	lwz	r7,-8(r6)		; Get the first word
	lwz	r8,-4(r6)		; Get the second word
	subi	r6,r6,8			; Point to the next
	stw	r7,-8(r4)		; Save the first word
	stw	r8,-4(r4)		; Save the second word
	subi	r4,r4,8			; Bump sink

; Move backend word

bnodouble:	bf	29,bnoword	; No word to do...
	lwz	r7,-4(r6)		; Get the word
	subi	r6,r6,4			; Point to the next
	stw	r7,-4(r4)		; Save the word
	subi	r4,r4,4			; Bump sink

; Move backend halfword

bnoword:	bf	30,bnohalf	; No halfword to do...
	lhz	r7,-2(r6)		; Get the halfword
	subi	r6,r6,2			; Point to the next
	sth	r7,-2(r4)		; Save the halfword
	subi	r4,r4,2			; Bump sink

; Move backend byte

bnohalf:	bflr	31		; Leave cuz we are all done...
	lbz	r7,-1(r6)		; Get the byte
	stb	r7,-1(r4)		; Save the single
	blr


// Here on 64-bit processors, which have a 128-byte cache line.  This can be
// called either in 32 or 64-bit mode, which makes the test for reverse moves
// a little tricky.  We've already filtered out the (source==dest) and (len==0)
// special cases.
//
// When entered:
//   r4 = destination (32 or 64-bit ptr)
//   r5 = length (always 32 bits)
//   r6 = source (32 or 64-bit ptr)
//   r12 = (dest - source), reverse move required if (dest-source)<length
//   cr5 = noncache flag

	.align	5
copyit64:
	rlwinm	r7,r5,0,0,31		// truncate length to 32 bits, in case we're running in 64-bit mode
	cntlzw	r11,r5			// get magnitude of length
	dcbt	0,r6			// touch in 1st block of source
	dcbtst	0,r4			// touch in 1st destination cache block
	subc	r7,r12,r7		// set Carry if (dest-source)>=length, in mode-independent way
	li	r0,0			// get a 0
	lis	r10,hi16(0x80000000)	// get 0x80000000
	addze.	r0,r0			// set cr0 on carry bit (beq if reverse move required)
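// (Carry-trick sketch with illustrative values: for (dest-source)=8 and length=20, the
// subc above computes 8-20, which borrows, so CA=0; addze. then leaves r0=0 and cr0 "eq",
// selecting the reverse path.  Non-overlapping operands leave CA=1 and r0 nonzero.)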
	neg	r9,r4			// start to get alignment for destination
	sraw	r8,r10,r11		// get mask based on operand length, to limit alignment
	bt--	noncache,c64uncached	// skip if uncached
	beq--	c64rdouble		// handle cached reverse moves


// Forward, cached or doubleword aligned uncached.  This is the common case.
//   NOTE: we never do an unaligned access if the source and destination are "relatively"
//   doubleword aligned.  We depend on this in the uncached case.
//   r4 = destination
//   r5 = length (>0)
//   r6 = source
//   r8 = inverse of largest mask smaller than operand length
//   r9 = neg(dest), used to compute alignment
//   cr5 = noncache flag

c64double:
	rlwinm	r7,r9,0,0x7F		// get #bytes to 128-byte align destination
	andc	r7,r7,r8		// limit by operand length
	andi.	r8,r7,7			// r8 <- #bytes to doubleword align
	srwi	r9,r7,3			// r9 <- #doublewords to 128-byte align
	sub	r5,r5,r7		// adjust length remaining
	cmpwi	cr1,r9,0		// any doublewords to move to cache align?
	srwi	r10,r5,7		// r10 <- 128-byte chunks to xfer after aligning dest
	cmpwi	cr7,r10,0		// set cr7 on chunk count
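// (Illustrative staging example, values not from the code: with a dest whose low byte is
// 0x44 and a length of 1000, 0x3C = 60 bytes are needed to reach a 128-byte boundary; of
// those, 60&7 = 4 move as bytes and 60>>3 = 7 as doublewords, leaving 940 bytes, i.e.
// 940>>7 = 7 full 128-byte chunks for the unrolled loop plus 44 bytes of cleanup.)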
	beq	c64double2		// dest already doubleword aligned
	mtctr	r8
	b	c64double1

	.align	5			// align inner loops
c64double1:				// copy bytes until dest is doubleword aligned
	lbz	r0,0(r6)
	addi	r6,r6,1
	stb	r0,0(r4)
	addi	r4,r4,1
	bdnz	c64double1

c64double2:				// r9/cr1=doublewords, r10/cr7=128-byte chunks
	beq	cr1,c64double4		// no doublewords to xfer in order to cache align
	mtctr	r9
	b	c64double3

	.align	5			// align inner loops
c64double3:				// copy doublewords until dest is 128-byte aligned
	ld	r7,0(r6)
	addi	r6,r6,8
	std	r7,0(r4)
	addi	r4,r4,8
	bdnz	c64double3

// Here to xfer 128-byte chunks, if any.  Since we only have 8 GPRs for
// data (64 bytes), we load/store each twice per 128-byte chunk.

c64double4:				// r10/cr7=128-byte chunks
	rlwinm	r0,r5,29,28,31		// r0 <- count of leftover doublewords, after moving chunks
	cmpwi	cr1,r0,0		// set cr1 on leftover doublewords
	beq	cr7,c64double7		// no 128-byte chunks

	; We must check for (source-dest)<128 in a mode-independent way.  If within 128 bytes,
	; turn on "noncache" because we cannot use dcbz128 even if operands are cacheable.
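	; (For example, with an assumed source = dest+96, r8 below becomes 96; clearing its
	; low seven bits with rldicr. yields 0, cr0_eq is set, and the dcbz128 is skipped for
	; every chunk, exactly as if the operands were uncached.)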

	sub	r8,r6,r4		// r8 <- (source - dest)
	rldicr.	r0,r8,0,63-7		// zero low 7 bits and check for 0, mode independent
	cror	noncache,cr0_eq,noncache	// turn on "noncache" flag if (source-dest)<128
	mtctr	r10
	b	c64InnerLoop

	.align	5			// align inner loop
c64InnerLoop:				// loop copying 128-byte cache lines to 128-aligned destination
	ld	r0,0(r6)		// start pipe: load 1st half-line
	ld	r2,8(r6)
	ld	r7,16(r6)
	ld	r8,24(r6)
	ld	r9,32(r6)
	ld	r10,40(r6)
	ld	r11,48(r6)
	ld	r12,56(r6)
	bt	noncache,c64InnerLoop1	// skip if uncached or overlap
	dcbz128	0,r4			// avoid prefetch of next cache line
c64InnerLoop1:

	std	r0,0(r4)
	std	r2,8(r4)
	std	r7,16(r4)
	std	r8,24(r4)
	std	r9,32(r4)
	std	r10,40(r4)
	std	r11,48(r4)
	std	r12,56(r4)

	ld	r0,64(r6)		// load 2nd half of chunk
	ld	r2,72(r6)
	ld	r7,80(r6)
	ld	r8,88(r6)
	ld	r9,96(r6)
	ld	r10,104(r6)
	ld	r11,112(r6)
	ld	r12,120(r6)
	addi	r6,r6,128

	std	r0,64(r4)
	std	r2,72(r4)
	std	r7,80(r4)
	std	r8,88(r4)
	std	r9,96(r4)
	std	r10,104(r4)
	std	r11,112(r4)
	std	r12,120(r4)
	addi	r4,r4,128		// advance to next dest chunk

	bdnz	c64InnerLoop		// loop if more chunks


c64double7:				// r5 <- leftover bytes, cr1 set on doubleword count
	rlwinm	r0,r5,29,28,31		// r0 <- count of leftover doublewords (0-15)
	andi.	r5,r5,7			// r5/cr0 <- count of leftover bytes (0-7)
	beq	cr1,c64byte		// no leftover doublewords
	mtctr	r0
	b	c64double8

	.align	5			// align inner loop
c64double8:				// loop copying leftover doublewords
	ld	r0,0(r6)
	addi	r6,r6,8
	std	r0,0(r4)
	addi	r4,r4,8
	bdnz	c64double8


// Forward byte loop.

c64byte:				// r5/cr0 <- byte count (can be big if unaligned uncached)
	beqlr				// done if no leftover bytes
	mtctr	r5
	b	c64byte1

	.align	5			// align inner loop
c64byte1:
	lbz	r0,0(r6)
	addi	r6,r6,1
	stb	r0,0(r4)
	addi	r4,r4,1
	bdnz	c64byte1

	blr


// Uncached copies.  We must avoid unaligned accesses, since they always take alignment
// exceptions on uncached memory on 64-bit processors.  This may mean we copy long operands
// a byte at a time, but that is still much faster than alignment exceptions.
//   r4 = destination
//   r5 = length (>0)
//   r6 = source
//   r8 = inverse of largest mask smaller than operand length
//   r9 = neg(dest), used to compute alignment
//   r12 = (dest-source), used to test relative alignment
//   cr0 = beq if reverse move required
//   cr5 = noncache flag
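//
// For example (illustrative): if (dest-source) = 0x1238, its low three bits are zero, so
// the operands are "relatively" doubleword aligned and the ld/std path is safe even though
// neither pointer is necessarily 8-byte aligned itself; when not even the low two bits
// match, the byte-at-a-time loop is the fallback.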

c64uncached:
	rlwinm	r10,r12,0,29,31		// relatively doubleword aligned?
	rlwinm	r11,r12,0,30,31		// relatively word aligned?
	cmpwi	cr7,r10,0		// set cr7 beq if doubleword aligned
	cmpwi	cr1,r11,0		// set cr1 beq if word aligned
	beq--	c64reverseUncached

	beq	cr7,c64double		// doubleword aligned
	beq	cr1,forward32bit	// word aligned, use G3/G4 code
	cmpwi	r5,0			// set cr0 on byte count
	b	c64byte			// unaligned operands

c64reverseUncached:
	beq	cr7,c64rdouble		// doubleword aligned so can use LD/STD
	beq	cr1,reverse32bit	// word aligned, use G3/G4 code
	add	r6,r6,r5		// point to (end+1) of source and dest
	add	r4,r4,r5
	cmpwi	r5,0			// set cr0 on length
	b	c64rbyte		// copy a byte at a time



// Reverse doubleword copies.  This is used for all cached copies, and doubleword
// aligned uncached copies.
//   r4 = destination
//   r5 = length (>0)
//   r6 = source
//   r8 = inverse of largest mask of low-order 1s smaller than operand length
//   cr5 = noncache flag

c64rdouble:
	add	r6,r6,r5		// point to (end+1) of source and dest
	add	r4,r4,r5
	rlwinm	r7,r4,0,29,31		// r7 <- #bytes to doubleword align dest
	andc.	r7,r7,r8		// limit by operand length
	sub	r5,r5,r7		// adjust length
	srwi	r8,r5,6			// r8 <- 64-byte chunks to xfer
	cmpwi	cr1,r8,0		// any chunks?
	beq	c64rd2			// dest already doubleword aligned
	mtctr	r7

c64rd1:					// copy bytes until dest is doubleword aligned
	lbzu	r0,-1(r6)
	stbu	r0,-1(r4)
	bdnz	c64rd1

c64rd2:					// r8/cr1 <- count of 64-byte chunks
	rlwinm	r0,r5,29,29,31		// r0 <- count of leftover doublewords
	andi.	r5,r5,7			// r5/cr0 <- count of leftover bytes
	cmpwi	cr7,r0,0		// leftover doublewords?
	beq	cr1,c64rd4		// no chunks to xfer
	mtctr	r8
	b	c64rd3

	.align	5			// align inner loop
c64rd3:					// loop copying 64-byte chunks
	ld	r7,-8(r6)
	ld	r8,-16(r6)
	ld	r9,-24(r6)
	ld	r10,-32(r6)
	ld	r11,-40(r6)
	ld	r12,-48(r6)
	std	r7,-8(r4)
	std	r8,-16(r4)
	ld	r7,-56(r6)
	ldu	r8,-64(r6)
	std	r9,-24(r4)
	std	r10,-32(r4)
	std	r11,-40(r4)
	std	r12,-48(r4)
	std	r7,-56(r4)
	stdu	r8,-64(r4)
	bdnz	c64rd3

c64rd4:					// r0/cr7 = leftover doublewords, r5/cr0 = leftover bytes
	beq	cr7,c64rbyte		// no leftover doublewords
	mtctr	r0

c64rd5:					// loop copying leftover doublewords
	ldu	r0,-8(r6)
	stdu	r0,-8(r4)
	bdnz	c64rd5


// Reverse byte loop.

c64rbyte:				// r5/cr0 <- byte count (can be big if unaligned uncached)
	beqlr				// done if no leftover bytes
	mtctr	r5

c64rbyte1:
	lbzu	r0,-1(r6)
	stbu	r0,-1(r4)
	bdnz	c64rbyte1

	blr