/*
 * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the
 * License may not be used to create, or enable the creation or
 * redistribution of, unlawful or unlicensed copies of an Apple operating
 * system, or to circumvent, violate, or enable the circumvention or
 * violation of, any terms of an Apple operating system software license
 * agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
 */
;
; Copy bytes of data around. Handles overlapped data.
;
;
#include <ppc/asm.h>
#include <ppc/proc_reg.h>
#include <assym.s>

; These routines use CR5 for certain flags:
; Use CR5_lt to indicate non-cached (in bcopy and memcpy)
#define noncache 20


; The bcopy_phys variants use a stack frame so they can call bcopy as a subroutine.
#define BCOPY_SF_SIZE   32              // total size
#define BCOPY_SF_MSR    16              // we save caller's MSR here (possibly minus VEC and FP)


#define kShort          32              // short operands are special cased


; void bcopy_physvir_32(from, to, nbytes)
;
; Attempt to copy physically addressed memory with translation on if conditions are met.
; Otherwise do a normal bcopy_phys. This routine is used because some 32-bit processors
; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
; for the passed phys addrs and do the copy with translation on.
;
; Rules are: - neither source nor destination can cross a page.
;            - interrupts must be disabled when this routine is called.
;            - translation must be on when called.
;
; To do the copy, we build a 128KB DBAT for both the source and sink. If both are in the
; same block, only one DBAT is loaded. We do not touch the IBATs, so there is no issue if
; either physical page address is the same as the virtual address of the instructions
; we are executing.
;
; At the end, we invalidate the used DBATs.
;
; Note that the address parameters are long longs. We will transform these to 64-bit
; values. Note that on 32-bit architectures this will ignore the high half of the
; passed-in value. This should be OK, since we cannot have addresses bigger than
; 32 bits there anyhow.
;
; Note also that this routine is used only on 32-bit machines. If you're contemplating use
; on a 64-bit processor, use the physical memory window instead; please refer to copypv()
; for an example of how this is done.
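;
; A rough C sketch of the page-crossing precondition above (assuming 4KB pages; the
; helper name is illustrative, not part of xnu):
;
;       static int same_page(uint32_t addr, uint32_t n) {
;           return ((addr ^ (addr + n - 1)) & 0xFFFFF000) == 0;
;       }
;       // bcopy_physvir_32 takes the BAT fast path only when
;       // same_page(source, nbytes) && same_page(dest, nbytes).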

        .align  5
        .globl  EXT(bcopy_physvir_32)

LEXT(bcopy_physvir_32)
        mflr    r0                              ; get return address
        rlwinm  r3,r3,0,1,0                     ; Duplicate high half of long long paddr into top of reg
        mfsprg  r8,2                            ; get processor feature flags
        stw     r0,8(r1)                        ; save return address
        rlwimi  r3,r4,0,0,31                    ; Combine bottom of long long to full 64-bits
        stwu    r1,-BCOPY_SF_SIZE(r1)           ; push on a stack frame so we can call bcopy
        mtcrf   0x02,r8                         ; move pf64Bit to cr6 so we can test
        subi    r0,r7,1                         ; get length - 1
        rlwinm  r4,r5,0,1,0                     ; Duplicate high half of long long paddr into top of reg
        add     r11,r3,r0                       ; Point to last byte of sink
        mr      r5,r7                           ; Get the length into the right register
        rlwimi  r4,r6,0,0,31                    ; Combine bottom of long long to full 64-bits

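; The rlwinm/rlwimi pairs above assemble each 64-bit physical address from the
; (high, low) register pair of a long long argument. The same transform in C
; (sketch only; hi/lo stand for the two register halves):
;
;       uint64_t paddr = ((uint64_t)hi << 32) | lo;
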
; This test for page overflow may not work if the length is negative. Negative lengths are invalid input
; to bcopy_physvir() on 32-bit machines, and will result in a panic.

        add     r12,r4,r0                       ; Point to last byte of source
        xor     r7,r11,r3                       ; See if we went to next page
        xor     r8,r12,r4                       ; See if we went to next page
        or      r0,r7,r8                        ; Combine wrap

//      li      r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2)     ; Set default attributes
        li      r9,((2<<3)|2)                   ; Set default attributes
        rlwinm. r0,r0,0,0,19                    ; Did we overflow a page?
        li      r7,2                            ; Set validity flags
        li      r8,2                            ; Set validity flags
        bne-    bcopy_phys1                     ; Overflowed page, do normal physical copy...
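
; The xor/or sequence above is a branch-free page-crossing check; the rlwinm. mask
; (bits 0-19) keeps everything above the 4KB page offset. Roughly, in C:
;
;       uint32_t wrap = (dst ^ (dst + n - 1)) | (src ^ (src + n - 1));
;       if (wrap & 0xFFFFF000)          // an operand crossed a page boundary
;           goto bcopy_phys1;           // fall back to the real-mode copy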

        rlwimi  r11,r9,0,15,31                  ; Set sink lower DBAT value
        rlwimi  r12,r9,0,15,31                  ; Set source lower DBAT value
        rlwimi  r7,r11,0,0,14                   ; Set sink upper DBAT value
        rlwimi  r8,r12,0,0,14                   ; Set source upper DBAT value
        cmplw   cr1,r11,r12                     ; See if sink and source are same block
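
; For reference, a hedged C sketch of the BAT images just assembled (classic 32-bit
; PowerPC BAT format, 128KB block so BL=0; the constants mirror the li instructions above):
;
;       uint32_t batl = (phys & 0xFFFE0000) | (2 << 3) | 2;     // WIMG, PP       (r11/r12)
;       uint32_t batu = (phys & 0xFFFE0000) | 2;                // BL=0, Vs=1     (r7/r8)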

        sync

        mtdbatl 0,r11                           ; Set sink lower DBAT
        mtdbatu 0,r7                            ; Set sink upper DBAT

        beq-    cr1,bcpvsame                    ; Source and sink are in same block

        mtdbatl 1,r12                           ; Set source lower DBAT
        mtdbatu 1,r8                            ; Set source upper DBAT

bcpvsame:
        sync                                    ; wait for the BATs to stabilize
        isync

        bl      EXT(bcopy)                      ; BATs set up, args in r3-r5, so do the copy with DR on

        li      r0,0                            ; Get set to invalidate upper half of BATs
        sync                                    ; Make sure all is well
        mtdbatu 0,r0                            ; Clear sink upper DBAT
        mtdbatu 1,r0                            ; Clear source upper DBAT
        sync
        isync

        lwz     r0,BCOPY_SF_SIZE+8(r1)          ; get return address
        addi    r1,r1,BCOPY_SF_SIZE             ; pop off stack frame
        mtlr    r0
        blr


; void bcopy_phys(from, to, nbytes)
;
; Turns off data translation before the copy. This one will not work in user state.
; This routine is used on both 32- and 64-bit machines.
;
; Note that the address parameters are long longs. We will transform these to 64-bit
; values. Note that on 32-bit architectures this will ignore the high half of the
; passed-in value. This should be OK, since we cannot have addresses bigger than
; 32 bits there anyhow.
;
; Also note that you probably will not be happy if either the sink or source spans across the
; boundary between RAM and I/O space. There is a good chance of hanging the machine, and this
; code does not check for it, so be careful.
;
; NOTE: when called, translation must be on, and we must be in 32-bit mode.
;       Interrupts may or may not be disabled.

        .align  5
        .globl  EXT(bcopy_phys)

LEXT(bcopy_phys)
        mflr    r0                              ; get return address
        rlwinm  r3,r3,0,1,0                     ; Duplicate high half of long long paddr into top of reg
        stw     r0,8(r1)                        ; save return address
        mfsprg  r8,2                            ; get processor feature flags
        stwu    r1,-BCOPY_SF_SIZE(r1)           ; push on a stack frame so we can call bcopy
        rlwimi  r3,r4,0,0,31                    ; Combine bottom of long long to full 64-bits
        rlwinm  r4,r5,0,1,0                     ; Duplicate high half of long long paddr into top of reg
        mtcrf   0x02,r8                         ; move pf64Bit to cr6 so we can test
        rlwimi  r4,r6,0,0,31                    ; Combine bottom of long long to full 64-bits
        mr      r5,r7                           ; Get the length into the right register

bcopy_phys1:                                    ; enter from bcopy_physvir with pf64Bit in cr6 and parms in r3-r5
        mfmsr   r9                              ; Get the MSR
        lis     r6,hi16(MASK(MSR_VEC))          ; Get vector enable
        ori     r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR)) ; Add in FP and DR
        andc    r9,r9,r6                        ; unconditionally turn DR, VEC, and FP off
        bt++    pf64Bitb,bcopy_phys64           ; skip if 64-bit (only they take hint)
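
; The MSR edit above, as C (sketch; MASK() is the xnu macro for an MSR bit mask):
;
;       msr &= ~(MASK(MSR_VEC) | MASK(MSR_FP) | MASK(MSR_DR));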

; 32-bit CPUs

        mtmsr   r9                              ; turn DR, FP, and VEC off
        isync                                   ; Wait for it

        bl      EXT(bcopy)                      ; do the copy with translation off and caching on

        mfmsr   r9                              ; Get the MSR
        ori     r9,r9,lo16(MASK(MSR_DR))        ; turn translation back on (but leave VEC and FP off)
        mtmsr   r9                              ; restore MSR
        isync                                   ; wait for it to happen
        lwz     r0,BCOPY_SF_SIZE+8(r1)          ; get return address once translation is back on
        mtlr    r0
        addi    r1,r1,BCOPY_SF_SIZE             ; pop off stack frame
        blr


; 64-bit: turn DR off and SF on.

bcopy_phys64:                                   ; r9 = MSR with DR, VEC, and FP off
        ori     r8,r9,lo16(MASK(MSR_DR))        ; make a copy with DR back on... this is what we return to caller
        srdi    r2,r3,31                        ; Get a 1 if source is in I/O memory
        li      r0,1                            ; Note - we use this in a couple places below
        srdi    r10,r4,31                       ; Get a 1 if sink is in I/O memory
        std     r8,BCOPY_SF_MSR(r1)             ; save caller's MSR so we remember whether EE was on
        rldimi  r9,r0,63,MSR_SF_BIT             ; set SF on in MSR we will copy with
        cmpldi  cr0,r2,1                        ; Is source in I/O memory?
        cmpldi  cr7,r10,1                       ; Is sink in I/O memory?
        mtmsrd  r9                              ; turn 64-bit addressing on, data translation off
        isync                                   ; wait for it to happen
        cror    cr7_eq,cr0_eq,cr7_eq            ; See if either source or sink is in I/O area
        beq--   cr7,io_space_real_mode_copy     ; an operand is in I/O space

        bl      EXT(bcopy)                      ; do copy with DR off and SF on, cache enabled

bcopy_phys64x:
        mfmsr   r9                              ; Get the MSR we used to copy
        rldicl  r9,r9,0,MSR_SF_BIT+1            ; clear SF
        ori     r9,r9,lo16(MASK(MSR_DR))        ; turn translation back on
        mtmsrd  r9                              ; turn 64-bit mode off, translation back on
        isync                                   ; wait for it to happen
        lwz     r0,BCOPY_SF_SIZE+8(r1)          ; get return address once translation is back on
        ld      r8,BCOPY_SF_MSR(r1)             ; get caller's MSR once translation is back on
        mtlr    r0
        mtmsrd  r8,1                            ; turn EE back on if necessary
        addi    r1,r1,BCOPY_SF_SIZE             ; pop off stack frame
        blr

; We need to copy with DR off, but one of the operands is in I/O space. To avoid wedging U3,
; which cannot handle a cache burst in I/O space, we must turn caching off for the real memory access.
; This can only be done by setting bits in HID4. We cannot lose control and execute random code in
; this state, so we have to disable interrupts as well. This is an unpleasant hack.
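;
; In outline (a sketch of the code below, not a supported interface; the HID4 bit is
; the one the sldi computes, 1 << 40 in the register):
;
;       msr &= ~MASK(MSR_EE);           // no interrupts while caching is off
;       hid4 |= 1ull << 40;             // real-mode accesses become cache-inhibited
;       slbie(junk_esid);               // flush the ERAT
;       bcopy_nc(from, to, nbytes);     // copy with no cache instructions
;       hid4 &= ~(1ull << 40);          // restore cacheability, flush ERAT again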

io_space_real_mode_copy:                        ; r0=1, r9=MSR we want to copy with
        sldi    r11,r0,31-MSR_EE_BIT            ; Get a mask for the EE bit
        sldi    r0,r0,32+8                      ; Get the right bit to turn off caching
        andc    r9,r9,r11                       ; Turn off EE bit
        mfspr   r2,hid4                         ; Get HID4
        mtmsrd  r9,1                            ; Force off EE
        or      r2,r2,r0                        ; Set bit to make real accesses cache-inhibited
        sync                                    ; Sync up
        mtspr   hid4,r2                         ; Make real accesses cache-inhibited
        isync                                   ; Toss prefetches

        lis     r12,0xE000                      ; Get the unlikeliest ESID possible
        srdi    r12,r12,1                       ; Make 0x7FFFFFFFF0000000
        slbie   r12                             ; Make sure the ERAT is cleared

        sync
        isync

        bl      EXT(bcopy_nc)                   ; copy with SF on and EE, DR, VEC, and FP off, cache inhibited

        li      r0,1                            ; Get a 1
        sldi    r0,r0,32+8                      ; Get the right bit to turn off caching
        mfspr   r2,hid4                         ; Get HID4
        andc    r2,r2,r0                        ; Clear bit that makes real accesses cache-inhibited
        sync                                    ; Sync up
        mtspr   hid4,r2                         ; Make real accesses not cache-inhibited
        isync                                   ; Toss prefetches

        lis     r12,0xE000                      ; Get the unlikeliest ESID possible
        srdi    r12,r12,1                       ; Make 0x7FFFFFFFF0000000
        slbie   r12                             ; Make sure the ERAT is cleared
        b       bcopy_phys64x


;
; shortcopy
;
; Special case short operands (<32 bytes), which are very common. Note that the check for
; reverse vs normal moves isn't quite correct in 64-bit mode; in rare cases we will move in
; reverse when it wasn't necessary to do so. This is OK, since performance of the two cases
; is similar. We do get the direction right when it counts (i.e., when the operands overlap).
; Also note that we use the G3/G4 "backend" code, even on G5. This is OK too, since G5 has
; plenty of load/store dispatch bandwidth in this case, the extra ops are hidden by latency,
; and using word instead of doubleword moves reduces the possibility of unaligned accesses,
; which cost about 20 cycles if they cross a 32-byte boundary on G5. Finally, because we
; might do unaligned accesses this code cannot be called from bcopy_nc().
;       r4 = destination
;       r5 = length (<32)
;       r6 = source
;       r12 = (dest - source)

        .align  5
shortcopy:
        cmplw   r12,r5                          ; must move reverse if (dest-source)<length
        mtcrf   2,r5                            ; move length to cr6 and cr7 one at a time...
        mtcrf   1,r5                            ; ...which is faster on G4 and G5
        bge++   backend                         ; handle forward moves (most common case)
        add     r6,r6,r5                        ; point one past end of operands in reverse moves
        add     r4,r4,r5
        b       bbackend                        ; handle reverse moves

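; The unsigned compare above doubles as the overlap test. In C (sketch):
;
;       if ((uint32_t)(dest - source) < length)     // dest lands inside the source
;           copy_backward();                        // descending order is safe
;       else
;           copy_forward();
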
;
; void bcopy(from, to, nbytes)
;
; NOTE: bcopy is called from copyin and copyout etc. with the "thread_recover" ptr set.
; This means bcopy must not set up a stack frame or touch non-volatile registers, and also means that it
; cannot rely on turning off interrupts, because we expect to get DSIs and have execution aborted by a "longjmp"
; to the thread_recover routine. What this means is that it would be hard to use vector or floating point
; registers to accelerate the copy.
;
; NOTE: this code can be called in any of three "modes":
;       - on 32-bit processors (32-byte cache line)
;       - on 64-bit processors running in 32-bit mode (128-byte cache line)
;       - on 64-bit processors running in 64-bit mode (128-byte cache line)

        .align  5
        .globl  EXT(bcopy)
        .globl  EXT(bcopy_nop_if_32bit)

LEXT(bcopy)
        cmplwi  cr1,r5,kShort                   ; less than 32 bytes?
        sub.    r12,r4,r3                       ; test for to==from in mode-independent way, start fwd/rev check
        mr      r6,r3                           ; Set source (must preserve r3 for memcpy return)
        blt     cr1,shortcopy                   ; special case short operands
        crclr   noncache                        ; Set cached
LEXT(bcopy_nop_if_32bit)
        bne++   copyit64                        ; handle 64-bit processor (patched to NOP if 32-bit processor)
        bne+    copyit32                        ; handle 32-bit processor
        blr                                     ; to==from so nothing to do

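; The bcopy_nop_if_32bit label marks the bne++ above so it can be patched to a NOP on
; 32-bit processors (presumably at startup), letting execution fall through. As C (sketch):
;
;       if (cpu_is_64bit)  copyit64(...);       // branch left intact
;       else               copyit32(...);       // branch patched out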

;
; bcopy_nc(from, to, nbytes)
;
; bcopy_nc() operates on non-cached memory, so we cannot use any kind of cache instructions.
; Furthermore, we must avoid all unaligned accesses on 64-bit machines, since they take
; alignment exceptions. Thus we cannot use "shortcopy", which could do unaligned lwz/stw.
; Like bcopy(), bcopy_nc() can be called both in 32- and 64-bit mode.

        .align  5
        .globl  EXT(bcopy_nc)
        .globl  EXT(bcopy_nc_nop_if_32bit)

LEXT(bcopy_nc)
        cmpwi   cr1,r5,0                        ; Check if we have a 0 length
        sub.    r12,r4,r3                       ; test for to==from in mode-independent way, start fwd/rev check
        mr      r6,r3                           ; Set source (must preserve r3 for memcpy return)
        crset   noncache                        ; Set non-cached
        cror    cr0_eq,cr1_eq,cr0_eq            ; set cr0 beq if either length zero or to==from
LEXT(bcopy_nc_nop_if_32bit)
        bne++   copyit64                        ; handle 64-bit processor (patched to NOP if 32-bit processor)
        bne+    copyit32                        ; handle 32-bit processor
        blr                                     ; either zero length or to==from

;
; void* memcpy(to, from, nbytes)
; void* memmove(to, from, nbytes)
;
; memcpy() and memmove() are only called in 32-bit mode, albeit on both 32- and 64-bit processors.
; However, they would work correctly if called in 64-bit mode.

        .align  5
        .globl  EXT(memcpy)
        .globl  EXT(memmove)
        .globl  EXT(memcpy_nop_if_32bit)

LEXT(memcpy)
LEXT(memmove)
        cmplwi  cr1,r5,kShort                   ; less than 32 bytes?
        sub.    r12,r3,r4                       ; test for to==from in mode-independent way, start fwd/rev check
        mr      r6,r4                           ; Set source
        mr      r4,r3                           ; Set the "to" (must preserve r3 for return value)
        blt     cr1,shortcopy                   ; special case short operands
        crclr   noncache                        ; Set cached
LEXT(memcpy_nop_if_32bit)
        bne++   copyit64                        ; handle 64-bit processor (patched to NOP if 32-bit processor)
        beqlr-                                  ; exit if to==from

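; Unlike bcopy, memcpy/memmove take (to, from) and must return the original "to";
; that is why r3 is left untouched through the copy. The C contract (sketch):
;
;       void *memcpy(void *to, const void *from, size_t nbytes);    // returns "to"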

; Here to copy on 32-bit processors.
;
; When we move the memory, forward overlaps must be handled. We also cannot use the
; cache instructions if we were called from bcopy_nc. We need to preserve r3 because
; it must be returned by memcpy. We can be interrupted and lose control here.
;
; When entered:
;       r4 = destination
;       r5 = length (>0)
;       r6 = source
;       r12 = (dest - source)
;       cr5 = noncache flag

copyit32:                                       ; WARNING! can drop down to this label
        cmplw   cr1,r12,r5                      ; must move reverse if (dest-source)<length
        cntlzw  r11,r5                          ; get magnitude of length
        dcbt    0,r6                            ; start to touch in source
        lis     r10,hi16(0x80000000)            ; get 0x80000000
        neg     r9,r4                           ; start to get alignment for destination
        dcbtst  0,r4                            ; start to touch in destination
        sraw    r8,r10,r11                      ; get mask based on operand length, to limit alignment
        blt-    cr1,reverse32bit                ; reverse move required
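
; The cntlzw/sraw pair builds r8 so that ~r8 == (largest power of two <= length) - 1;
; andc with it caps the front-end alignment bytes at less than the total length. In C
; (sketch; clz() stands for cntlzw):
;
;       uint32_t mask  = (uint32_t)((int32_t)0x80000000 >> clz(length)); // arithmetic shift
;       uint32_t front = ((0 - dest) & 31) & ~mask;     // bytes to 32-byte align dest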

; Forward moves on 32-bit machines, also word aligned uncached ops on 64-bit machines.
; NOTE: we never do an unaligned access if the source and destination are "relatively"
; word aligned. We depend on this in the uncached case on 64-bit processors.
;       r4 = destination
;       r5 = length (>0)
;       r6 = source
;       r8 = inverse of largest mask smaller than operand length
;       r9 = neg(dest), used to compute alignment
;       cr5 = noncache flag

forward32bit:                                   ; enter from 64-bit CPUs with word aligned uncached operands
        rlwinm  r7,r9,0,0x1F                    ; get bytes to 32-byte-align destination
        andc.   r0,r7,r8                        ; limit to the maximum front end move
        mtcrf   0x01,r0                         ; move length to cr6 and cr7 one cr at a time...
        beq     alline                          ; Already on a line...

        mtcrf   0x02,r0                         ; ...since moving more than one is slower on G4 and G5
        sub     r5,r5,r0                        ; Set the length left to move

        bf      31,alhalf                       ; No single byte to do...
        lbz     r7,0(r6)                        ; Get the byte
        addi    r6,r6,1                         ; Point to the next
        stb     r7,0(r4)                        ; Save the single
        addi    r4,r4,1                         ; Bump sink

; Sink is halfword aligned here

alhalf: bf      30,alword                       ; No halfword to do...
        lhz     r7,0(r6)                        ; Get the halfword
        addi    r6,r6,2                         ; Point to the next
        sth     r7,0(r4)                        ; Save the halfword
        addi    r4,r4,2                         ; Bump sink

; Sink is word aligned here

alword: bf      29,aldouble                     ; No word to do...
        lwz     r7,0(r6)                        ; Get the word
        addi    r6,r6,4                         ; Point to the next
        stw     r7,0(r4)                        ; Save the word
        addi    r4,r4,4                         ; Bump sink

; Sink is double aligned here

aldouble: bf    28,alquad                       ; No double to do...
        lwz     r7,0(r6)                        ; Get the first word
        lwz     r8,4(r6)                        ; Get the second word
        addi    r6,r6,8                         ; Point to the next
        stw     r7,0(r4)                        ; Save the first word
        stw     r8,4(r4)                        ; Save the second word
        addi    r4,r4,8                         ; Bump sink

; Sink is quadword aligned here

alquad: bf      27,alline                       ; No quad to do...
        lwz     r7,0(r6)                        ; Get the first word
        lwz     r8,4(r6)                        ; Get the second word
        lwz     r9,8(r6)                        ; Get the third word
        stw     r7,0(r4)                        ; Save the first word
        lwz     r11,12(r6)                      ; Get the fourth word
        addi    r6,r6,16                        ; Point to the next
        stw     r8,4(r4)                        ; Save the second word
        stw     r9,8(r4)                        ; Save the third word
        stw     r11,12(r4)                      ; Save the fourth word
        addi    r4,r4,16                        ; Bump sink

; Sink is line aligned here

alline: rlwinm. r0,r5,27,5,31                   ; Get the number of full lines to move
        mtcrf   0x02,r5                         ; move length to cr6 and cr7 one cr at a time...
        mtcrf   0x01,r5                         ; ...since moving more than one is slower on G4 and G5
        beq-    backend                         ; No full lines to move

        mtctr   r0                              ; set up loop count
        li      r0,96                           ; Stride for touch ahead
        b       nxtline

        .align  4
nxtline:
        lwz     r2,0(r6)                        ; Get the first word
        lwz     r5,4(r6)                        ; Get the second word
        lwz     r7,8(r6)                        ; Get the third word
        lwz     r8,12(r6)                       ; Get the fourth word
        lwz     r9,16(r6)                       ; Get the fifth word
        lwz     r10,20(r6)                      ; Get the sixth word
        lwz     r11,24(r6)                      ; Get the seventh word
        lwz     r12,28(r6)                      ; Get the eighth word
        bt-     noncache,skipz                  ; Skip if we are not cached...
        dcbz    0,r4                            ; Blow away the whole line because we are replacing it
        dcbt    r6,r0                           ; Touch ahead a bit
skipz:
        addi    r6,r6,32                        ; Point to the next
        stw     r2,0(r4)                        ; Save the first word
        stw     r5,4(r4)                        ; Save the second word
        stw     r7,8(r4)                        ; Save the third word
        stw     r8,12(r4)                       ; Save the fourth word
        stw     r9,16(r4)                       ; Save the fifth word
        stw     r10,20(r4)                      ; Save the sixth word
        stw     r11,24(r4)                      ; Save the seventh word
        stw     r12,28(r4)                      ; Save the eighth word
        addi    r4,r4,32                        ; Bump sink
        bdnz+   nxtline                         ; Do the next line, if any...
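
; Design note: dcbz establishes the destination line in the cache without first reading
; it from memory (every byte of the line is about to be overwritten), and dcbt prefetches
; the source 96 bytes ahead. Both must be skipped when the noncache flag is set.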


; Move backend quadword

backend:                                        ; Join here from "shortcopy" for forward moves <32 bytes
        bf      27,noquad                       ; No quad to do...
        lwz     r7,0(r6)                        ; Get the first word
        lwz     r8,4(r6)                        ; Get the second word
        lwz     r9,8(r6)                        ; Get the third word
        lwz     r11,12(r6)                      ; Get the fourth word
        stw     r7,0(r4)                        ; Save the first word
        addi    r6,r6,16                        ; Point to the next
        stw     r8,4(r4)                        ; Save the second word
        stw     r9,8(r4)                        ; Save the third word
        stw     r11,12(r4)                      ; Save the fourth word
        addi    r4,r4,16                        ; Bump sink

; Move backend double

noquad: bf      28,nodouble                     ; No double to do...
        lwz     r7,0(r6)                        ; Get the first word
        lwz     r8,4(r6)                        ; Get the second word
        addi    r6,r6,8                         ; Point to the next
        stw     r7,0(r4)                        ; Save the first word
        stw     r8,4(r4)                        ; Save the second word
        addi    r4,r4,8                         ; Bump sink

; Move backend word

nodouble: bf    29,noword                       ; No word to do...
        lwz     r7,0(r6)                        ; Get the word
        addi    r6,r6,4                         ; Point to the next
        stw     r7,0(r4)                        ; Save the word
        addi    r4,r4,4                         ; Bump sink

; Move backend halfword

noword: bf      30,nohalf                       ; No halfword to do...
        lhz     r7,0(r6)                        ; Get the halfword
        addi    r6,r6,2                         ; Point to the next
        sth     r7,0(r4)                        ; Save the halfword
        addi    r4,r4,2                         ; Bump sink

; Move backend byte

nohalf: bflr    31                              ; Leave cuz we are all done...
        lbz     r7,0(r6)                        ; Get the byte
        stb     r7,0(r4)                        ; Save the single
        blr


; Reverse moves on 32-bit machines, also reverse word aligned uncached moves on 64-bit machines.
; NOTE: we never do an unaligned access if the source and destination are "relatively"
; word aligned. We depend on this in the uncached case on 64-bit processors.
; These are slower because we don't bother with dcbz. Fortunately, reverse moves are uncommon.
;       r4 = destination
;       r5 = length (>0)
;       r6 = source
;       r8 = inverse of largest mask smaller than operand length
;       cr5 = noncache flag (but we don't dcbz anyway)

reverse32bit:                                   ; here from 64-bit code with word aligned uncached operands
        add     r4,r5,r4                        ; Point past the last sink byte
        add     r6,r5,r6                        ; Point past the last source byte
        rlwinm  r7,r4,0,0x1F                    ; Calculate the length to align dest on cache boundary
        li      r12,-1                          ; Make sure we touch in the actual line
        andc.   r0,r7,r8                        ; Apply movement limit
        dcbt    r12,r6                          ; Touch in the last line of source
        mtcrf   0x01,r0                         ; move length to cr6 and cr7 one cr at a time...
        dcbtst  r12,r4                          ; Touch in the last line of the sink
        mtcrf   0x02,r0                         ; ...since moving more than one is slower on G4 and G5
        beq-    balline                         ; Already on cache line boundary (or too short to bother)

        sub     r5,r5,r0                        ; Precalculate move length left after alignment

        bf      31,balhalf                      ; No single byte to do...
        lbz     r7,-1(r6)                       ; Get the byte
        subi    r6,r6,1                         ; Point to the next
        stb     r7,-1(r4)                       ; Save the single
        subi    r4,r4,1                         ; Bump sink

; Sink is halfword aligned here

balhalf: bf     30,balword                      ; No halfword to do...
        lhz     r7,-2(r6)                       ; Get the halfword
        subi    r6,r6,2                         ; Point to the next
        sth     r7,-2(r4)                       ; Save the halfword
        subi    r4,r4,2                         ; Bump sink

; Sink is word aligned here

balword: bf     29,baldouble                    ; No word to do...
        lwz     r7,-4(r6)                       ; Get the word
        subi    r6,r6,4                         ; Point to the next
        stw     r7,-4(r4)                       ; Save the word
        subi    r4,r4,4                         ; Bump sink

; Sink is double aligned here

baldouble: bf   28,balquad                      ; No double to do...
        lwz     r7,-8(r6)                       ; Get the first word
        lwz     r8,-4(r6)                       ; Get the second word
        subi    r6,r6,8                         ; Point to the next
        stw     r7,-8(r4)                       ; Save the first word
        stw     r8,-4(r4)                       ; Save the second word
        subi    r4,r4,8                         ; Bump sink

; Sink is quadword aligned here

balquad: bf     27,balline                      ; No quad to do...
        lwz     r7,-16(r6)                      ; Get the first word
        lwz     r8,-12(r6)                      ; Get the second word
        lwz     r9,-8(r6)                       ; Get the third word
        lwz     r11,-4(r6)                      ; Get the fourth word
        stw     r7,-16(r4)                      ; Save the first word
        subi    r6,r6,16                        ; Point to the next
        stw     r8,-12(r4)                      ; Save the second word
        stw     r9,-8(r4)                       ; Save the third word
        stw     r11,-4(r4)                      ; Save the fourth word
        subi    r4,r4,16                        ; Bump sink

; Sink is line aligned here

balline: rlwinm. r0,r5,27,5,31                  ; Get the number of full lines to move
        mtcrf   0x02,r5                         ; move length to cr6 and cr7 one cr at a time...
        mtcrf   0x01,r5                         ; ...since moving more than one is slower on G4 and G5
        beq-    bbackend                        ; No full lines to move
        mtctr   r0                              ; set up loop count
        b       bnxtline

        .align  4
bnxtline:
        lwz     r7,-32(r6)                      ; Get the first word
        lwz     r5,-28(r6)                      ; Get the second word
        lwz     r2,-24(r6)                      ; Get the third word
        lwz     r12,-20(r6)                     ; Get the fourth word
        lwz     r11,-16(r6)                     ; Get the fifth word
        lwz     r10,-12(r6)                     ; Get the sixth word
        lwz     r9,-8(r6)                       ; Get the seventh word
        lwz     r8,-4(r6)                       ; Get the eighth word
        subi    r6,r6,32                        ; Point to the next

        stw     r7,-32(r4)                      ; Save the first word
        stw     r5,-28(r4)                      ; Save the second word
        stw     r2,-24(r4)                      ; Save the third word
        stw     r12,-20(r4)                     ; Save the fourth word
        stw     r11,-16(r4)                     ; Save the fifth word
        stw     r10,-12(r4)                     ; Save the sixth word
        stw     r9,-8(r4)                       ; Save the seventh word
        stw     r8,-4(r4)                       ; Save the eighth word
        subi    r4,r4,32                        ; Bump sink

        bdnz+   bnxtline                        ; Do the next line, if any...

;
; Note: We touched these lines in at the beginning
;

; Move backend quadword

bbackend:                                       ; Join here from "shortcopy" for reverse moves of <32 bytes
        bf      27,bnoquad                      ; No quad to do...
        lwz     r7,-16(r6)                      ; Get the first word
        lwz     r8,-12(r6)                      ; Get the second word
        lwz     r9,-8(r6)                       ; Get the third word
        lwz     r11,-4(r6)                      ; Get the fourth word
        stw     r7,-16(r4)                      ; Save the first word
        subi    r6,r6,16                        ; Point to the next
        stw     r8,-12(r4)                      ; Save the second word
        stw     r9,-8(r4)                       ; Save the third word
        stw     r11,-4(r4)                      ; Save the fourth word
        subi    r4,r4,16                        ; Bump sink

; Move backend double

bnoquad: bf     28,bnodouble                    ; No double to do...
        lwz     r7,-8(r6)                       ; Get the first word
        lwz     r8,-4(r6)                       ; Get the second word
        subi    r6,r6,8                         ; Point to the next
        stw     r7,-8(r4)                       ; Save the first word
        stw     r8,-4(r4)                       ; Save the second word
        subi    r4,r4,8                         ; Bump sink

; Move backend word

bnodouble: bf   29,bnoword                      ; No word to do...
        lwz     r7,-4(r6)                       ; Get the word
        subi    r6,r6,4                         ; Point to the next
        stw     r7,-4(r4)                       ; Save the word
        subi    r4,r4,4                         ; Bump sink

; Move backend halfword

bnoword: bf     30,bnohalf                      ; No halfword to do...
        lhz     r7,-2(r6)                       ; Get the halfword
        subi    r6,r6,2                         ; Point to the next
        sth     r7,-2(r4)                       ; Save the halfword
        subi    r4,r4,2                         ; Bump sink

; Move backend byte

bnohalf: bflr   31                              ; Leave cuz we are all done...
        lbz     r7,-1(r6)                       ; Get the byte
        stb     r7,-1(r4)                       ; Save the single
        blr


// Here on 64-bit processors, which have a 128-byte cache line. This can be
// called either in 32- or 64-bit mode, which makes the test for reverse moves
// a little tricky. We've already filtered out the (source==dest) and (len==0)
// special cases.
//
// When entered:
//      r4 = destination (32 or 64-bit ptr)
//      r5 = length (always 32 bits)
//      r6 = source (32 or 64-bit ptr)
//      r12 = (dest - source), reverse move required if (dest-source)<length
//      cr5 = noncache flag

        .align  5
copyit64:
        rlwinm  r7,r5,0,0,31                    // truncate length to 32 bits, in case we're running in 64-bit mode
        cntlzw  r11,r5                          // get magnitude of length
        dcbt    0,r6                            // touch in 1st block of source
        dcbtst  0,r4                            // touch in 1st destination cache block
        subc    r7,r12,r7                       // set Carry if (dest-source)>=length, in mode-independent way
        li      r0,0                            // get a 0
        lis     r10,hi16(0x80000000)            // get 0x80000000
        addze.  r0,r0                           // set cr0 on carry bit (beq if reverse move required)
        neg     r9,r4                           // start to get alignment for destination
        sraw    r8,r10,r11                      // get mask based on operand length, to limit alignment
        bt--    noncache,c64uncached            // skip if uncached
        beq--   c64rdouble                      // handle cached reverse moves

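// The subc/addze pair recovers the unsigned test (dest-source) < length without trusting
// the upper address bits, which may be garbage in 32-bit mode. In C (sketch):
//
//      carry = ((uint64_t)(dest - source) >= length);  // subc sets CA on no-borrow
//      if (carry == 0)                                 // addze./beq reads CA back
//          /* reverse move required */;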

// Forward, cached or doubleword aligned uncached. This is the common case.
// NOTE: we never do an unaligned access if the source and destination are "relatively"
// doubleword aligned. We depend on this in the uncached case.
//      r4 = destination
//      r5 = length (>0)
//      r6 = source
//      r8 = inverse of largest mask smaller than operand length
//      r9 = neg(dest), used to compute alignment
//      cr5 = noncache flag

c64double:
        rlwinm  r7,r9,0,0x7F                    // get #bytes to 128-byte align destination
        andc    r7,r7,r8                        // limit by operand length
        andi.   r8,r7,7                         // r8 <- #bytes to doubleword align
        srwi    r9,r7,3                         // r9 <- #doublewords to 128-byte align
        sub     r5,r5,r7                        // adjust length remaining
        cmpwi   cr1,r9,0                        // any doublewords to move to cache align?
        srwi    r10,r5,7                        // r10 <- 128-byte chunks to xfer after aligning dest
        cmpwi   cr7,r10,0                       // set cr7 on chunk count
        beq     c64double2                      // dest already doubleword aligned
        mtctr   r8
        b       c64double1

        .align  5                               // align inner loops
c64double1:                                     // copy bytes until dest is doubleword aligned
        lbz     r0,0(r6)
        addi    r6,r6,1
        stb     r0,0(r4)
        addi    r4,r4,1
        bdnz    c64double1

c64double2:                                     // r9/cr1=doublewords, r10/cr7=128-byte chunks
        beq     cr1,c64double4                  // no doublewords to xfer in order to cache align
        mtctr   r9
        b       c64double3

        .align  5                               // align inner loops
c64double3:                                     // copy doublewords until dest is 128-byte aligned
        ld      r7,0(r6)
        addi    r6,r6,8
        std     r7,0(r4)
        addi    r4,r4,8
        bdnz    c64double3

// Here to xfer 128-byte chunks, if any. Since we only have 8 GPRs for
// data (64 bytes), we load/store each register twice per 128-byte chunk.

c64double4:                                     // r10/cr7=128-byte chunks
        rlwinm  r0,r5,29,28,31                  // r0 <- count of leftover doublewords, after moving chunks
        cmpwi   cr1,r0,0                        // set cr1 on leftover doublewords
        beq     cr7,c64double7                  // no 128-byte chunks

        ; We must check for (source-dest)<128 in a mode-independent way. If within 128 bytes,
        ; turn on "noncache" because we cannot use dcbz128 even if operands are cacheable.

        sub     r8,r6,r4                        // r8 <- (source - dest)
        rldicr. r0,r8,0,63-7                    // zero low 7 bits and check for 0, mode independent
        cror    noncache,cr0_eq,noncache        // turn on "noncache" flag if (source-dest)<128
        mtctr   r10
        b       c64InnerLoop
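
// In C (sketch): if the source begins fewer than 128 bytes above the destination,
// dcbz128 on a destination line would zero source bytes not yet read, so we must
// fall back to plain stores:
//
//      if (((source - dest) & ~(uint64_t)127) == 0)    // 0 <= (source-dest) < 128
//          noncache = 1;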

        .align  5                               // align inner loop
c64InnerLoop:                                   // loop copying 128-byte cache lines to 128-aligned destination
        ld      r0,0(r6)                        // start pipe: load 1st half-line
        ld      r2,8(r6)
        ld      r7,16(r6)
        ld      r8,24(r6)
        ld      r9,32(r6)
        ld      r10,40(r6)
        ld      r11,48(r6)
        ld      r12,56(r6)
        bt      noncache,c64InnerLoop1          // skip if uncached or overlap
        dcbz128 0,r4                            // avoid prefetch of next cache line
c64InnerLoop1:

        std     r0,0(r4)
        std     r2,8(r4)
        std     r7,16(r4)
        std     r8,24(r4)
        std     r9,32(r4)
        std     r10,40(r4)
        std     r11,48(r4)
        std     r12,56(r4)

        ld      r0,64(r6)                       // load 2nd half of chunk
        ld      r2,72(r6)
        ld      r7,80(r6)
        ld      r8,88(r6)
        ld      r9,96(r6)
        ld      r10,104(r6)
        ld      r11,112(r6)
        ld      r12,120(r6)
        addi    r6,r6,128

        std     r0,64(r4)
        std     r2,72(r4)
        std     r7,80(r4)
        std     r8,88(r4)
        std     r9,96(r4)
        std     r10,104(r4)
        std     r11,112(r4)
        std     r12,120(r4)
        addi    r4,r4,128                       // advance to next dest chunk

        bdnz    c64InnerLoop                    // loop if more chunks


c64double7:                                     // r5 <- leftover bytes, cr1 set on doubleword count
        rlwinm  r0,r5,29,28,31                  // r0 <- count of leftover doublewords (0-15)
        andi.   r5,r5,7                         // r5/cr0 <- count of leftover bytes (0-7)
        beq     cr1,c64byte                     // no leftover doublewords
        mtctr   r0
        b       c64double8

        .align  5                               // align inner loop
c64double8:                                     // loop copying leftover doublewords
        ld      r0,0(r6)
        addi    r6,r6,8
        std     r0,0(r4)
        addi    r4,r4,8
        bdnz    c64double8


// Forward byte loop.

c64byte:                                        // r5/cr0 <- byte count (can be big if unaligned uncached)
        beqlr                                   // done if no leftover bytes
        mtctr   r5
        b       c64byte1

        .align  5                               // align inner loop
c64byte1:
        lbz     r0,0(r6)
        addi    r6,r6,1
        stb     r0,0(r4)
        addi    r4,r4,1
        bdnz    c64byte1

        blr


// Uncached copies. We must avoid unaligned accesses, since they always take alignment
// exceptions on uncached memory on 64-bit processors. This may mean we copy long operands
// a byte at a time, but that is still much faster than alignment exceptions.
//      r4 = destination
//      r5 = length (>0)
//      r6 = source
//      r8 = inverse of largest mask smaller than operand length
//      r9 = neg(dest), used to compute alignment
//      r12 = (dest-source), used to test relative alignment
//      cr0 = beq if reverse move required
//      cr5 = noncache flag

c64uncached:
        rlwinm  r10,r12,0,29,31                 // relatively doubleword aligned?
        rlwinm  r11,r12,0,30,31                 // relatively word aligned?
        cmpwi   cr7,r10,0                       // set cr7 beq if doubleword aligned
        cmpwi   cr1,r11,0                       // set cr1 beq if word aligned
        beq--   c64reverseUncached

        beq     cr7,c64double                   // doubleword aligned
        beq     cr1,forward32bit                // word aligned, use G3/G4 code
        cmpwi   r5,0                            // set cr0 on byte count
        b       c64byte                         // unaligned operands

c64reverseUncached:
        beq     cr7,c64rdouble                  // doubleword aligned so can use LD/STD
        beq     cr1,reverse32bit                // word aligned, use G3/G4 code
        add     r6,r6,r5                        // point to (end+1) of source and dest
        add     r4,r4,r5
        cmpwi   r5,0                            // set cr0 on length
        b       c64rbyte                        // copy a byte at a time
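
// "Relative" alignment depends only on the low bits of (dest - source): if those match,
// aligning the destination also aligns the source. The dispatch above, in C (sketch):
//
//      switch ((dest - source) & 7) {
//      case 0:  /* ld/std path (c64double / c64rdouble) */        break;
//      case 4:  /* lwz/stw path (forward32bit / reverse32bit) */  break;
//      default: /* no common alignment: byte-at-a-time loop */    break;
//      }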


// Reverse doubleword copies. This is used for all cached copies, and doubleword
// aligned uncached copies.
//      r4 = destination
//      r5 = length (>0)
//      r6 = source
//      r8 = inverse of largest mask of low-order 1s smaller than operand length
//      cr5 = noncache flag

c64rdouble:
        add     r6,r6,r5                        // point to (end+1) of source and dest
        add     r4,r4,r5
        rlwinm  r7,r4,0,29,31                   // r7 <- #bytes to doubleword align dest
        andc.   r7,r7,r8                        // limit by operand length
        sub     r5,r5,r7                        // adjust length
        srwi    r8,r5,6                         // r8 <- 64-byte chunks to xfer
        cmpwi   cr1,r8,0                        // any chunks?
        beq     c64rd2                          // dest already doubleword aligned
        mtctr   r7

c64rd1:                                         // copy bytes until dest is doubleword aligned
        lbzu    r0,-1(r6)
        stbu    r0,-1(r4)
        bdnz    c64rd1

c64rd2:                                         // r8/cr1 <- count of 64-byte chunks
        rlwinm  r0,r5,29,29,31                  // r0 <- count of leftover doublewords
        andi.   r5,r5,7                         // r5/cr0 <- count of leftover bytes
        cmpwi   cr7,r0,0                        // leftover doublewords?
        beq     cr1,c64rd4                      // no chunks to xfer
        mtctr   r8
        b       c64rd3

        .align  5                               // align inner loop
c64rd3:                                         // loop copying 64-byte chunks
        ld      r7,-8(r6)
        ld      r8,-16(r6)
        ld      r9,-24(r6)
        ld      r10,-32(r6)
        ld      r11,-40(r6)
        ld      r12,-48(r6)
        std     r7,-8(r4)
        std     r8,-16(r4)
        ld      r7,-56(r6)
        ldu     r8,-64(r6)
        std     r9,-24(r4)
        std     r10,-32(r4)
        std     r11,-40(r4)
        std     r12,-48(r4)
        std     r7,-56(r4)
        stdu    r8,-64(r4)
        bdnz    c64rd3

c64rd4:                                         // r0/cr7 = leftover doublewords, r5/cr0 = leftover bytes
        beq     cr7,c64rbyte                    // no leftover doublewords
        mtctr   r0

c64rd5:                                         // loop copying leftover doublewords
        ldu     r0,-8(r6)
        stdu    r0,-8(r4)
        bdnz    c64rd5


// Reverse byte loop.

c64rbyte:                                       // r5/cr0 <- byte count (can be big if unaligned uncached)
        beqlr                                   // done if no leftover bytes
        mtctr   r5

c64rbyte1:
        lbzu    r0,-1(r6)
        stbu    r0,-1(r4)
        bdnz    c64rbyte1

        blr